/[suikacvs]/markup/html/whatpm/Whatpm/HTML/Tokenizer.pm
Suika

Contents of /markup/html/whatpm/Whatpm/HTML/Tokenizer.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.2 - (hide annotations) (download)
Tue Oct 14 04:32:49 2008 UTC (16 years ago) by wakaba
Branch: MAIN
Changes since 1.1: +44 -11 lines
++ whatpm/Whatpm/HTML/ChangeLog	14 Oct 2008 04:28:43 -0000
	* Tokenizer.pm.src: Make *_TOKEN (token type constants)
	exportable.  New token types, PI_TOKEN for XML and ABORT_TOKEN for
	document.write() or incremental parsing, are added for future
	extensions.

2008-10-14  Wakaba  <wakaba@suika.fam.cx>

++ whatpm/Whatpm/XML/ChangeLog	14 Oct 2008 04:27:29 -0000
2008-10-14  Wakaba  <wakaba@suika.fam.cx>

	* Makefile, Parser.pm.src: New files.

1 wakaba 1.1 package Whatpm::HTML::Tokenizer;
2     use strict;
3 wakaba 1.2 our $VERSION=do{my @r=(q$Revision: 1.1 $=~/\d+/g);sprintf "%d."."%02d" x $#r,@r};
4    
5     BEGIN { ## Export setup is done at compile time; package Whatpm::HTML imports |:token| from its own BEGIN block.
6     require Exporter;
7     push our @ISA, 'Exporter';
8     ## Token type constants that a caller may import individually by name.
9     our @EXPORT_OK = qw(
10     DOCTYPE_TOKEN
11     COMMENT_TOKEN
12     START_TAG_TOKEN
13     END_TAG_TOKEN
14     END_OF_FILE_TOKEN
15     CHARACTER_TOKEN
16     PI_TOKEN
17     ABORT_TOKEN
18     );
19     ## The |:token| tag imports every token type constant at once (same list as @EXPORT_OK).
20     our %EXPORT_TAGS = (
21     token => [qw(
22     DOCTYPE_TOKEN
23     COMMENT_TOKEN
24     START_TAG_TOKEN
25     END_TAG_TOKEN
26     END_OF_FILE_TOKEN
27     CHARACTER_TOKEN
28     PI_TOKEN
29     ABORT_TOKEN
30     )],
31     );
32     }
33    
34     ## Token types
35     ## Integer codes for the |type| field of a token.  The empty prototype lets perl inline each one as a constant.
36     sub DOCTYPE_TOKEN () { 1 }
37     sub COMMENT_TOKEN () { 2 }
38     sub START_TAG_TOKEN () { 3 }
39     sub END_TAG_TOKEN () { 4 }
40     sub END_OF_FILE_TOKEN () { 5 }
41     sub CHARACTER_TOKEN () { 6 }
42     sub PI_TOKEN () { 7 } # XML5
43     sub ABORT_TOKEN () { 8 } # Not actually a token; per the ChangeLog, added for document.write() or incremental parsing
44 wakaba 1.1
45     package Whatpm::HTML;
46    
47 wakaba 1.2 BEGIN { Whatpm::HTML::Tokenizer->import (':token') }
48    
49 wakaba 1.1 ## Content model flags
50    
51     sub CM_ENTITY () { 0b001 } # bit flag: & markup in data
52     sub CM_LIMITED_MARKUP () { 0b010 } # bit flag: < markup in data (limited)
53     sub CM_FULL_MARKUP () { 0b100 } # bit flag: < markup in data (any)
54     ## Content models: each is a bitwise OR of the CM_* flags above and is stored in |$self->{content_model}|.
55     sub PLAINTEXT_CONTENT_MODEL () { 0 } # no flag set: neither & nor < starts markup
56     sub CDATA_CONTENT_MODEL () { CM_LIMITED_MARKUP } # limited < markup only, no & markup
57     sub RCDATA_CONTENT_MODEL () { CM_ENTITY | CM_LIMITED_MARKUP } # & markup plus limited < markup
58     sub PCDATA_CONTENT_MODEL () { CM_ENTITY | CM_FULL_MARKUP } # & markup plus any < markup
59    
60     ## Tokenizer states
61     ## Values of |$self->{state}|.  Numbers 1 and 12 are unassigned: the states that used them are commented out below.
62     sub DATA_STATE () { 0 }
63     #sub ENTITY_DATA_STATE () { 1 }
64     sub TAG_OPEN_STATE () { 2 }
65     sub CLOSE_TAG_OPEN_STATE () { 3 }
66     sub TAG_NAME_STATE () { 4 }
67     sub BEFORE_ATTRIBUTE_NAME_STATE () { 5 }
68     sub ATTRIBUTE_NAME_STATE () { 6 }
69     sub AFTER_ATTRIBUTE_NAME_STATE () { 7 }
70     sub BEFORE_ATTRIBUTE_VALUE_STATE () { 8 }
71     sub ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE () { 9 }
72     sub ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE () { 10 }
73     sub ATTRIBUTE_VALUE_UNQUOTED_STATE () { 11 }
74     #sub ENTITY_IN_ATTRIBUTE_VALUE_STATE () { 12 }
75     sub MARKUP_DECLARATION_OPEN_STATE () { 13 }
76     sub COMMENT_START_STATE () { 14 }
77     sub COMMENT_START_DASH_STATE () { 15 }
78     sub COMMENT_STATE () { 16 }
79     sub COMMENT_END_STATE () { 17 }
80     sub COMMENT_END_DASH_STATE () { 18 }
81     sub BOGUS_COMMENT_STATE () { 19 }
82     sub DOCTYPE_STATE () { 20 }
83     sub BEFORE_DOCTYPE_NAME_STATE () { 21 }
84     sub DOCTYPE_NAME_STATE () { 22 }
85     sub AFTER_DOCTYPE_NAME_STATE () { 23 }
86     sub BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE () { 24 }
87     sub DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE () { 25 }
88     sub DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE () { 26 }
89     sub AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE () { 27 }
90     sub BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE () { 28 }
91     sub DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE () { 29 }
92     sub DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE () { 30 }
93     sub AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE () { 31 }
94     sub BOGUS_DOCTYPE_STATE () { 32 }
95     sub AFTER_ATTRIBUTE_VALUE_QUOTED_STATE () { 33 }
96     sub SELF_CLOSING_START_TAG_STATE () { 34 }
97     sub CDATA_SECTION_STATE () { 35 }
98     sub MD_HYPHEN_STATE () { 36 } # "markup declaration open state" in the spec
99     sub MD_DOCTYPE_STATE () { 37 } # "markup declaration open state" in the spec
100     sub MD_CDATA_STATE () { 38 } # "markup declaration open state" in the spec
101     sub CDATA_RCDATA_CLOSE_TAG_STATE () { 39 } # "close tag open state" in the spec
102     sub CDATA_SECTION_MSE1_STATE () { 40 } # "CDATA section state" in the spec
103     sub CDATA_SECTION_MSE2_STATE () { 41 } # "CDATA section state" in the spec
104     sub PUBLIC_STATE () { 42 } # "after DOCTYPE name state" in the spec
105     sub SYSTEM_STATE () { 43 } # "after DOCTYPE name state" in the spec
106     ## NOTE: The "entity data state", the "entity in attribute value state", and
107     ## the "consume a character reference" algorithm are jointly implemented
108     ## using the following six states (44 through 49):
109     sub ENTITY_STATE () { 44 }
110     sub ENTITY_HASH_STATE () { 45 }
111     sub NCR_NUM_STATE () { 46 }
112     sub HEXREF_X_STATE () { 47 }
113     sub HEXREF_HEX_STATE () { 48 }
114     sub ENTITY_NAME_STATE () { 49 }
115     sub PCDATA_STATE () { 50 } # "data state" in the spec, used only for the PCDATA content model
116    
117     ## Tree constructor state constants (see Whatpm::HTML for the full
118     ## list and descriptions)
119     ## NOTE(review): both constants evaluate to the same number (2048; "_" is only a digit separator).  Presumably they live in different value spaces -- confirm against Whatpm::HTML.
120     sub IN_FOREIGN_CONTENT_IM () { 0b100000000000 }
121     sub FOREIGN_EL () { 0b1_00000000000 }
122    
123     ## Character reference mappings
124     ## Hash of code point => replacement code point.  Presumably consulted when a numeric character reference is resolved (the lookup is outside this view).
125     my $charref_map = {
126     0x0D => 0x000A, # CR is replaced by LF
127     0x80 => 0x20AC, # 0x80..0x9F: these look like the Windows-1252 reinterpretation of C1 controls -- TODO confirm against the spec
128     0x81 => 0xFFFD,
129     0x82 => 0x201A,
130     0x83 => 0x0192,
131     0x84 => 0x201E,
132     0x85 => 0x2026,
133     0x86 => 0x2020,
134     0x87 => 0x2021,
135     0x88 => 0x02C6,
136     0x89 => 0x2030,
137     0x8A => 0x0160,
138     0x8B => 0x2039,
139     0x8C => 0x0152,
140     0x8D => 0xFFFD,
141     0x8E => 0x017D,
142     0x8F => 0xFFFD,
143     0x90 => 0xFFFD,
144     0x91 => 0x2018,
145     0x92 => 0x2019,
146     0x93 => 0x201C,
147     0x94 => 0x201D,
148     0x95 => 0x2022,
149     0x96 => 0x2013,
150     0x97 => 0x2014,
151     0x98 => 0x02DC,
152     0x99 => 0x2122,
153     0x9A => 0x0161,
154     0x9B => 0x203A,
155     0x9C => 0x0153,
156     0x9D => 0xFFFD,
157     0x9E => 0x017E,
158     0x9F => 0x0178,
159     }; # $charref_map
160     $charref_map->{$_} = 0xFFFD # every code point listed below maps to U+FFFD REPLACEMENT CHARACTER
161     for 0x0000..0x0008, 0x000B, 0x000E..0x001F, 0x007F,
162     0xD800..0xDFFF, 0xFDD0..0xFDDF, ## ISSUE: 0xFDEF (Unicode noncharacters run through U+FDEF; this range stops at U+FDDF)
163     0xFFFE, 0xFFFF, 0x1FFFE, 0x1FFFF, 0x2FFFE, 0x2FFFF, 0x3FFFE, 0x3FFFF,
164     0x4FFFE, 0x4FFFF, 0x5FFFE, 0x5FFFF, 0x6FFFE, 0x6FFFF, 0x7FFFE,
165     0x7FFFF, 0x8FFFE, 0x8FFFF, 0x9FFFE, 0x9FFFF, 0xAFFFE, 0xAFFFF,
166     0xBFFFE, 0xBFFFF, 0xCFFFE, 0xCFFFF, 0xDFFFE, 0xDFFFF, 0xEFFFE,
167     0xEFFFF, 0xFFFFE, 0xFFFFF, 0x10FFFE, 0x10FFFF;
168    
169     ## Implementations MUST act as if they used the state machine in the spec
170    
171     sub _initialize_tokenizer ($) {
172     my $self = shift;
173     ## Resets the tokenizer fields of $self to their initial values, then requests the first input character.
174     ## NOTE: Fields set by |new| constructor:
175     #$self->{level}
176     #$self->{set_nc}
177     #$self->{parse_error}
178     ## Fields (re)initialized here:
179     $self->{state} = DATA_STATE; # MUST: the tokenizer starts in the data state
180     #$self->{s_kwd}; # state keyword - initialized when used
181     #$self->{entity__value}; # initialized when used
182     #$self->{entity__match}; # initialized when used
183     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST be PCDATA initially
184     undef $self->{ct}; # current token
185     undef $self->{ca}; # current attribute
186     undef $self->{last_stag_name}; # last emitted start tag name
187     #$self->{prev_state}; # initialized when used
188     delete $self->{self_closing}; # clear any pending self-closing flag
189     $self->{char_buffer} = ''; # buffered characters, consumed before |set_nc| is asked for more
190     $self->{char_buffer_pos} = 0; # index of the next unread character in |char_buffer|
191     $self->{nc} = -1; # next input character (-1 = EOF)
192     #$self->{next_nc}
193     ## The buffer was emptied just above, so the first branch cannot be taken here and |set_nc| is always called.
194     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
195     $self->{line_prev} = $self->{line};
196     $self->{column_prev} = $self->{column};
197     $self->{column}++;
198     $self->{nc}
199     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
200     } else {
201     $self->{set_nc}->($self); # callback supplied by |new|; presumably fills |$self->{nc}| -- defined outside this view
202     }
203     ## Queue of already-built tokens; |_get_next_token| returns from it before tokenizing any further input.
204     $self->{token} = [];
205     # $self->{escape}
206     } # _initialize_tokenizer
207    
208     ## A token has:
209     ## ->{type} == DOCTYPE_TOKEN, START_TAG_TOKEN, END_TAG_TOKEN, COMMENT_TOKEN,
210     ## CHARACTER_TOKEN, or END_OF_FILE_TOKEN
211     ## ->{name} (DOCTYPE_TOKEN)
212     ## ->{tag_name} (START_TAG_TOKEN, END_TAG_TOKEN)
213     ## ->{pubid} (DOCTYPE_TOKEN)
214     ## ->{sysid} (DOCTYPE_TOKEN)
215     ## ->{quirks} == 1 or 0 (DOCTYPE_TOKEN): "force-quirks" flag
216     ## ->{attributes} isa HASH (START_TAG_TOKEN, END_TAG_TOKEN)
217     ## ->{name}
218     ## ->{value}
219     ## ->{has_reference} == 1 or 0
220     ## ->{data} (COMMENT_TOKEN, CHARACTER_TOKEN)
221     ## NOTE: The "self-closing flag" is held as |$self->{self_closing}|.
222     ## |->{self_closing}| is used to save the value of |$self->{self_closing}|
223     ## while the token is pushed back to the stack.
224    
225     ## Emitted token MUST immediately be handled by the tree construction state.
226    
227     ## Before each step, UA MAY check to see if either one of the scripts in
228     ## "list of scripts that will execute as soon as possible" or the first
229     ## script in the "list of scripts that will execute asynchronously",
230     ## has completed loading. If one has, then it MUST be executed
231     ## and removed from the list.
232    
233     ## TODO: Polytheistic slash SHOULD NOT be used. (Applied only to atheists.)
234     ## (This requirement was dropped from HTML5 spec, unfortunately.)
235    
236     my $is_space = { ## code point => true, for the characters the tokenizer treats as white space
237     0x0009 => 1, # CHARACTER TABULATION (HT)
238     0x000A => 1, # LINE FEED (LF)
239     #0x000B => 0, # LINE TABULATION (VT) - disabled entry: VT is not white space here
240     0x000C => 1, # FORM FEED (FF)
241     #0x000D => 1, # CARRIAGE RETURN (CR) - disabled entry; presumably CR is normalized before it reaches the tokenizer -- confirm
242     0x0020 => 1, # SPACE (SP)
243     };
244    
245     sub _get_next_token ($) {
246     my $self = shift;
247    
248     if ($self->{self_closing}) {
249     $self->{parse_error}->(level => $self->{level}->{must}, type => 'nestc', token => $self->{ct});
250     ## NOTE: The |self_closing| flag is only set by start tag token.
251     ## In addition, when a start tag token is emitted, it is always set to
252     ## |ct|.
253     delete $self->{self_closing};
254     }
255    
256     if (@{$self->{token}}) {
257     $self->{self_closing} = $self->{token}->[0]->{self_closing};
258     return shift @{$self->{token}};
259     }
260    
261     A: {
262     if ($self->{state} == PCDATA_STATE) {
263     ## NOTE: Same as |DATA_STATE|, but only for |PCDATA| content model.
264    
265     if ($self->{nc} == 0x0026) { # &
266    
267     ## NOTE: In the spec, the tokenizer is switched to the
268     ## "entity data state". In this implementation, the tokenizer
269     ## is switched to the |ENTITY_STATE|, which is an implementation
270     ## of the "consume a character reference" algorithm.
271     $self->{entity_add} = -1;
272     $self->{prev_state} = DATA_STATE;
273     $self->{state} = ENTITY_STATE;
274    
275     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
276     $self->{line_prev} = $self->{line};
277     $self->{column_prev} = $self->{column};
278     $self->{column}++;
279     $self->{nc}
280     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
281     } else {
282     $self->{set_nc}->($self);
283     }
284    
285     redo A;
286     } elsif ($self->{nc} == 0x003C) { # <
287    
288     $self->{state} = TAG_OPEN_STATE;
289    
290     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
291     $self->{line_prev} = $self->{line};
292     $self->{column_prev} = $self->{column};
293     $self->{column}++;
294     $self->{nc}
295     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
296     } else {
297     $self->{set_nc}->($self);
298     }
299    
300     redo A;
301     } elsif ($self->{nc} == -1) {
302    
303     return ({type => END_OF_FILE_TOKEN,
304     line => $self->{line}, column => $self->{column}});
305     last A; ## TODO: ok?
306     } else {
307    
308     #
309     }
310    
311     # Anything else
312     my $token = {type => CHARACTER_TOKEN,
313     data => chr $self->{nc},
314     line => $self->{line}, column => $self->{column},
315     };
316     $self->{read_until}->($token->{data}, q[<&], length $token->{data});
317    
318     ## Stay in the state.
319    
320     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
321     $self->{line_prev} = $self->{line};
322     $self->{column_prev} = $self->{column};
323     $self->{column}++;
324     $self->{nc}
325     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
326     } else {
327     $self->{set_nc}->($self);
328     }
329    
330     return ($token);
331     redo A;
332     } elsif ($self->{state} == DATA_STATE) {
333     $self->{s_kwd} = '' unless defined $self->{s_kwd};
334     if ($self->{nc} == 0x0026) { # &
335     $self->{s_kwd} = '';
336     if ($self->{content_model} & CM_ENTITY and # PCDATA | RCDATA
337     not $self->{escape}) {
338    
339     ## NOTE: In the spec, the tokenizer is switched to the
340     ## "entity data state". In this implementation, the tokenizer
341     ## is switched to the |ENTITY_STATE|, which is an implementation
342     ## of the "consume a character reference" algorithm.
343     $self->{entity_add} = -1;
344     $self->{prev_state} = DATA_STATE;
345     $self->{state} = ENTITY_STATE;
346    
347     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
348     $self->{line_prev} = $self->{line};
349     $self->{column_prev} = $self->{column};
350     $self->{column}++;
351     $self->{nc}
352     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
353     } else {
354     $self->{set_nc}->($self);
355     }
356    
357     redo A;
358     } else {
359    
360     #
361     }
362     } elsif ($self->{nc} == 0x002D) { # -
363     if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
364     $self->{s_kwd} .= '-';
365    
366     if ($self->{s_kwd} eq '<!--') {
367    
368     $self->{escape} = 1; # unless $self->{escape};
369     $self->{s_kwd} = '--';
370     #
371     } elsif ($self->{s_kwd} eq '---') {
372    
373     $self->{s_kwd} = '--';
374     #
375     } else {
376    
377     #
378     }
379     }
380    
381     #
382     } elsif ($self->{nc} == 0x0021) { # !
383     if (length $self->{s_kwd}) {
384    
385     $self->{s_kwd} .= '!';
386     #
387     } else {
388    
389     #$self->{s_kwd} = '';
390     #
391     }
392     #
393     } elsif ($self->{nc} == 0x003C) { # <
394     if ($self->{content_model} & CM_FULL_MARKUP or # PCDATA
395     (($self->{content_model} & CM_LIMITED_MARKUP) and # CDATA | RCDATA
396     not $self->{escape})) {
397    
398     $self->{state} = TAG_OPEN_STATE;
399    
400     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
401     $self->{line_prev} = $self->{line};
402     $self->{column_prev} = $self->{column};
403     $self->{column}++;
404     $self->{nc}
405     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
406     } else {
407     $self->{set_nc}->($self);
408     }
409    
410     redo A;
411     } else {
412    
413     $self->{s_kwd} = '';
414     #
415     }
416     } elsif ($self->{nc} == 0x003E) { # >
417     if ($self->{escape} and
418     ($self->{content_model} & CM_LIMITED_MARKUP)) { # RCDATA | CDATA
419     if ($self->{s_kwd} eq '--') {
420    
421     delete $self->{escape};
422     } else {
423    
424     }
425     } else {
426    
427     }
428    
429     $self->{s_kwd} = '';
430     #
431     } elsif ($self->{nc} == -1) {
432    
433     $self->{s_kwd} = '';
434     return ({type => END_OF_FILE_TOKEN,
435     line => $self->{line}, column => $self->{column}});
436     last A; ## TODO: ok?
437     } else {
438    
439     $self->{s_kwd} = '';
440     #
441     }
442    
443     # Anything else
444     my $token = {type => CHARACTER_TOKEN,
445     data => chr $self->{nc},
446     line => $self->{line}, column => $self->{column},
447     };
448     if ($self->{read_until}->($token->{data}, q[-!<>&],
449     length $token->{data})) {
450     $self->{s_kwd} = '';
451     }
452    
453     ## Stay in the data state.
454     if ($self->{content_model} == PCDATA_CONTENT_MODEL) {
455    
456     $self->{state} = PCDATA_STATE;
457     } else {
458    
459     ## Stay in the state.
460     }
461    
462     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
463     $self->{line_prev} = $self->{line};
464     $self->{column_prev} = $self->{column};
465     $self->{column}++;
466     $self->{nc}
467     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
468     } else {
469     $self->{set_nc}->($self);
470     }
471    
472     return ($token);
473     redo A;
474     } elsif ($self->{state} == TAG_OPEN_STATE) {
475     if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
476     if ($self->{nc} == 0x002F) { # /
477    
478    
479     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
480     $self->{line_prev} = $self->{line};
481     $self->{column_prev} = $self->{column};
482     $self->{column}++;
483     $self->{nc}
484     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
485     } else {
486     $self->{set_nc}->($self);
487     }
488    
489     $self->{state} = CLOSE_TAG_OPEN_STATE;
490     redo A;
491     } elsif ($self->{nc} == 0x0021) { # !
492    
493     $self->{s_kwd} = '<' unless $self->{escape};
494     #
495     } else {
496    
497     #
498     }
499    
500     ## reconsume
501     $self->{state} = DATA_STATE;
502     return ({type => CHARACTER_TOKEN, data => '<',
503     line => $self->{line_prev},
504     column => $self->{column_prev},
505     });
506     redo A;
507     } elsif ($self->{content_model} & CM_FULL_MARKUP) { # PCDATA
508     if ($self->{nc} == 0x0021) { # !
509    
510     $self->{state} = MARKUP_DECLARATION_OPEN_STATE;
511    
512     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
513     $self->{line_prev} = $self->{line};
514     $self->{column_prev} = $self->{column};
515     $self->{column}++;
516     $self->{nc}
517     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
518     } else {
519     $self->{set_nc}->($self);
520     }
521    
522     redo A;
523     } elsif ($self->{nc} == 0x002F) { # /
524    
525     $self->{state} = CLOSE_TAG_OPEN_STATE;
526    
527     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
528     $self->{line_prev} = $self->{line};
529     $self->{column_prev} = $self->{column};
530     $self->{column}++;
531     $self->{nc}
532     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
533     } else {
534     $self->{set_nc}->($self);
535     }
536    
537     redo A;
538     } elsif (0x0041 <= $self->{nc} and
539     $self->{nc} <= 0x005A) { # A..Z
540    
541     $self->{ct}
542     = {type => START_TAG_TOKEN,
543     tag_name => chr ($self->{nc} + 0x0020),
544     line => $self->{line_prev},
545     column => $self->{column_prev}};
546     $self->{state} = TAG_NAME_STATE;
547    
548     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
549     $self->{line_prev} = $self->{line};
550     $self->{column_prev} = $self->{column};
551     $self->{column}++;
552     $self->{nc}
553     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
554     } else {
555     $self->{set_nc}->($self);
556     }
557    
558     redo A;
559     } elsif (0x0061 <= $self->{nc} and
560     $self->{nc} <= 0x007A) { # a..z
561    
562     $self->{ct} = {type => START_TAG_TOKEN,
563     tag_name => chr ($self->{nc}),
564     line => $self->{line_prev},
565     column => $self->{column_prev}};
566     $self->{state} = TAG_NAME_STATE;
567    
568     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
569     $self->{line_prev} = $self->{line};
570     $self->{column_prev} = $self->{column};
571     $self->{column}++;
572     $self->{nc}
573     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
574     } else {
575     $self->{set_nc}->($self);
576     }
577    
578     redo A;
579     } elsif ($self->{nc} == 0x003E) { # >
580    
581     $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty start tag',
582     line => $self->{line_prev},
583     column => $self->{column_prev});
584     $self->{state} = DATA_STATE;
585    
586     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
587     $self->{line_prev} = $self->{line};
588     $self->{column_prev} = $self->{column};
589     $self->{column}++;
590     $self->{nc}
591     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
592     } else {
593     $self->{set_nc}->($self);
594     }
595    
596    
597     return ({type => CHARACTER_TOKEN, data => '<>',
598     line => $self->{line_prev},
599     column => $self->{column_prev},
600     });
601    
602     redo A;
603     } elsif ($self->{nc} == 0x003F) { # ?
604    
605     $self->{parse_error}->(level => $self->{level}->{must}, type => 'pio',
606     line => $self->{line_prev},
607     column => $self->{column_prev});
608     $self->{state} = BOGUS_COMMENT_STATE;
609     $self->{ct} = {type => COMMENT_TOKEN, data => '',
610     line => $self->{line_prev},
611     column => $self->{column_prev},
612     };
613     ## $self->{nc} is intentionally left as is
614     redo A;
615     } else {
616    
617     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare stago',
618     line => $self->{line_prev},
619     column => $self->{column_prev});
620     $self->{state} = DATA_STATE;
621     ## reconsume
622    
623     return ({type => CHARACTER_TOKEN, data => '<',
624     line => $self->{line_prev},
625     column => $self->{column_prev},
626     });
627    
628     redo A;
629     }
630     } else {
631     die "$0: $self->{content_model} in tag open";
632     }
633     } elsif ($self->{state} == CLOSE_TAG_OPEN_STATE) {
634     ## NOTE: The "close tag open state" in the spec is implemented as
635     ## |CLOSE_TAG_OPEN_STATE| and |CDATA_RCDATA_CLOSE_TAG_STATE|.
636    
637     my ($l, $c) = ($self->{line_prev}, $self->{column_prev} - 1); # "<"of"</"
638     if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
639     if (defined $self->{last_stag_name}) {
640     $self->{state} = CDATA_RCDATA_CLOSE_TAG_STATE;
641     $self->{s_kwd} = '';
642     ## Reconsume.
643     redo A;
644     } else {
645     ## No start tag token has ever been emitted
646     ## NOTE: See <http://krijnhoetmer.nl/irc-logs/whatwg/20070626#l-564>.
647    
648     $self->{state} = DATA_STATE;
649     ## Reconsume.
650     return ({type => CHARACTER_TOKEN, data => '</',
651     line => $l, column => $c,
652     });
653     redo A;
654     }
655     }
656    
657     if (0x0041 <= $self->{nc} and
658     $self->{nc} <= 0x005A) { # A..Z
659    
660     $self->{ct}
661     = {type => END_TAG_TOKEN,
662     tag_name => chr ($self->{nc} + 0x0020),
663     line => $l, column => $c};
664     $self->{state} = TAG_NAME_STATE;
665    
666     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
667     $self->{line_prev} = $self->{line};
668     $self->{column_prev} = $self->{column};
669     $self->{column}++;
670     $self->{nc}
671     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
672     } else {
673     $self->{set_nc}->($self);
674     }
675    
676     redo A;
677     } elsif (0x0061 <= $self->{nc} and
678     $self->{nc} <= 0x007A) { # a..z
679    
680     $self->{ct} = {type => END_TAG_TOKEN,
681     tag_name => chr ($self->{nc}),
682     line => $l, column => $c};
683     $self->{state} = TAG_NAME_STATE;
684    
685     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
686     $self->{line_prev} = $self->{line};
687     $self->{column_prev} = $self->{column};
688     $self->{column}++;
689     $self->{nc}
690     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
691     } else {
692     $self->{set_nc}->($self);
693     }
694    
695     redo A;
696     } elsif ($self->{nc} == 0x003E) { # >
697    
698     $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty end tag',
699     line => $self->{line_prev}, ## "<" in "</>"
700     column => $self->{column_prev} - 1);
701     $self->{state} = DATA_STATE;
702    
703     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
704     $self->{line_prev} = $self->{line};
705     $self->{column_prev} = $self->{column};
706     $self->{column}++;
707     $self->{nc}
708     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
709     } else {
710     $self->{set_nc}->($self);
711     }
712    
713     redo A;
714     } elsif ($self->{nc} == -1) {
715    
716     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare etago');
717     $self->{state} = DATA_STATE;
718     # reconsume
719    
720     return ({type => CHARACTER_TOKEN, data => '</',
721     line => $l, column => $c,
722     });
723    
724     redo A;
725     } else {
726    
727     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus end tag');
728     $self->{state} = BOGUS_COMMENT_STATE;
729     $self->{ct} = {type => COMMENT_TOKEN, data => '',
730     line => $self->{line_prev}, # "<" of "</"
731     column => $self->{column_prev} - 1,
732     };
733     ## NOTE: $self->{nc} is intentionally left as is.
734     ## Although the "anything else" case of the spec not explicitly
735     ## states that the next input character is to be reconsumed,
736     ## it will be included to the |data| of the comment token
737     ## generated from the bogus end tag, as defined in the
738     ## "bogus comment state" entry.
739     redo A;
740     }
741     } elsif ($self->{state} == CDATA_RCDATA_CLOSE_TAG_STATE) {
742     my $ch = substr $self->{last_stag_name}, length $self->{s_kwd}, 1;
743     if (length $ch) {
744     my $CH = $ch;
745     $ch =~ tr/a-z/A-Z/;
746     my $nch = chr $self->{nc};
747     if ($nch eq $ch or $nch eq $CH) {
748    
749     ## Stay in the state.
750     $self->{s_kwd} .= $nch;
751    
752     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
753     $self->{line_prev} = $self->{line};
754     $self->{column_prev} = $self->{column};
755     $self->{column}++;
756     $self->{nc}
757     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
758     } else {
759     $self->{set_nc}->($self);
760     }
761    
762     redo A;
763     } else {
764    
765     $self->{state} = DATA_STATE;
766     ## Reconsume.
767     return ({type => CHARACTER_TOKEN,
768     data => '</' . $self->{s_kwd},
769     line => $self->{line_prev},
770     column => $self->{column_prev} - 1 - length $self->{s_kwd},
771     });
772     redo A;
773     }
774     } else { # after "<{tag-name}"
775     unless ($is_space->{$self->{nc}} or
776     {
777     0x003E => 1, # >
778     0x002F => 1, # /
779     -1 => 1, # EOF
780     }->{$self->{nc}}) {
781    
782     ## Reconsume.
783     $self->{state} = DATA_STATE;
784     return ({type => CHARACTER_TOKEN,
785     data => '</' . $self->{s_kwd},
786     line => $self->{line_prev},
787     column => $self->{column_prev} - 1 - length $self->{s_kwd},
788     });
789     redo A;
790     } else {
791    
792     $self->{ct}
793     = {type => END_TAG_TOKEN,
794     tag_name => $self->{last_stag_name},
795     line => $self->{line_prev},
796     column => $self->{column_prev} - 1 - length $self->{s_kwd}};
797     $self->{state} = TAG_NAME_STATE;
798     ## Reconsume.
799     redo A;
800     }
801     }
802     } elsif ($self->{state} == TAG_NAME_STATE) {
803     if ($is_space->{$self->{nc}}) {
804    
805     $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
806    
807     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
808     $self->{line_prev} = $self->{line};
809     $self->{column_prev} = $self->{column};
810     $self->{column}++;
811     $self->{nc}
812     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
813     } else {
814     $self->{set_nc}->($self);
815     }
816    
817     redo A;
818     } elsif ($self->{nc} == 0x003E) { # >
819     if ($self->{ct}->{type} == START_TAG_TOKEN) {
820    
821     $self->{last_stag_name} = $self->{ct}->{tag_name};
822     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
823     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
824     #if ($self->{ct}->{attributes}) {
825     # ## NOTE: This should never be reached.
826     # !!! cp (36);
827     # !!! parse-error (type => 'end tag attribute');
828     #} else {
829    
830     #}
831     } else {
832     die "$0: $self->{ct}->{type}: Unknown token type";
833     }
834     $self->{state} = DATA_STATE;
835    
836     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
837     $self->{line_prev} = $self->{line};
838     $self->{column_prev} = $self->{column};
839     $self->{column}++;
840     $self->{nc}
841     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
842     } else {
843     $self->{set_nc}->($self);
844     }
845    
846    
847     return ($self->{ct}); # start tag or end tag
848    
849     redo A;
850     } elsif (0x0041 <= $self->{nc} and
851     $self->{nc} <= 0x005A) { # A..Z
852    
853     $self->{ct}->{tag_name} .= chr ($self->{nc} + 0x0020);
854     # start tag or end tag
855     ## Stay in this state
856    
857     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
858     $self->{line_prev} = $self->{line};
859     $self->{column_prev} = $self->{column};
860     $self->{column}++;
861     $self->{nc}
862     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
863     } else {
864     $self->{set_nc}->($self);
865     }
866    
867     redo A;
868     } elsif ($self->{nc} == -1) {
869     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
870     if ($self->{ct}->{type} == START_TAG_TOKEN) {
871    
872     $self->{last_stag_name} = $self->{ct}->{tag_name};
873     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
874     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
875     #if ($self->{ct}->{attributes}) {
876     # ## NOTE: This state should never be reached.
877     # !!! cp (40);
878     # !!! parse-error (type => 'end tag attribute');
879     #} else {
880    
881     #}
882     } else {
883     die "$0: $self->{ct}->{type}: Unknown token type";
884     }
885     $self->{state} = DATA_STATE;
886     # reconsume
887    
888     return ($self->{ct}); # start tag or end tag
889    
890     redo A;
891     } elsif ($self->{nc} == 0x002F) { # /
892    
893     $self->{state} = SELF_CLOSING_START_TAG_STATE;
894    
895     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
896     $self->{line_prev} = $self->{line};
897     $self->{column_prev} = $self->{column};
898     $self->{column}++;
899     $self->{nc}
900     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
901     } else {
902     $self->{set_nc}->($self);
903     }
904    
905     redo A;
906     } else {
907    
908     $self->{ct}->{tag_name} .= chr $self->{nc};
909     # start tag or end tag
910     ## Stay in the state
911    
912     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
913     $self->{line_prev} = $self->{line};
914     $self->{column_prev} = $self->{column};
915     $self->{column}++;
916     $self->{nc}
917     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
918     } else {
919     $self->{set_nc}->($self);
920     }
921    
922     redo A;
923     }
924     } elsif ($self->{state} == BEFORE_ATTRIBUTE_NAME_STATE) {
925     if ($is_space->{$self->{nc}}) {
926    
927     ## Stay in the state
928    
929     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
930     $self->{line_prev} = $self->{line};
931     $self->{column_prev} = $self->{column};
932     $self->{column}++;
933     $self->{nc}
934     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
935     } else {
936     $self->{set_nc}->($self);
937     }
938    
939     redo A;
940     } elsif ($self->{nc} == 0x003E) { # >
941     if ($self->{ct}->{type} == START_TAG_TOKEN) {
942    
943     $self->{last_stag_name} = $self->{ct}->{tag_name};
944     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
945     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
946     if ($self->{ct}->{attributes}) {
947    
948     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
949     } else {
950    
951     }
952     } else {
953     die "$0: $self->{ct}->{type}: Unknown token type";
954     }
955     $self->{state} = DATA_STATE;
956    
957     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
958     $self->{line_prev} = $self->{line};
959     $self->{column_prev} = $self->{column};
960     $self->{column}++;
961     $self->{nc}
962     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
963     } else {
964     $self->{set_nc}->($self);
965     }
966    
967    
968     return ($self->{ct}); # start tag or end tag
969    
970     redo A;
971     } elsif (0x0041 <= $self->{nc} and
972     $self->{nc} <= 0x005A) { # A..Z
973    
974     $self->{ca}
975     = {name => chr ($self->{nc} + 0x0020),
976     value => '',
977     line => $self->{line}, column => $self->{column}};
978     $self->{state} = ATTRIBUTE_NAME_STATE;
979    
980     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
981     $self->{line_prev} = $self->{line};
982     $self->{column_prev} = $self->{column};
983     $self->{column}++;
984     $self->{nc}
985     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
986     } else {
987     $self->{set_nc}->($self);
988     }
989    
990     redo A;
991     } elsif ($self->{nc} == 0x002F) { # /
992    
993     $self->{state} = SELF_CLOSING_START_TAG_STATE;
994    
995     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
996     $self->{line_prev} = $self->{line};
997     $self->{column_prev} = $self->{column};
998     $self->{column}++;
999     $self->{nc}
1000     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1001     } else {
1002     $self->{set_nc}->($self);
1003     }
1004    
1005     redo A;
1006     } elsif ($self->{nc} == -1) {
1007     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1008     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1009    
1010     $self->{last_stag_name} = $self->{ct}->{tag_name};
1011     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1012     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1013     if ($self->{ct}->{attributes}) {
1014    
1015     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1016     } else {
1017    
1018     }
1019     } else {
1020     die "$0: $self->{ct}->{type}: Unknown token type";
1021     }
1022     $self->{state} = DATA_STATE;
1023     # reconsume
1024    
1025     return ($self->{ct}); # start tag or end tag
1026    
1027     redo A;
1028     } else {
1029     if ({
1030     0x0022 => 1, # "
1031     0x0027 => 1, # '
1032     0x003D => 1, # =
1033     }->{$self->{nc}}) {
1034    
1035     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name');
1036     } else {
1037    
1038     }
1039     $self->{ca}
1040     = {name => chr ($self->{nc}),
1041     value => '',
1042     line => $self->{line}, column => $self->{column}};
1043     $self->{state} = ATTRIBUTE_NAME_STATE;
1044    
1045     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1046     $self->{line_prev} = $self->{line};
1047     $self->{column_prev} = $self->{column};
1048     $self->{column}++;
1049     $self->{nc}
1050     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1051     } else {
1052     $self->{set_nc}->($self);
1053     }
1054    
1055     redo A;
1056     }
1057     } elsif ($self->{state} == ATTRIBUTE_NAME_STATE) {
1058     my $before_leave = sub { # Commit the attribute being built ({ca}) to the current tag token ({ct}), or drop it when its name is already present.
1059     if (exists $self->{ct}->{attributes} # start tag or end tag
1060     ->{$self->{ca}->{name}}) { # MUST
1061     ## Name already recorded on this token: report the error at the line/column saved in {ca}; the earlier entry is left untouched.
1062     $self->{parse_error}->(level => $self->{level}->{must}, type => 'duplicate attribute', text => $self->{ca}->{name}, line => $self->{ca}->{line}, column => $self->{ca}->{column});
1063     ## Discard $self->{ca} # MUST
1064     } else {
1065     ## First occurrence of this name: store the whole {ca} hash under its name. Either way {ct}->{attributes} exists afterwards (the nested |exists| above autovivifies it).
1066     $self->{ct}->{attributes}->{$self->{ca}->{name}}
1067     = $self->{ca};
1068     }
1069     }; # $before_leave
1070    
1071     if ($is_space->{$self->{nc}}) {
1072    
1073     $before_leave->();
1074     $self->{state} = AFTER_ATTRIBUTE_NAME_STATE;
1075    
1076     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1077     $self->{line_prev} = $self->{line};
1078     $self->{column_prev} = $self->{column};
1079     $self->{column}++;
1080     $self->{nc}
1081     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1082     } else {
1083     $self->{set_nc}->($self);
1084     }
1085    
1086     redo A;
1087     } elsif ($self->{nc} == 0x003D) { # =
1088    
1089     $before_leave->();
1090     $self->{state} = BEFORE_ATTRIBUTE_VALUE_STATE;
1091    
1092     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1093     $self->{line_prev} = $self->{line};
1094     $self->{column_prev} = $self->{column};
1095     $self->{column}++;
1096     $self->{nc}
1097     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1098     } else {
1099     $self->{set_nc}->($self);
1100     }
1101    
1102     redo A;
1103     } elsif ($self->{nc} == 0x003E) { # >
1104     $before_leave->();
1105     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1106    
1107     $self->{last_stag_name} = $self->{ct}->{tag_name};
1108     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1109    
1110     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1111     if ($self->{ct}->{attributes}) {
1112     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1113     }
1114     } else {
1115     die "$0: $self->{ct}->{type}: Unknown token type";
1116     }
1117     $self->{state} = DATA_STATE;
1118    
1119     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1120     $self->{line_prev} = $self->{line};
1121     $self->{column_prev} = $self->{column};
1122     $self->{column}++;
1123     $self->{nc}
1124     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1125     } else {
1126     $self->{set_nc}->($self);
1127     }
1128    
1129    
1130     return ($self->{ct}); # start tag or end tag
1131    
1132     redo A;
1133     } elsif (0x0041 <= $self->{nc} and
1134     $self->{nc} <= 0x005A) { # A..Z
1135    
1136     $self->{ca}->{name} .= chr ($self->{nc} + 0x0020);
1137     ## Stay in the state
1138    
1139     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1140     $self->{line_prev} = $self->{line};
1141     $self->{column_prev} = $self->{column};
1142     $self->{column}++;
1143     $self->{nc}
1144     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1145     } else {
1146     $self->{set_nc}->($self);
1147     }
1148    
1149     redo A;
1150     } elsif ($self->{nc} == 0x002F) { # /
1151    
1152     $before_leave->();
1153     $self->{state} = SELF_CLOSING_START_TAG_STATE;
1154    
1155     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1156     $self->{line_prev} = $self->{line};
1157     $self->{column_prev} = $self->{column};
1158     $self->{column}++;
1159     $self->{nc}
1160     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1161     } else {
1162     $self->{set_nc}->($self);
1163     }
1164    
1165     redo A;
1166     } elsif ($self->{nc} == -1) {
1167     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1168     $before_leave->();
1169     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1170    
1171     $self->{last_stag_name} = $self->{ct}->{tag_name};
1172     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1173     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1174     if ($self->{ct}->{attributes}) {
1175    
1176     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1177     } else {
1178     ## NOTE: This state should never be reached.
1179    
1180     }
1181     } else {
1182     die "$0: $self->{ct}->{type}: Unknown token type";
1183     }
1184     $self->{state} = DATA_STATE;
1185     # reconsume
1186    
1187     return ($self->{ct}); # start tag or end tag
1188    
1189     redo A;
1190     } else {
1191     if ($self->{nc} == 0x0022 or # "
1192     $self->{nc} == 0x0027) { # '
1193    
1194     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name');
1195     } else {
1196    
1197     }
1198     $self->{ca}->{name} .= chr ($self->{nc});
1199     ## Stay in the state
1200    
1201     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1202     $self->{line_prev} = $self->{line};
1203     $self->{column_prev} = $self->{column};
1204     $self->{column}++;
1205     $self->{nc}
1206     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1207     } else {
1208     $self->{set_nc}->($self);
1209     }
1210    
1211     redo A;
1212     }
1213     } elsif ($self->{state} == AFTER_ATTRIBUTE_NAME_STATE) {
1214     if ($is_space->{$self->{nc}}) {
1215    
1216     ## Stay in the state
1217    
1218     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1219     $self->{line_prev} = $self->{line};
1220     $self->{column_prev} = $self->{column};
1221     $self->{column}++;
1222     $self->{nc}
1223     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1224     } else {
1225     $self->{set_nc}->($self);
1226     }
1227    
1228     redo A;
1229     } elsif ($self->{nc} == 0x003D) { # =
1230    
1231     $self->{state} = BEFORE_ATTRIBUTE_VALUE_STATE;
1232    
1233     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1234     $self->{line_prev} = $self->{line};
1235     $self->{column_prev} = $self->{column};
1236     $self->{column}++;
1237     $self->{nc}
1238     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1239     } else {
1240     $self->{set_nc}->($self);
1241     }
1242    
1243     redo A;
1244     } elsif ($self->{nc} == 0x003E) { # >
1245     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1246    
1247     $self->{last_stag_name} = $self->{ct}->{tag_name};
1248     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1249     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1250     if ($self->{ct}->{attributes}) {
1251    
1252     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1253     } else {
1254     ## NOTE: This state should never be reached.
1255    
1256     }
1257     } else {
1258     die "$0: $self->{ct}->{type}: Unknown token type";
1259     }
1260     $self->{state} = DATA_STATE;
1261    
1262     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1263     $self->{line_prev} = $self->{line};
1264     $self->{column_prev} = $self->{column};
1265     $self->{column}++;
1266     $self->{nc}
1267     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1268     } else {
1269     $self->{set_nc}->($self);
1270     }
1271    
1272    
1273     return ($self->{ct}); # start tag or end tag
1274    
1275     redo A;
1276     } elsif (0x0041 <= $self->{nc} and
1277     $self->{nc} <= 0x005A) { # A..Z
1278    
1279     $self->{ca}
1280     = {name => chr ($self->{nc} + 0x0020),
1281     value => '',
1282     line => $self->{line}, column => $self->{column}};
1283     $self->{state} = ATTRIBUTE_NAME_STATE;
1284    
1285     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1286     $self->{line_prev} = $self->{line};
1287     $self->{column_prev} = $self->{column};
1288     $self->{column}++;
1289     $self->{nc}
1290     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1291     } else {
1292     $self->{set_nc}->($self);
1293     }
1294    
1295     redo A;
1296     } elsif ($self->{nc} == 0x002F) { # /
1297    
1298     $self->{state} = SELF_CLOSING_START_TAG_STATE;
1299    
1300     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1301     $self->{line_prev} = $self->{line};
1302     $self->{column_prev} = $self->{column};
1303     $self->{column}++;
1304     $self->{nc}
1305     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1306     } else {
1307     $self->{set_nc}->($self);
1308     }
1309    
1310     redo A;
1311     } elsif ($self->{nc} == -1) {
1312     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1313     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1314    
1315     $self->{last_stag_name} = $self->{ct}->{tag_name};
1316     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1317     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1318     if ($self->{ct}->{attributes}) {
1319    
1320     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1321     } else {
1322     ## NOTE: This state should never be reached.
1323    
1324     }
1325     } else {
1326     die "$0: $self->{ct}->{type}: Unknown token type";
1327     }
1328     $self->{state} = DATA_STATE;
1329     # reconsume
1330    
1331     return ($self->{ct}); # start tag or end tag
1332    
1333     redo A;
1334     } else {
1335     if ($self->{nc} == 0x0022 or # "
1336     $self->{nc} == 0x0027) { # '
1337    
1338     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name');
1339     } else {
1340    
1341     }
1342     $self->{ca}
1343     = {name => chr ($self->{nc}),
1344     value => '',
1345     line => $self->{line}, column => $self->{column}};
1346     $self->{state} = ATTRIBUTE_NAME_STATE;
1347    
1348     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1349     $self->{line_prev} = $self->{line};
1350     $self->{column_prev} = $self->{column};
1351     $self->{column}++;
1352     $self->{nc}
1353     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1354     } else {
1355     $self->{set_nc}->($self);
1356     }
1357    
1358     redo A;
1359     }
1360     } elsif ($self->{state} == BEFORE_ATTRIBUTE_VALUE_STATE) {
1361     if ($is_space->{$self->{nc}}) {
1362    
1363     ## Stay in the state
1364    
1365     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1366     $self->{line_prev} = $self->{line};
1367     $self->{column_prev} = $self->{column};
1368     $self->{column}++;
1369     $self->{nc}
1370     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1371     } else {
1372     $self->{set_nc}->($self);
1373     }
1374    
1375     redo A;
1376     } elsif ($self->{nc} == 0x0022) { # "
1377    
1378     $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
1379    
1380     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1381     $self->{line_prev} = $self->{line};
1382     $self->{column_prev} = $self->{column};
1383     $self->{column}++;
1384     $self->{nc}
1385     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1386     } else {
1387     $self->{set_nc}->($self);
1388     }
1389    
1390     redo A;
1391     } elsif ($self->{nc} == 0x0026) { # &
1392    
1393     $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
1394     ## reconsume
1395     redo A;
1396     } elsif ($self->{nc} == 0x0027) { # '
1397    
1398     $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
1399    
1400     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1401     $self->{line_prev} = $self->{line};
1402     $self->{column_prev} = $self->{column};
1403     $self->{column}++;
1404     $self->{nc}
1405     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1406     } else {
1407     $self->{set_nc}->($self);
1408     }
1409    
1410     redo A;
1411     } elsif ($self->{nc} == 0x003E) { # >
1412     $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty unquoted attribute value');
1413     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1414    
1415     $self->{last_stag_name} = $self->{ct}->{tag_name};
1416     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1417     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1418     if ($self->{ct}->{attributes}) {
1419    
1420     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1421     } else {
1422     ## NOTE: This state should never be reached.
1423    
1424     }
1425     } else {
1426     die "$0: $self->{ct}->{type}: Unknown token type";
1427     }
1428     $self->{state} = DATA_STATE;
1429    
1430     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1431     $self->{line_prev} = $self->{line};
1432     $self->{column_prev} = $self->{column};
1433     $self->{column}++;
1434     $self->{nc}
1435     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1436     } else {
1437     $self->{set_nc}->($self);
1438     }
1439    
1440    
1441     return ($self->{ct}); # start tag or end tag
1442    
1443     redo A;
1444     } elsif ($self->{nc} == -1) {
1445     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1446     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1447    
1448     $self->{last_stag_name} = $self->{ct}->{tag_name};
1449     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1450     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1451     if ($self->{ct}->{attributes}) {
1452    
1453     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1454     } else {
1455     ## NOTE: This state should never be reached.
1456    
1457     }
1458     } else {
1459     die "$0: $self->{ct}->{type}: Unknown token type";
1460     }
1461     $self->{state} = DATA_STATE;
1462     ## reconsume
1463    
1464     return ($self->{ct}); # start tag or end tag
1465    
1466     redo A;
1467     } else {
1468     if ($self->{nc} == 0x003D) { # =
1469    
1470     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute value');
1471     } else {
1472    
1473     }
1474     $self->{ca}->{value} .= chr ($self->{nc});
1475     $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
1476    
1477     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1478     $self->{line_prev} = $self->{line};
1479     $self->{column_prev} = $self->{column};
1480     $self->{column}++;
1481     $self->{nc}
1482     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1483     } else {
1484     $self->{set_nc}->($self);
1485     }
1486    
1487     redo A;
1488     }
1489     } elsif ($self->{state} == ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE) {
1490     if ($self->{nc} == 0x0022) { # "
1491    
1492     $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1493    
1494     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1495     $self->{line_prev} = $self->{line};
1496     $self->{column_prev} = $self->{column};
1497     $self->{column}++;
1498     $self->{nc}
1499     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1500     } else {
1501     $self->{set_nc}->($self);
1502     }
1503    
1504     redo A;
1505     } elsif ($self->{nc} == 0x0026) { # &
1506    
1507     ## NOTE: In the spec, the tokenizer is switched to the
1508     ## "entity in attribute value state". In this implementation, the
1509     ## tokenizer is switched to the |ENTITY_STATE|, which is an
1510     ## implementation of the "consume a character reference" algorithm.
1511     $self->{prev_state} = $self->{state};
1512     $self->{entity_add} = 0x0022; # "
1513     $self->{state} = ENTITY_STATE;
1514    
1515     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1516     $self->{line_prev} = $self->{line};
1517     $self->{column_prev} = $self->{column};
1518     $self->{column}++;
1519     $self->{nc}
1520     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1521     } else {
1522     $self->{set_nc}->($self);
1523     }
1524    
1525     redo A;
1526     } elsif ($self->{nc} == -1) {
1527     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed attribute value');
1528     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1529    
1530     $self->{last_stag_name} = $self->{ct}->{tag_name};
1531     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1532     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1533     if ($self->{ct}->{attributes}) {
1534    
1535     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1536     } else {
1537     ## NOTE: This state should never be reached.
1538    
1539     }
1540     } else {
1541     die "$0: $self->{ct}->{type}: Unknown token type";
1542     }
1543     $self->{state} = DATA_STATE;
1544     ## reconsume
1545    
1546     return ($self->{ct}); # start tag or end tag
1547    
1548     redo A;
1549     } else {
1550    
1551     $self->{ca}->{value} .= chr ($self->{nc});
1552     $self->{read_until}->($self->{ca}->{value},
1553     q["&],
1554     length $self->{ca}->{value});
1555    
1556     ## Stay in the state
1557    
1558     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1559     $self->{line_prev} = $self->{line};
1560     $self->{column_prev} = $self->{column};
1561     $self->{column}++;
1562     $self->{nc}
1563     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1564     } else {
1565     $self->{set_nc}->($self);
1566     }
1567    
1568     redo A;
1569     }
1570     } elsif ($self->{state} == ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE) {
1571     if ($self->{nc} == 0x0027) { # '
1572    
1573     $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1574    
1575     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1576     $self->{line_prev} = $self->{line};
1577     $self->{column_prev} = $self->{column};
1578     $self->{column}++;
1579     $self->{nc}
1580     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1581     } else {
1582     $self->{set_nc}->($self);
1583     }
1584    
1585     redo A;
1586     } elsif ($self->{nc} == 0x0026) { # &
1587    
1588     ## NOTE: In the spec, the tokenizer is switched to the
1589     ## "entity in attribute value state". In this implementation, the
1590     ## tokenizer is switched to the |ENTITY_STATE|, which is an
1591     ## implementation of the "consume a character reference" algorithm.
1592     $self->{entity_add} = 0x0027; # '
1593     $self->{prev_state} = $self->{state};
1594     $self->{state} = ENTITY_STATE;
1595    
1596     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1597     $self->{line_prev} = $self->{line};
1598     $self->{column_prev} = $self->{column};
1599     $self->{column}++;
1600     $self->{nc}
1601     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1602     } else {
1603     $self->{set_nc}->($self);
1604     }
1605    
1606     redo A;
1607     } elsif ($self->{nc} == -1) {
1608     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed attribute value');
1609     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1610    
1611     $self->{last_stag_name} = $self->{ct}->{tag_name};
1612     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1613     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1614     if ($self->{ct}->{attributes}) {
1615    
1616     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1617     } else {
1618     ## NOTE: This state should never be reached.
1619    
1620     }
1621     } else {
1622     die "$0: $self->{ct}->{type}: Unknown token type";
1623     }
1624     $self->{state} = DATA_STATE;
1625     ## reconsume
1626    
1627     return ($self->{ct}); # start tag or end tag
1628    
1629     redo A;
1630     } else {
1631    
1632     $self->{ca}->{value} .= chr ($self->{nc});
1633     $self->{read_until}->($self->{ca}->{value},
1634     q['&],
1635     length $self->{ca}->{value});
1636    
1637     ## Stay in the state
1638    
1639     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1640     $self->{line_prev} = $self->{line};
1641     $self->{column_prev} = $self->{column};
1642     $self->{column}++;
1643     $self->{nc}
1644     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1645     } else {
1646     $self->{set_nc}->($self);
1647     }
1648    
1649     redo A;
1650     }
1651     } elsif ($self->{state} == ATTRIBUTE_VALUE_UNQUOTED_STATE) {
1652     if ($is_space->{$self->{nc}}) {
1653    
1654     $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1655    
1656     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1657     $self->{line_prev} = $self->{line};
1658     $self->{column_prev} = $self->{column};
1659     $self->{column}++;
1660     $self->{nc}
1661     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1662     } else {
1663     $self->{set_nc}->($self);
1664     }
1665    
1666     redo A;
1667     } elsif ($self->{nc} == 0x0026) { # &
1668    
1669     ## NOTE: In the spec, the tokenizer is switched to the
1670     ## "entity in attribute value state". In this implementation, the
1671     ## tokenizer is switched to the |ENTITY_STATE|, which is an
1672     ## implementation of the "consume a character reference" algorithm.
1673     $self->{entity_add} = -1;
1674     $self->{prev_state} = $self->{state};
1675     $self->{state} = ENTITY_STATE;
1676    
1677     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1678     $self->{line_prev} = $self->{line};
1679     $self->{column_prev} = $self->{column};
1680     $self->{column}++;
1681     $self->{nc}
1682     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1683     } else {
1684     $self->{set_nc}->($self);
1685     }
1686    
1687     redo A;
1688     } elsif ($self->{nc} == 0x003E) { # >
1689     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1690    
1691     $self->{last_stag_name} = $self->{ct}->{tag_name};
1692     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1693     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1694     if ($self->{ct}->{attributes}) {
1695    
1696     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1697     } else {
1698     ## NOTE: This state should never be reached.
1699    
1700     }
1701     } else {
1702     die "$0: $self->{ct}->{type}: Unknown token type";
1703     }
1704     $self->{state} = DATA_STATE;
1705    
1706     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1707     $self->{line_prev} = $self->{line};
1708     $self->{column_prev} = $self->{column};
1709     $self->{column}++;
1710     $self->{nc}
1711     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1712     } else {
1713     $self->{set_nc}->($self);
1714     }
1715    
1716    
1717     return ($self->{ct}); # start tag or end tag
1718    
1719     redo A;
1720     } elsif ($self->{nc} == -1) {
1721     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1722     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1723    
1724     $self->{last_stag_name} = $self->{ct}->{tag_name};
1725     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1726     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1727     if ($self->{ct}->{attributes}) {
1728    
1729     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1730     } else {
1731     ## NOTE: This state should never be reached.
1732    
1733     }
1734     } else {
1735     die "$0: $self->{ct}->{type}: Unknown token type";
1736     }
1737     $self->{state} = DATA_STATE;
1738     ## reconsume
1739    
1740     return ($self->{ct}); # start tag or end tag
1741    
1742     redo A;
1743     } else {
1744     if ({
1745     0x0022 => 1, # "
1746     0x0027 => 1, # '
1747     0x003D => 1, # =
1748     }->{$self->{nc}}) {
1749    
1750     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute value');
1751     } else {
1752    
1753     }
1754     $self->{ca}->{value} .= chr ($self->{nc});
1755     $self->{read_until}->($self->{ca}->{value},
1756     q["'=& >],
1757     length $self->{ca}->{value});
1758    
1759     ## Stay in the state
1760    
1761     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1762     $self->{line_prev} = $self->{line};
1763     $self->{column_prev} = $self->{column};
1764     $self->{column}++;
1765     $self->{nc}
1766     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1767     } else {
1768     $self->{set_nc}->($self);
1769     }
1770    
1771     redo A;
1772     }
1773     } elsif ($self->{state} == AFTER_ATTRIBUTE_VALUE_QUOTED_STATE) {
1774     if ($is_space->{$self->{nc}}) {
1775    
1776     $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1777    
1778     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1779     $self->{line_prev} = $self->{line};
1780     $self->{column_prev} = $self->{column};
1781     $self->{column}++;
1782     $self->{nc}
1783     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1784     } else {
1785     $self->{set_nc}->($self);
1786     }
1787    
1788     redo A;
1789     } elsif ($self->{nc} == 0x003E) { # >
1790     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1791    
1792     $self->{last_stag_name} = $self->{ct}->{tag_name};
1793     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1794     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1795     if ($self->{ct}->{attributes}) {
1796    
1797     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1798     } else {
1799     ## NOTE: This state should never be reached.
1800    
1801     }
1802     } else {
1803     die "$0: $self->{ct}->{type}: Unknown token type";
1804     }
1805     $self->{state} = DATA_STATE;
1806    
1807     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1808     $self->{line_prev} = $self->{line};
1809     $self->{column_prev} = $self->{column};
1810     $self->{column}++;
1811     $self->{nc}
1812     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1813     } else {
1814     $self->{set_nc}->($self);
1815     }
1816    
1817    
1818     return ($self->{ct}); # start tag or end tag
1819    
1820     redo A;
1821     } elsif ($self->{nc} == 0x002F) { # /
1822    
1823     $self->{state} = SELF_CLOSING_START_TAG_STATE;
1824    
1825     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1826     $self->{line_prev} = $self->{line};
1827     $self->{column_prev} = $self->{column};
1828     $self->{column}++;
1829     $self->{nc}
1830     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1831     } else {
1832     $self->{set_nc}->($self);
1833     }
1834    
1835     redo A;
1836     } elsif ($self->{nc} == -1) {
1837     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1838     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1839    
1840     $self->{last_stag_name} = $self->{ct}->{tag_name};
1841     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1842     if ($self->{ct}->{attributes}) {
1843    
1844     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1845     } else {
1846     ## NOTE: This state should never be reached.
1847    
1848     }
1849     } else {
1850     die "$0: $self->{ct}->{type}: Unknown token type";
1851     }
1852     $self->{state} = DATA_STATE;
1853     ## Reconsume.
1854     return ($self->{ct}); # start tag or end tag
1855     redo A;
1856     } else {
1857    
1858     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space between attributes');
1859     $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1860     ## reconsume
1861     redo A;
1862     }
1863     } elsif ($self->{state} == SELF_CLOSING_START_TAG_STATE) {
1864     if ($self->{nc} == 0x003E) { # >
1865     if ($self->{ct}->{type} == END_TAG_TOKEN) {
1866    
1867     $self->{parse_error}->(level => $self->{level}->{must}, type => 'nestc', token => $self->{ct});
1868     ## TODO: Different type than slash in start tag
1869     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1870     if ($self->{ct}->{attributes}) {
1871    
1872     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1873     } else {
1874    
1875     }
1876     ## TODO: Test |<title></title/>|
1877     } else {
1878    
1879     $self->{self_closing} = 1;
1880     }
1881    
1882     $self->{state} = DATA_STATE;
1883    
1884     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1885     $self->{line_prev} = $self->{line};
1886     $self->{column_prev} = $self->{column};
1887     $self->{column}++;
1888     $self->{nc}
1889     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1890     } else {
1891     $self->{set_nc}->($self);
1892     }
1893    
1894    
1895     return ($self->{ct}); # start tag or end tag
1896    
1897     redo A;
1898     } elsif ($self->{nc} == -1) {
1899     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1900     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1901    
1902     $self->{last_stag_name} = $self->{ct}->{tag_name};
1903     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1904     if ($self->{ct}->{attributes}) {
1905    
1906     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1907     } else {
1908     ## NOTE: This state should never be reached.
1909    
1910     }
1911     } else {
1912     die "$0: $self->{ct}->{type}: Unknown token type";
1913     }
1914     $self->{state} = DATA_STATE;
1915     ## Reconsume.
1916     return ($self->{ct}); # start tag or end tag
1917     redo A;
1918     } else {
1919    
1920     $self->{parse_error}->(level => $self->{level}->{must}, type => 'nestc');
1921     ## TODO: This error type is wrong.
1922     $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1923     ## Reconsume.
1924     redo A;
1925     }
1926     } elsif ($self->{state} == BOGUS_COMMENT_STATE) {
1927     ## (only happen if PCDATA state)
1928    
1929     ## NOTE: Unlike spec's "bogus comment state", this implementation
1930     ## consumes characters one-by-one basis.
1931    
1932     if ($self->{nc} == 0x003E) { # >
1933    
1934     $self->{state} = DATA_STATE;
1935    
1936     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1937     $self->{line_prev} = $self->{line};
1938     $self->{column_prev} = $self->{column};
1939     $self->{column}++;
1940     $self->{nc}
1941     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1942     } else {
1943     $self->{set_nc}->($self);
1944     }
1945    
1946    
1947     return ($self->{ct}); # comment
1948     redo A;
1949     } elsif ($self->{nc} == -1) {
1950    
1951     $self->{state} = DATA_STATE;
1952     ## reconsume
1953    
1954     return ($self->{ct}); # comment
1955     redo A;
1956     } else {
1957    
1958     $self->{ct}->{data} .= chr ($self->{nc}); # comment
1959     $self->{read_until}->($self->{ct}->{data},
1960     q[>],
1961     length $self->{ct}->{data});
1962    
1963     ## Stay in the state.
1964    
1965     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1966     $self->{line_prev} = $self->{line};
1967     $self->{column_prev} = $self->{column};
1968     $self->{column}++;
1969     $self->{nc}
1970     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1971     } else {
1972     $self->{set_nc}->($self);
1973     }
1974    
1975     redo A;
1976     }
1977     } elsif ($self->{state} == MARKUP_DECLARATION_OPEN_STATE) {
1978     ## (only happen if PCDATA state)
1979    
1980     if ($self->{nc} == 0x002D) { # -
1981    
1982     $self->{state} = MD_HYPHEN_STATE;
1983    
1984     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1985     $self->{line_prev} = $self->{line};
1986     $self->{column_prev} = $self->{column};
1987     $self->{column}++;
1988     $self->{nc}
1989     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1990     } else {
1991     $self->{set_nc}->($self);
1992     }
1993    
1994     redo A;
1995     } elsif ($self->{nc} == 0x0044 or # D
1996     $self->{nc} == 0x0064) { # d
1997     ## ASCII case-insensitive.
1998    
1999     $self->{state} = MD_DOCTYPE_STATE;
2000     $self->{s_kwd} = chr $self->{nc};
2001    
2002     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2003     $self->{line_prev} = $self->{line};
2004     $self->{column_prev} = $self->{column};
2005     $self->{column}++;
2006     $self->{nc}
2007     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2008     } else {
2009     $self->{set_nc}->($self);
2010     }
2011    
2012     redo A;
2013     } elsif ($self->{insertion_mode} & IN_FOREIGN_CONTENT_IM and
2014     $self->{open_elements}->[-1]->[1] & FOREIGN_EL and
2015     $self->{nc} == 0x005B) { # [
2016    
2017     $self->{state} = MD_CDATA_STATE;
2018     $self->{s_kwd} = '[';
2019    
2020     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2021     $self->{line_prev} = $self->{line};
2022     $self->{column_prev} = $self->{column};
2023     $self->{column}++;
2024     $self->{nc}
2025     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2026     } else {
2027     $self->{set_nc}->($self);
2028     }
2029    
2030     redo A;
2031     } else {
2032    
2033     }
2034    
2035     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
2036     line => $self->{line_prev},
2037     column => $self->{column_prev} - 1);
2038     ## Reconsume.
2039     $self->{state} = BOGUS_COMMENT_STATE;
2040     $self->{ct} = {type => COMMENT_TOKEN, data => '',
2041     line => $self->{line_prev},
2042     column => $self->{column_prev} - 1,
2043     };
2044     redo A;
2045     } elsif ($self->{state} == MD_HYPHEN_STATE) {
2046     if ($self->{nc} == 0x002D) { # -
2047    
2048     $self->{ct} = {type => COMMENT_TOKEN, data => '',
2049     line => $self->{line_prev},
2050     column => $self->{column_prev} - 2,
2051     };
2052     $self->{state} = COMMENT_START_STATE;
2053    
2054     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2055     $self->{line_prev} = $self->{line};
2056     $self->{column_prev} = $self->{column};
2057     $self->{column}++;
2058     $self->{nc}
2059     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2060     } else {
2061     $self->{set_nc}->($self);
2062     }
2063    
2064     redo A;
2065     } else {
2066    
2067     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
2068     line => $self->{line_prev},
2069     column => $self->{column_prev} - 2);
2070     $self->{state} = BOGUS_COMMENT_STATE;
2071     ## Reconsume.
2072     $self->{ct} = {type => COMMENT_TOKEN,
2073     data => '-',
2074     line => $self->{line_prev},
2075     column => $self->{column_prev} - 2,
2076     };
2077     redo A;
2078     }
2079     } elsif ($self->{state} == MD_DOCTYPE_STATE) {
2080     ## ASCII case-insensitive.
2081     if ($self->{nc} == [
2082     undef,
2083     0x004F, # O
2084     0x0043, # C
2085     0x0054, # T
2086     0x0059, # Y
2087     0x0050, # P
2088     ]->[length $self->{s_kwd}] or
2089     $self->{nc} == [
2090     undef,
2091     0x006F, # o
2092     0x0063, # c
2093     0x0074, # t
2094     0x0079, # y
2095     0x0070, # p
2096     ]->[length $self->{s_kwd}]) {
2097    
2098     ## Stay in the state.
2099     $self->{s_kwd} .= chr $self->{nc};
2100    
2101     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2102     $self->{line_prev} = $self->{line};
2103     $self->{column_prev} = $self->{column};
2104     $self->{column}++;
2105     $self->{nc}
2106     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2107     } else {
2108     $self->{set_nc}->($self);
2109     }
2110    
2111     redo A;
2112     } elsif ((length $self->{s_kwd}) == 6 and
2113     ($self->{nc} == 0x0045 or # E
2114     $self->{nc} == 0x0065)) { # e
2115    
2116     $self->{state} = DOCTYPE_STATE;
2117     $self->{ct} = {type => DOCTYPE_TOKEN,
2118     quirks => 1,
2119     line => $self->{line_prev},
2120     column => $self->{column_prev} - 7,
2121     };
2122    
2123     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2124     $self->{line_prev} = $self->{line};
2125     $self->{column_prev} = $self->{column};
2126     $self->{column}++;
2127     $self->{nc}
2128     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2129     } else {
2130     $self->{set_nc}->($self);
2131     }
2132    
2133     redo A;
2134     } else {
2135    
2136     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
2137     line => $self->{line_prev},
2138     column => $self->{column_prev} - 1 - length $self->{s_kwd});
2139     $self->{state} = BOGUS_COMMENT_STATE;
2140     ## Reconsume.
2141     $self->{ct} = {type => COMMENT_TOKEN,
2142     data => $self->{s_kwd},
2143     line => $self->{line_prev},
2144     column => $self->{column_prev} - 1 - length $self->{s_kwd},
2145     };
2146     redo A;
2147     }
2148     } elsif ($self->{state} == MD_CDATA_STATE) {
2149     if ($self->{nc} == {
2150     '[' => 0x0043, # C
2151     '[C' => 0x0044, # D
2152     '[CD' => 0x0041, # A
2153     '[CDA' => 0x0054, # T
2154     '[CDAT' => 0x0041, # A
2155     }->{$self->{s_kwd}}) {
2156    
2157     ## Stay in the state.
2158     $self->{s_kwd} .= chr $self->{nc};
2159    
2160     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2161     $self->{line_prev} = $self->{line};
2162     $self->{column_prev} = $self->{column};
2163     $self->{column}++;
2164     $self->{nc}
2165     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2166     } else {
2167     $self->{set_nc}->($self);
2168     }
2169    
2170     redo A;
2171     } elsif ($self->{s_kwd} eq '[CDATA' and
2172     $self->{nc} == 0x005B) { # [
2173    
2174     $self->{ct} = {type => CHARACTER_TOKEN,
2175     data => '',
2176     line => $self->{line_prev},
2177     column => $self->{column_prev} - 7};
2178     $self->{state} = CDATA_SECTION_STATE;
2179    
2180     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2181     $self->{line_prev} = $self->{line};
2182     $self->{column_prev} = $self->{column};
2183     $self->{column}++;
2184     $self->{nc}
2185     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2186     } else {
2187     $self->{set_nc}->($self);
2188     }
2189    
2190     redo A;
2191     } else {
2192    
2193     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
2194     line => $self->{line_prev},
2195     column => $self->{column_prev} - 1 - length $self->{s_kwd});
2196     $self->{state} = BOGUS_COMMENT_STATE;
2197     ## Reconsume.
2198     $self->{ct} = {type => COMMENT_TOKEN,
2199     data => $self->{s_kwd},
2200     line => $self->{line_prev},
2201     column => $self->{column_prev} - 1 - length $self->{s_kwd},
2202     };
2203     redo A;
2204     }
2205     } elsif ($self->{state} == COMMENT_START_STATE) {
2206     if ($self->{nc} == 0x002D) { # -
2207    
2208     $self->{state} = COMMENT_START_DASH_STATE;
2209    
2210     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2211     $self->{line_prev} = $self->{line};
2212     $self->{column_prev} = $self->{column};
2213     $self->{column}++;
2214     $self->{nc}
2215     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2216     } else {
2217     $self->{set_nc}->($self);
2218     }
2219    
2220     redo A;
2221     } elsif ($self->{nc} == 0x003E) { # >
2222    
2223     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment');
2224     $self->{state} = DATA_STATE;
2225    
2226     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2227     $self->{line_prev} = $self->{line};
2228     $self->{column_prev} = $self->{column};
2229     $self->{column}++;
2230     $self->{nc}
2231     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2232     } else {
2233     $self->{set_nc}->($self);
2234     }
2235    
2236    
2237     return ($self->{ct}); # comment
2238    
2239     redo A;
2240     } elsif ($self->{nc} == -1) {
2241    
2242     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2243     $self->{state} = DATA_STATE;
2244     ## reconsume
2245    
2246     return ($self->{ct}); # comment
2247    
2248     redo A;
2249     } else {
2250    
2251     $self->{ct}->{data} # comment
2252     .= chr ($self->{nc});
2253     $self->{state} = COMMENT_STATE;
2254    
2255     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2256     $self->{line_prev} = $self->{line};
2257     $self->{column_prev} = $self->{column};
2258     $self->{column}++;
2259     $self->{nc}
2260     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2261     } else {
2262     $self->{set_nc}->($self);
2263     }
2264    
2265     redo A;
2266     }
2267     } elsif ($self->{state} == COMMENT_START_DASH_STATE) {
2268     if ($self->{nc} == 0x002D) { # -
2269    
2270     $self->{state} = COMMENT_END_STATE;
2271    
2272     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2273     $self->{line_prev} = $self->{line};
2274     $self->{column_prev} = $self->{column};
2275     $self->{column}++;
2276     $self->{nc}
2277     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2278     } else {
2279     $self->{set_nc}->($self);
2280     }
2281    
2282     redo A;
2283     } elsif ($self->{nc} == 0x003E) { # >
2284    
2285     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment');
2286     $self->{state} = DATA_STATE;
2287    
2288     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2289     $self->{line_prev} = $self->{line};
2290     $self->{column_prev} = $self->{column};
2291     $self->{column}++;
2292     $self->{nc}
2293     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2294     } else {
2295     $self->{set_nc}->($self);
2296     }
2297    
2298    
2299     return ($self->{ct}); # comment
2300    
2301     redo A;
2302     } elsif ($self->{nc} == -1) {
2303    
2304     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2305     $self->{state} = DATA_STATE;
2306     ## reconsume
2307    
2308     return ($self->{ct}); # comment
2309    
2310     redo A;
2311     } else {
2312    
2313     $self->{ct}->{data} # comment
2314     .= '-' . chr ($self->{nc});
2315     $self->{state} = COMMENT_STATE;
2316    
2317     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2318     $self->{line_prev} = $self->{line};
2319     $self->{column_prev} = $self->{column};
2320     $self->{column}++;
2321     $self->{nc}
2322     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2323     } else {
2324     $self->{set_nc}->($self);
2325     }
2326    
2327     redo A;
2328     }
2329     } elsif ($self->{state} == COMMENT_STATE) {
2330     if ($self->{nc} == 0x002D) { # -
2331    
2332     $self->{state} = COMMENT_END_DASH_STATE;
2333    
2334     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2335     $self->{line_prev} = $self->{line};
2336     $self->{column_prev} = $self->{column};
2337     $self->{column}++;
2338     $self->{nc}
2339     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2340     } else {
2341     $self->{set_nc}->($self);
2342     }
2343    
2344     redo A;
2345     } elsif ($self->{nc} == -1) {
2346    
2347     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2348     $self->{state} = DATA_STATE;
2349     ## reconsume
2350    
2351     return ($self->{ct}); # comment
2352    
2353     redo A;
2354     } else {
2355    
2356     $self->{ct}->{data} .= chr ($self->{nc}); # comment
2357     $self->{read_until}->($self->{ct}->{data},
2358     q[-],
2359     length $self->{ct}->{data});
2360    
2361     ## Stay in the state
2362    
2363     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2364     $self->{line_prev} = $self->{line};
2365     $self->{column_prev} = $self->{column};
2366     $self->{column}++;
2367     $self->{nc}
2368     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2369     } else {
2370     $self->{set_nc}->($self);
2371     }
2372    
2373     redo A;
2374     }
2375     } elsif ($self->{state} == COMMENT_END_DASH_STATE) {
2376     if ($self->{nc} == 0x002D) { # -
2377    
2378     $self->{state} = COMMENT_END_STATE;
2379    
2380     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2381     $self->{line_prev} = $self->{line};
2382     $self->{column_prev} = $self->{column};
2383     $self->{column}++;
2384     $self->{nc}
2385     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2386     } else {
2387     $self->{set_nc}->($self);
2388     }
2389    
2390     redo A;
2391     } elsif ($self->{nc} == -1) {
2392    
2393     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2394     $self->{state} = DATA_STATE;
2395     ## reconsume
2396    
2397     return ($self->{ct}); # comment
2398    
2399     redo A;
2400     } else {
2401    
2402     $self->{ct}->{data} .= '-' . chr ($self->{nc}); # comment
2403     $self->{state} = COMMENT_STATE;
2404    
2405     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2406     $self->{line_prev} = $self->{line};
2407     $self->{column_prev} = $self->{column};
2408     $self->{column}++;
2409     $self->{nc}
2410     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2411     } else {
2412     $self->{set_nc}->($self);
2413     }
2414    
2415     redo A;
2416     }
2417     } elsif ($self->{state} == COMMENT_END_STATE) {
2418     if ($self->{nc} == 0x003E) { # >
2419    
2420     $self->{state} = DATA_STATE;
2421    
2422     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2423     $self->{line_prev} = $self->{line};
2424     $self->{column_prev} = $self->{column};
2425     $self->{column}++;
2426     $self->{nc}
2427     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2428     } else {
2429     $self->{set_nc}->($self);
2430     }
2431    
2432    
2433     return ($self->{ct}); # comment
2434    
2435     redo A;
2436     } elsif ($self->{nc} == 0x002D) { # -
2437    
2438     $self->{parse_error}->(level => $self->{level}->{must}, type => 'dash in comment',
2439     line => $self->{line_prev},
2440     column => $self->{column_prev});
2441     $self->{ct}->{data} .= '-'; # comment
2442     ## Stay in the state
2443    
2444     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2445     $self->{line_prev} = $self->{line};
2446     $self->{column_prev} = $self->{column};
2447     $self->{column}++;
2448     $self->{nc}
2449     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2450     } else {
2451     $self->{set_nc}->($self);
2452     }
2453    
2454     redo A;
2455     } elsif ($self->{nc} == -1) {
2456    
2457     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2458     $self->{state} = DATA_STATE;
2459     ## reconsume
2460    
2461     return ($self->{ct}); # comment
2462    
2463     redo A;
2464     } else {
2465    
2466     $self->{parse_error}->(level => $self->{level}->{must}, type => 'dash in comment',
2467     line => $self->{line_prev},
2468     column => $self->{column_prev});
2469     $self->{ct}->{data} .= '--' . chr ($self->{nc}); # comment
2470     $self->{state} = COMMENT_STATE;
2471    
2472     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2473     $self->{line_prev} = $self->{line};
2474     $self->{column_prev} = $self->{column};
2475     $self->{column}++;
2476     $self->{nc}
2477     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2478     } else {
2479     $self->{set_nc}->($self);
2480     }
2481    
2482     redo A;
2483     }
2484     } elsif ($self->{state} == DOCTYPE_STATE) {
2485     if ($is_space->{$self->{nc}}) {
2486    
2487     $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
2488    
2489     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2490     $self->{line_prev} = $self->{line};
2491     $self->{column_prev} = $self->{column};
2492     $self->{column}++;
2493     $self->{nc}
2494     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2495     } else {
2496     $self->{set_nc}->($self);
2497     }
2498    
2499     redo A;
2500     } else {
2501    
2502     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before DOCTYPE name');
2503     $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
2504     ## reconsume
2505     redo A;
2506     }
2507     } elsif ($self->{state} == BEFORE_DOCTYPE_NAME_STATE) {
2508     if ($is_space->{$self->{nc}}) {
2509    
2510     ## Stay in the state
2511    
2512     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2513     $self->{line_prev} = $self->{line};
2514     $self->{column_prev} = $self->{column};
2515     $self->{column}++;
2516     $self->{nc}
2517     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2518     } else {
2519     $self->{set_nc}->($self);
2520     }
2521    
2522     redo A;
2523     } elsif ($self->{nc} == 0x003E) { # >
2524    
2525     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no DOCTYPE name');
2526     $self->{state} = DATA_STATE;
2527    
2528     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2529     $self->{line_prev} = $self->{line};
2530     $self->{column_prev} = $self->{column};
2531     $self->{column}++;
2532     $self->{nc}
2533     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2534     } else {
2535     $self->{set_nc}->($self);
2536     }
2537    
2538    
2539     return ($self->{ct}); # DOCTYPE (quirks)
2540    
2541     redo A;
2542     } elsif ($self->{nc} == -1) {
2543    
2544     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no DOCTYPE name');
2545     $self->{state} = DATA_STATE;
2546     ## reconsume
2547    
2548     return ($self->{ct}); # DOCTYPE (quirks)
2549    
2550     redo A;
2551     } else {
2552    
2553     $self->{ct}->{name} = chr $self->{nc};
2554     delete $self->{ct}->{quirks};
2555     $self->{state} = DOCTYPE_NAME_STATE;
2556    
2557     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2558     $self->{line_prev} = $self->{line};
2559     $self->{column_prev} = $self->{column};
2560     $self->{column}++;
2561     $self->{nc}
2562     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2563     } else {
2564     $self->{set_nc}->($self);
2565     }
2566    
2567     redo A;
2568     }
2569     } elsif ($self->{state} == DOCTYPE_NAME_STATE) {
2570     ## ISSUE: Redundant "First," in the spec.
2571     if ($is_space->{$self->{nc}}) {
2572    
2573     $self->{state} = AFTER_DOCTYPE_NAME_STATE;
2574    
2575     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2576     $self->{line_prev} = $self->{line};
2577     $self->{column_prev} = $self->{column};
2578     $self->{column}++;
2579     $self->{nc}
2580     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2581     } else {
2582     $self->{set_nc}->($self);
2583     }
2584    
2585     redo A;
2586     } elsif ($self->{nc} == 0x003E) { # >
2587    
2588     $self->{state} = DATA_STATE;
2589    
2590     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2591     $self->{line_prev} = $self->{line};
2592     $self->{column_prev} = $self->{column};
2593     $self->{column}++;
2594     $self->{nc}
2595     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2596     } else {
2597     $self->{set_nc}->($self);
2598     }
2599    
2600    
2601     return ($self->{ct}); # DOCTYPE
2602    
2603     redo A;
2604     } elsif ($self->{nc} == -1) {
2605    
2606     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
2607     $self->{state} = DATA_STATE;
2608     ## reconsume
2609    
2610     $self->{ct}->{quirks} = 1;
2611     return ($self->{ct}); # DOCTYPE
2612    
2613     redo A;
2614     } else {
2615    
2616     $self->{ct}->{name}
2617     .= chr ($self->{nc}); # DOCTYPE
2618     ## Stay in the state
2619    
2620     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2621     $self->{line_prev} = $self->{line};
2622     $self->{column_prev} = $self->{column};
2623     $self->{column}++;
2624     $self->{nc}
2625     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2626     } else {
2627     $self->{set_nc}->($self);
2628     }
2629    
2630     redo A;
2631     }
2632     } elsif ($self->{state} == AFTER_DOCTYPE_NAME_STATE) {
2633     if ($is_space->{$self->{nc}}) {
2634    
2635     ## Stay in the state
2636    
2637     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2638     $self->{line_prev} = $self->{line};
2639     $self->{column_prev} = $self->{column};
2640     $self->{column}++;
2641     $self->{nc}
2642     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2643     } else {
2644     $self->{set_nc}->($self);
2645     }
2646    
2647     redo A;
2648     } elsif ($self->{nc} == 0x003E) { # >
2649    
2650     $self->{state} = DATA_STATE;
2651    
2652     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2653     $self->{line_prev} = $self->{line};
2654     $self->{column_prev} = $self->{column};
2655     $self->{column}++;
2656     $self->{nc}
2657     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2658     } else {
2659     $self->{set_nc}->($self);
2660     }
2661    
2662    
2663     return ($self->{ct}); # DOCTYPE
2664    
2665     redo A;
2666     } elsif ($self->{nc} == -1) {
2667    
2668     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
2669     $self->{state} = DATA_STATE;
2670     ## reconsume
2671    
2672     $self->{ct}->{quirks} = 1;
2673     return ($self->{ct}); # DOCTYPE
2674    
2675     redo A;
2676     } elsif ($self->{nc} == 0x0050 or # P
2677     $self->{nc} == 0x0070) { # p
2678     $self->{state} = PUBLIC_STATE;
2679     $self->{s_kwd} = chr $self->{nc};
2680    
2681     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2682     $self->{line_prev} = $self->{line};
2683     $self->{column_prev} = $self->{column};
2684     $self->{column}++;
2685     $self->{nc}
2686     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2687     } else {
2688     $self->{set_nc}->($self);
2689     }
2690    
2691     redo A;
2692     } elsif ($self->{nc} == 0x0053 or # S
2693     $self->{nc} == 0x0073) { # s
2694     $self->{state} = SYSTEM_STATE;
2695     $self->{s_kwd} = chr $self->{nc};
2696    
2697     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2698     $self->{line_prev} = $self->{line};
2699     $self->{column_prev} = $self->{column};
2700     $self->{column}++;
2701     $self->{nc}
2702     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2703     } else {
2704     $self->{set_nc}->($self);
2705     }
2706    
2707     redo A;
2708     } else {
2709    
2710     $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after DOCTYPE name');
2711     $self->{ct}->{quirks} = 1;
2712    
2713     $self->{state} = BOGUS_DOCTYPE_STATE;
2714    
2715     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2716     $self->{line_prev} = $self->{line};
2717     $self->{column_prev} = $self->{column};
2718     $self->{column}++;
2719     $self->{nc}
2720     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2721     } else {
2722     $self->{set_nc}->($self);
2723     }
2724    
2725     redo A;
2726     }
2727     } elsif ($self->{state} == PUBLIC_STATE) {
2728     ## ASCII case-insensitive
2729     if ($self->{nc} == [
2730     undef,
2731     0x0055, # U
2732     0x0042, # B
2733     0x004C, # L
2734     0x0049, # I
2735     ]->[length $self->{s_kwd}] or
2736     $self->{nc} == [
2737     undef,
2738     0x0075, # u
2739     0x0062, # b
2740     0x006C, # l
2741     0x0069, # i
2742     ]->[length $self->{s_kwd}]) {
2743    
2744     ## Stay in the state.
2745     $self->{s_kwd} .= chr $self->{nc};
2746    
2747     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2748     $self->{line_prev} = $self->{line};
2749     $self->{column_prev} = $self->{column};
2750     $self->{column}++;
2751     $self->{nc}
2752     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2753     } else {
2754     $self->{set_nc}->($self);
2755     }
2756    
2757     redo A;
2758     } elsif ((length $self->{s_kwd}) == 5 and
2759     ($self->{nc} == 0x0043 or # C
2760     $self->{nc} == 0x0063)) { # c
2761    
2762     $self->{state} = BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
2763    
2764     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2765     $self->{line_prev} = $self->{line};
2766     $self->{column_prev} = $self->{column};
2767     $self->{column}++;
2768     $self->{nc}
2769     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2770     } else {
2771     $self->{set_nc}->($self);
2772     }
2773    
2774     redo A;
2775     } else {
2776    
2777     $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after DOCTYPE name',
2778     line => $self->{line_prev},
2779     column => $self->{column_prev} + 1 - length $self->{s_kwd});
2780     $self->{ct}->{quirks} = 1;
2781    
2782     $self->{state} = BOGUS_DOCTYPE_STATE;
2783     ## Reconsume.
2784     redo A;
2785     }
2786     } elsif ($self->{state} == SYSTEM_STATE) {
2787     ## ASCII case-insensitive
2788     if ($self->{nc} == [
2789     undef,
2790     0x0059, # Y
2791     0x0053, # S
2792     0x0054, # T
2793     0x0045, # E
2794     ]->[length $self->{s_kwd}] or
2795     $self->{nc} == [
2796     undef,
2797     0x0079, # y
2798     0x0073, # s
2799     0x0074, # t
2800     0x0065, # e
2801     ]->[length $self->{s_kwd}]) {
2802    
2803     ## Stay in the state.
2804     $self->{s_kwd} .= chr $self->{nc};
2805    
2806     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2807     $self->{line_prev} = $self->{line};
2808     $self->{column_prev} = $self->{column};
2809     $self->{column}++;
2810     $self->{nc}
2811     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2812     } else {
2813     $self->{set_nc}->($self);
2814     }
2815    
2816     redo A;
2817     } elsif ((length $self->{s_kwd}) == 5 and
2818     ($self->{nc} == 0x004D or # M
2819     $self->{nc} == 0x006D)) { # m
2820    
2821     $self->{state} = BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
2822    
2823     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2824     $self->{line_prev} = $self->{line};
2825     $self->{column_prev} = $self->{column};
2826     $self->{column}++;
2827     $self->{nc}
2828     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2829     } else {
2830     $self->{set_nc}->($self);
2831     }
2832    
2833     redo A;
2834     } else {
2835    
2836     $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after DOCTYPE name',
2837     line => $self->{line_prev},
2838     column => $self->{column_prev} + 1 - length $self->{s_kwd});
2839     $self->{ct}->{quirks} = 1;
2840    
2841     $self->{state} = BOGUS_DOCTYPE_STATE;
2842     ## Reconsume.
2843     redo A;
2844     }
2845     } elsif ($self->{state} == BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE) {
2846     if ($is_space->{$self->{nc}}) {
2847    
2848     ## Stay in the state
2849    
2850     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2851     $self->{line_prev} = $self->{line};
2852     $self->{column_prev} = $self->{column};
2853     $self->{column}++;
2854     $self->{nc}
2855     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2856     } else {
2857     $self->{set_nc}->($self);
2858     }
2859    
2860     redo A;
2861     } elsif ($self->{nc} eq 0x0022) { # "
2862    
2863     $self->{ct}->{pubid} = ''; # DOCTYPE
2864     $self->{state} = DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE;
2865    
2866     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2867     $self->{line_prev} = $self->{line};
2868     $self->{column_prev} = $self->{column};
2869     $self->{column}++;
2870     $self->{nc}
2871     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2872     } else {
2873     $self->{set_nc}->($self);
2874     }
2875    
2876     redo A;
2877     } elsif ($self->{nc} eq 0x0027) { # '
2878    
2879     $self->{ct}->{pubid} = ''; # DOCTYPE
2880     $self->{state} = DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE;
2881    
2882     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2883     $self->{line_prev} = $self->{line};
2884     $self->{column_prev} = $self->{column};
2885     $self->{column}++;
2886     $self->{nc}
2887     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2888     } else {
2889     $self->{set_nc}->($self);
2890     }
2891    
2892     redo A;
2893     } elsif ($self->{nc} eq 0x003E) { # >
2894    
2895     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no PUBLIC literal');
2896    
2897     $self->{state} = DATA_STATE;
2898    
2899     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2900     $self->{line_prev} = $self->{line};
2901     $self->{column_prev} = $self->{column};
2902     $self->{column}++;
2903     $self->{nc}
2904     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2905     } else {
2906     $self->{set_nc}->($self);
2907     }
2908    
2909    
2910     $self->{ct}->{quirks} = 1;
2911     return ($self->{ct}); # DOCTYPE
2912    
2913     redo A;
2914     } elsif ($self->{nc} == -1) {
2915    
2916     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
2917    
2918     $self->{state} = DATA_STATE;
2919     ## reconsume
2920    
2921     $self->{ct}->{quirks} = 1;
2922     return ($self->{ct}); # DOCTYPE
2923    
2924     redo A;
2925     } else {
2926    
2927     $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after PUBLIC');
2928     $self->{ct}->{quirks} = 1;
2929    
2930     $self->{state} = BOGUS_DOCTYPE_STATE;
2931    
2932     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2933     $self->{line_prev} = $self->{line};
2934     $self->{column_prev} = $self->{column};
2935     $self->{column}++;
2936     $self->{nc}
2937     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2938     } else {
2939     $self->{set_nc}->($self);
2940     }
2941    
2942     redo A;
2943     }
2944     } elsif ($self->{state} == DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE) {
2945     if ($self->{nc} == 0x0022) { # "
2946    
2947     $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
2948    
2949     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2950     $self->{line_prev} = $self->{line};
2951     $self->{column_prev} = $self->{column};
2952     $self->{column}++;
2953     $self->{nc}
2954     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2955     } else {
2956     $self->{set_nc}->($self);
2957     }
2958    
2959     redo A;
2960     } elsif ($self->{nc} == 0x003E) { # >
2961    
2962     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');
2963    
2964     $self->{state} = DATA_STATE;
2965    
2966     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2967     $self->{line_prev} = $self->{line};
2968     $self->{column_prev} = $self->{column};
2969     $self->{column}++;
2970     $self->{nc}
2971     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2972     } else {
2973     $self->{set_nc}->($self);
2974     }
2975    
2976    
2977     $self->{ct}->{quirks} = 1;
2978     return ($self->{ct}); # DOCTYPE
2979    
2980     redo A;
2981     } elsif ($self->{nc} == -1) {
2982    
2983     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');
2984    
2985     $self->{state} = DATA_STATE;
2986     ## reconsume
2987    
2988     $self->{ct}->{quirks} = 1;
2989     return ($self->{ct}); # DOCTYPE
2990    
2991     redo A;
2992     } else {
2993    
2994     $self->{ct}->{pubid} # DOCTYPE
2995     .= chr $self->{nc};
2996     $self->{read_until}->($self->{ct}->{pubid}, q[">],
2997     length $self->{ct}->{pubid});
2998    
2999     ## Stay in the state
3000    
3001     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3002     $self->{line_prev} = $self->{line};
3003     $self->{column_prev} = $self->{column};
3004     $self->{column}++;
3005     $self->{nc}
3006     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3007     } else {
3008     $self->{set_nc}->($self);
3009     }
3010    
3011     redo A;
3012     }
3013     } elsif ($self->{state} == DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE) {
3014     if ($self->{nc} == 0x0027) { # '
3015    
3016     $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
3017    
3018     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3019     $self->{line_prev} = $self->{line};
3020     $self->{column_prev} = $self->{column};
3021     $self->{column}++;
3022     $self->{nc}
3023     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3024     } else {
3025     $self->{set_nc}->($self);
3026     }
3027    
3028     redo A;
3029     } elsif ($self->{nc} == 0x003E) { # >
3030    
3031     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');
3032    
3033     $self->{state} = DATA_STATE;
3034    
3035     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3036     $self->{line_prev} = $self->{line};
3037     $self->{column_prev} = $self->{column};
3038     $self->{column}++;
3039     $self->{nc}
3040     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3041     } else {
3042     $self->{set_nc}->($self);
3043     }
3044    
3045    
3046     $self->{ct}->{quirks} = 1;
3047     return ($self->{ct}); # DOCTYPE
3048    
3049     redo A;
3050     } elsif ($self->{nc} == -1) {
3051    
3052     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');
3053    
3054     $self->{state} = DATA_STATE;
3055     ## reconsume
3056    
3057     $self->{ct}->{quirks} = 1;
3058     return ($self->{ct}); # DOCTYPE
3059    
3060     redo A;
3061     } else {
3062    
3063     $self->{ct}->{pubid} # DOCTYPE
3064     .= chr $self->{nc};
3065     $self->{read_until}->($self->{ct}->{pubid}, q['>],
3066     length $self->{ct}->{pubid});
3067    
3068     ## Stay in the state
3069    
3070     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3071     $self->{line_prev} = $self->{line};
3072     $self->{column_prev} = $self->{column};
3073     $self->{column}++;
3074     $self->{nc}
3075     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3076     } else {
3077     $self->{set_nc}->($self);
3078     }
3079    
3080     redo A;
3081     }
3082     } elsif ($self->{state} == AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE) {
3083     if ($is_space->{$self->{nc}}) {
3084    
3085     ## Stay in the state
3086    
3087     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3088     $self->{line_prev} = $self->{line};
3089     $self->{column_prev} = $self->{column};
3090     $self->{column}++;
3091     $self->{nc}
3092     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3093     } else {
3094     $self->{set_nc}->($self);
3095     }
3096    
3097     redo A;
3098     } elsif ($self->{nc} == 0x0022) { # "
3099    
3100     $self->{ct}->{sysid} = ''; # DOCTYPE
3101     $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
3102    
3103     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3104     $self->{line_prev} = $self->{line};
3105     $self->{column_prev} = $self->{column};
3106     $self->{column}++;
3107     $self->{nc}
3108     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3109     } else {
3110     $self->{set_nc}->($self);
3111     }
3112    
3113     redo A;
3114     } elsif ($self->{nc} == 0x0027) { # '
3115    
3116     $self->{ct}->{sysid} = ''; # DOCTYPE
3117     $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
3118    
3119     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3120     $self->{line_prev} = $self->{line};
3121     $self->{column_prev} = $self->{column};
3122     $self->{column}++;
3123     $self->{nc}
3124     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3125     } else {
3126     $self->{set_nc}->($self);
3127     }
3128    
3129     redo A;
3130     } elsif ($self->{nc} == 0x003E) { # >
3131    
3132     $self->{state} = DATA_STATE;
3133    
3134     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3135     $self->{line_prev} = $self->{line};
3136     $self->{column_prev} = $self->{column};
3137     $self->{column}++;
3138     $self->{nc}
3139     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3140     } else {
3141     $self->{set_nc}->($self);
3142     }
3143    
3144    
3145     return ($self->{ct}); # DOCTYPE
3146    
3147     redo A;
3148     } elsif ($self->{nc} == -1) {
3149    
3150     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
3151    
3152     $self->{state} = DATA_STATE;
3153     ## reconsume
3154    
3155     $self->{ct}->{quirks} = 1;
3156     return ($self->{ct}); # DOCTYPE
3157    
3158     redo A;
3159     } else {
3160    
3161     $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after PUBLIC literal');
3162     $self->{ct}->{quirks} = 1;
3163    
3164     $self->{state} = BOGUS_DOCTYPE_STATE;
3165    
3166     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3167     $self->{line_prev} = $self->{line};
3168     $self->{column_prev} = $self->{column};
3169     $self->{column}++;
3170     $self->{nc}
3171     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3172     } else {
3173     $self->{set_nc}->($self);
3174     }
3175    
3176     redo A;
3177     }
3178     } elsif ($self->{state} == BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
3179     if ($is_space->{$self->{nc}}) {
3180    
3181     ## Stay in the state
3182    
3183     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3184     $self->{line_prev} = $self->{line};
3185     $self->{column_prev} = $self->{column};
3186     $self->{column}++;
3187     $self->{nc}
3188     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3189     } else {
3190     $self->{set_nc}->($self);
3191     }
3192    
3193     redo A;
3194     } elsif ($self->{nc} == 0x0022) { # "
3195    
3196     $self->{ct}->{sysid} = ''; # DOCTYPE
3197     $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
3198    
3199     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3200     $self->{line_prev} = $self->{line};
3201     $self->{column_prev} = $self->{column};
3202     $self->{column}++;
3203     $self->{nc}
3204     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3205     } else {
3206     $self->{set_nc}->($self);
3207     }
3208    
3209     redo A;
3210     } elsif ($self->{nc} == 0x0027) { # '
3211    
3212     $self->{ct}->{sysid} = ''; # DOCTYPE
3213     $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
3214    
3215     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3216     $self->{line_prev} = $self->{line};
3217     $self->{column_prev} = $self->{column};
3218     $self->{column}++;
3219     $self->{nc}
3220     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3221     } else {
3222     $self->{set_nc}->($self);
3223     }
3224    
3225     redo A;
3226     } elsif ($self->{nc} == 0x003E) { # >
3227    
3228     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal');
3229     $self->{state} = DATA_STATE;
3230    
3231     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3232     $self->{line_prev} = $self->{line};
3233     $self->{column_prev} = $self->{column};
3234     $self->{column}++;
3235     $self->{nc}
3236     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3237     } else {
3238     $self->{set_nc}->($self);
3239     }
3240    
3241    
3242     $self->{ct}->{quirks} = 1;
3243     return ($self->{ct}); # DOCTYPE
3244    
3245     redo A;
3246     } elsif ($self->{nc} == -1) {
3247    
3248     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
3249    
3250     $self->{state} = DATA_STATE;
3251     ## reconsume
3252    
3253     $self->{ct}->{quirks} = 1;
3254     return ($self->{ct}); # DOCTYPE
3255    
3256     redo A;
3257     } else {
3258    
3259     $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after SYSTEM');
3260     $self->{ct}->{quirks} = 1;
3261    
3262     $self->{state} = BOGUS_DOCTYPE_STATE;
3263    
3264     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3265     $self->{line_prev} = $self->{line};
3266     $self->{column_prev} = $self->{column};
3267     $self->{column}++;
3268     $self->{nc}
3269     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3270     } else {
3271     $self->{set_nc}->($self);
3272     }
3273    
3274     redo A;
3275     }
3276     } elsif ($self->{state} == DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE) {
3277     if ($self->{nc} == 0x0022) { # "
3278    
3279     $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
3280    
3281     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3282     $self->{line_prev} = $self->{line};
3283     $self->{column_prev} = $self->{column};
3284     $self->{column}++;
3285     $self->{nc}
3286     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3287     } else {
3288     $self->{set_nc}->($self);
3289     }
3290    
3291     redo A;
3292     } elsif ($self->{nc} == 0x003E) { # >
3293    
3294     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');
3295    
3296     $self->{state} = DATA_STATE;
3297    
3298     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3299     $self->{line_prev} = $self->{line};
3300     $self->{column_prev} = $self->{column};
3301     $self->{column}++;
3302     $self->{nc}
3303     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3304     } else {
3305     $self->{set_nc}->($self);
3306     }
3307    
3308    
3309     $self->{ct}->{quirks} = 1;
3310     return ($self->{ct}); # DOCTYPE
3311    
3312     redo A;
3313     } elsif ($self->{nc} == -1) {
3314    
3315     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');
3316    
3317     $self->{state} = DATA_STATE;
3318     ## reconsume
3319    
3320     $self->{ct}->{quirks} = 1;
3321     return ($self->{ct}); # DOCTYPE
3322    
3323     redo A;
3324     } else {
3325    
3326     $self->{ct}->{sysid} # DOCTYPE
3327     .= chr $self->{nc};
3328     $self->{read_until}->($self->{ct}->{sysid}, q[">],
3329     length $self->{ct}->{sysid});
3330    
3331     ## Stay in the state
3332    
3333     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3334     $self->{line_prev} = $self->{line};
3335     $self->{column_prev} = $self->{column};
3336     $self->{column}++;
3337     $self->{nc}
3338     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3339     } else {
3340     $self->{set_nc}->($self);
3341     }
3342    
3343     redo A;
3344     }
3345     } elsif ($self->{state} == DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE) {
3346     if ($self->{nc} == 0x0027) { # '
3347    
3348     $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
3349    
3350     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3351     $self->{line_prev} = $self->{line};
3352     $self->{column_prev} = $self->{column};
3353     $self->{column}++;
3354     $self->{nc}
3355     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3356     } else {
3357     $self->{set_nc}->($self);
3358     }
3359    
3360     redo A;
3361     } elsif ($self->{nc} == 0x003E) { # >
3362    
3363     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');
3364    
3365     $self->{state} = DATA_STATE;
3366    
3367     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3368     $self->{line_prev} = $self->{line};
3369     $self->{column_prev} = $self->{column};
3370     $self->{column}++;
3371     $self->{nc}
3372     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3373     } else {
3374     $self->{set_nc}->($self);
3375     }
3376    
3377    
3378     $self->{ct}->{quirks} = 1;
3379     return ($self->{ct}); # DOCTYPE
3380    
3381     redo A;
3382     } elsif ($self->{nc} == -1) {
3383    
3384     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');
3385    
3386     $self->{state} = DATA_STATE;
3387     ## reconsume
3388    
3389     $self->{ct}->{quirks} = 1;
3390     return ($self->{ct}); # DOCTYPE
3391    
3392     redo A;
3393     } else {
3394    
3395     $self->{ct}->{sysid} # DOCTYPE
3396     .= chr $self->{nc};
3397     $self->{read_until}->($self->{ct}->{sysid}, q['>],
3398     length $self->{ct}->{sysid});
3399    
3400     ## Stay in the state
3401    
3402     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3403     $self->{line_prev} = $self->{line};
3404     $self->{column_prev} = $self->{column};
3405     $self->{column}++;
3406     $self->{nc}
3407     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3408     } else {
3409     $self->{set_nc}->($self);
3410     }
3411    
3412     redo A;
3413     }
3414     } elsif ($self->{state} == AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
3415     if ($is_space->{$self->{nc}}) {
3416    
3417     ## Stay in the state
3418    
3419     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3420     $self->{line_prev} = $self->{line};
3421     $self->{column_prev} = $self->{column};
3422     $self->{column}++;
3423     $self->{nc}
3424     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3425     } else {
3426     $self->{set_nc}->($self);
3427     }
3428    
3429     redo A;
3430     } elsif ($self->{nc} == 0x003E) { # >
3431    
3432     $self->{state} = DATA_STATE;
3433    
3434     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3435     $self->{line_prev} = $self->{line};
3436     $self->{column_prev} = $self->{column};
3437     $self->{column}++;
3438     $self->{nc}
3439     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3440     } else {
3441     $self->{set_nc}->($self);
3442     }
3443    
3444    
3445     return ($self->{ct}); # DOCTYPE
3446    
3447     redo A;
3448     } elsif ($self->{nc} == -1) {
3449    
3450     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
3451     $self->{state} = DATA_STATE;
3452     ## reconsume
3453    
3454     $self->{ct}->{quirks} = 1;
3455     return ($self->{ct}); # DOCTYPE
3456    
3457     redo A;
3458     } else {
3459    
3460     $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after SYSTEM literal');
3461     #$self->{ct}->{quirks} = 1;
3462    
3463     $self->{state} = BOGUS_DOCTYPE_STATE;
3464    
3465     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3466     $self->{line_prev} = $self->{line};
3467     $self->{column_prev} = $self->{column};
3468     $self->{column}++;
3469     $self->{nc}
3470     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3471     } else {
3472     $self->{set_nc}->($self);
3473     }
3474    
3475     redo A;
3476     }
3477     } elsif ($self->{state} == BOGUS_DOCTYPE_STATE) {
3478     if ($self->{nc} == 0x003E) { # >
3479    
3480     $self->{state} = DATA_STATE;
3481    
3482     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3483     $self->{line_prev} = $self->{line};
3484     $self->{column_prev} = $self->{column};
3485     $self->{column}++;
3486     $self->{nc}
3487     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3488     } else {
3489     $self->{set_nc}->($self);
3490     }
3491    
3492    
3493     return ($self->{ct}); # DOCTYPE
3494    
3495     redo A;
3496     } elsif ($self->{nc} == -1) {
3497    
3498     $self->{state} = DATA_STATE;
3499     ## reconsume
3500    
3501     return ($self->{ct}); # DOCTYPE
3502    
3503     redo A;
3504     } else {
3505    
3506     my $s = '';
3507     $self->{read_until}->($s, q[>], 0);
3508    
3509     ## Stay in the state
3510    
3511     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3512     $self->{line_prev} = $self->{line};
3513     $self->{column_prev} = $self->{column};
3514     $self->{column}++;
3515     $self->{nc}
3516     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3517     } else {
3518     $self->{set_nc}->($self);
3519     }
3520    
3521     redo A;
3522     }
3523     } elsif ($self->{state} == CDATA_SECTION_STATE) {
3524     ## NOTE: "CDATA section state" in the spec is jointly implemented
3525     ## by three states, |CDATA_SECTION_STATE|, |CDATA_SECTION_MSE1_STATE|,
3526     ## and |CDATA_SECTION_MSE2_STATE|.
3527    
3528     if ($self->{nc} == 0x005D) { # ]
3529    
3530     $self->{state} = CDATA_SECTION_MSE1_STATE;
3531    
3532     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3533     $self->{line_prev} = $self->{line};
3534     $self->{column_prev} = $self->{column};
3535     $self->{column}++;
3536     $self->{nc}
3537     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3538     } else {
3539     $self->{set_nc}->($self);
3540     }
3541    
3542     redo A;
3543     } elsif ($self->{nc} == -1) {
3544     $self->{state} = DATA_STATE;
3545    
3546     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3547     $self->{line_prev} = $self->{line};
3548     $self->{column_prev} = $self->{column};
3549     $self->{column}++;
3550     $self->{nc}
3551     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3552     } else {
3553     $self->{set_nc}->($self);
3554     }
3555    
3556     if (length $self->{ct}->{data}) { # character
3557    
3558     return ($self->{ct}); # character
3559     } else {
3560    
3561     ## No token to emit. $self->{ct} is discarded.
3562     }
3563     redo A;
3564     } else {
3565    
3566     $self->{ct}->{data} .= chr $self->{nc};
3567     $self->{read_until}->($self->{ct}->{data},
3568     q<]>,
3569     length $self->{ct}->{data});
3570    
3571     ## Stay in the state.
3572    
3573     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3574     $self->{line_prev} = $self->{line};
3575     $self->{column_prev} = $self->{column};
3576     $self->{column}++;
3577     $self->{nc}
3578     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3579     } else {
3580     $self->{set_nc}->($self);
3581     }
3582    
3583     redo A;
3584     }
3585    
3586     ## ISSUE: "text tokens" in spec.
3587     } elsif ($self->{state} == CDATA_SECTION_MSE1_STATE) {
3588     if ($self->{nc} == 0x005D) { # ]
3589    
3590     $self->{state} = CDATA_SECTION_MSE2_STATE;
3591    
3592     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3593     $self->{line_prev} = $self->{line};
3594     $self->{column_prev} = $self->{column};
3595     $self->{column}++;
3596     $self->{nc}
3597     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3598     } else {
3599     $self->{set_nc}->($self);
3600     }
3601    
3602     redo A;
3603     } else {
3604    
3605     $self->{ct}->{data} .= ']';
3606     $self->{state} = CDATA_SECTION_STATE;
3607     ## Reconsume.
3608     redo A;
3609     }
3610     } elsif ($self->{state} == CDATA_SECTION_MSE2_STATE) {
3611     if ($self->{nc} == 0x003E) { # >
3612     $self->{state} = DATA_STATE;
3613    
3614     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3615     $self->{line_prev} = $self->{line};
3616     $self->{column_prev} = $self->{column};
3617     $self->{column}++;
3618     $self->{nc}
3619     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3620     } else {
3621     $self->{set_nc}->($self);
3622     }
3623    
3624     if (length $self->{ct}->{data}) { # character
3625    
3626     return ($self->{ct}); # character
3627     } else {
3628    
3629     ## No token to emit. $self->{ct} is discarded.
3630     }
3631     redo A;
3632     } elsif ($self->{nc} == 0x005D) { # ]
3633     # character
3634     $self->{ct}->{data} .= ']'; ## Add first "]" of "]]]".
3635     ## Stay in the state.
3636    
3637     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3638     $self->{line_prev} = $self->{line};
3639     $self->{column_prev} = $self->{column};
3640     $self->{column}++;
3641     $self->{nc}
3642     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3643     } else {
3644     $self->{set_nc}->($self);
3645     }
3646    
3647     redo A;
3648     } else {
3649    
3650     $self->{ct}->{data} .= ']]'; # character
3651     $self->{state} = CDATA_SECTION_STATE;
3652     ## Reconsume.
3653     redo A;
3654     }
3655     } elsif ($self->{state} == ENTITY_STATE) {
3656     if ($is_space->{$self->{nc}} or
3657     {
3658     0x003C => 1, 0x0026 => 1, -1 => 1, # <, &
3659     $self->{entity_add} => 1,
3660     }->{$self->{nc}}) {
3661    
3662     ## Don't consume
3663     ## No error
3664     ## Return nothing.
3665     #
3666     } elsif ($self->{nc} == 0x0023) { # #
3667    
3668     $self->{state} = ENTITY_HASH_STATE;
3669     $self->{s_kwd} = '#';
3670    
3671     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3672     $self->{line_prev} = $self->{line};
3673     $self->{column_prev} = $self->{column};
3674     $self->{column}++;
3675     $self->{nc}
3676     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3677     } else {
3678     $self->{set_nc}->($self);
3679     }
3680    
3681     redo A;
3682     } elsif ((0x0041 <= $self->{nc} and
3683     $self->{nc} <= 0x005A) or # A..Z
3684     (0x0061 <= $self->{nc} and
3685     $self->{nc} <= 0x007A)) { # a..z
3686    
3687     require Whatpm::_NamedEntityList;
3688     $self->{state} = ENTITY_NAME_STATE;
3689     $self->{s_kwd} = chr $self->{nc};
3690     $self->{entity__value} = $self->{s_kwd};
3691     $self->{entity__match} = 0;
3692    
3693     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3694     $self->{line_prev} = $self->{line};
3695     $self->{column_prev} = $self->{column};
3696     $self->{column}++;
3697     $self->{nc}
3698     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3699     } else {
3700     $self->{set_nc}->($self);
3701     }
3702    
3703     redo A;
3704     } else {
3705    
3706     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare ero');
3707     ## Return nothing.
3708     #
3709     }
3710    
3711     ## NOTE: No character is consumed by the "consume a character
3712     ## reference" algorithm. In other words, there is an "&" character
3713     ## that does not introduce a character reference, which would be
3714     ## appended to the parent element or the attribute value in later
3715     ## process of the tokenizer.
3716    
3717     if ($self->{prev_state} == DATA_STATE) {
3718    
3719     $self->{state} = $self->{prev_state};
3720     ## Reconsume.
3721     return ({type => CHARACTER_TOKEN, data => '&',
3722     line => $self->{line_prev},
3723     column => $self->{column_prev},
3724     });
3725     redo A;
3726     } else {
3727    
3728     $self->{ca}->{value} .= '&';
3729     $self->{state} = $self->{prev_state};
3730     ## Reconsume.
3731     redo A;
3732     }
3733     } elsif ($self->{state} == ENTITY_HASH_STATE) {
3734     if ($self->{nc} == 0x0078 or # x
3735     $self->{nc} == 0x0058) { # X
3736    
3737     $self->{state} = HEXREF_X_STATE;
3738     $self->{s_kwd} .= chr $self->{nc};
3739    
3740     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3741     $self->{line_prev} = $self->{line};
3742     $self->{column_prev} = $self->{column};
3743     $self->{column}++;
3744     $self->{nc}
3745     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3746     } else {
3747     $self->{set_nc}->($self);
3748     }
3749    
3750     redo A;
3751     } elsif (0x0030 <= $self->{nc} and
3752     $self->{nc} <= 0x0039) { # 0..9
3753    
3754     $self->{state} = NCR_NUM_STATE;
3755     $self->{s_kwd} = $self->{nc} - 0x0030;
3756    
3757     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3758     $self->{line_prev} = $self->{line};
3759     $self->{column_prev} = $self->{column};
3760     $self->{column}++;
3761     $self->{nc}
3762     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3763     } else {
3764     $self->{set_nc}->($self);
3765     }
3766    
3767     redo A;
3768     } else {
3769     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare nero',
3770     line => $self->{line_prev},
3771     column => $self->{column_prev} - 1);
3772    
3773     ## NOTE: According to the spec algorithm, nothing is returned,
3774     ## and then "&#" is appended to the parent element or the attribute
3775     ## value in the later processing.
3776    
3777     if ($self->{prev_state} == DATA_STATE) {
3778    
3779     $self->{state} = $self->{prev_state};
3780     ## Reconsume.
3781     return ({type => CHARACTER_TOKEN,
3782     data => '&#',
3783     line => $self->{line_prev},
3784     column => $self->{column_prev} - 1,
3785     });
3786     redo A;
3787     } else {
3788    
3789     $self->{ca}->{value} .= '&#';
3790     $self->{state} = $self->{prev_state};
3791     ## Reconsume.
3792     redo A;
3793     }
3794     }
3795     } elsif ($self->{state} == NCR_NUM_STATE) {
3796     if (0x0030 <= $self->{nc} and
3797     $self->{nc} <= 0x0039) { # 0..9
3798    
3799     $self->{s_kwd} *= 10;
3800     $self->{s_kwd} += $self->{nc} - 0x0030;
3801    
3802     ## Stay in the state.
3803    
3804     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3805     $self->{line_prev} = $self->{line};
3806     $self->{column_prev} = $self->{column};
3807     $self->{column}++;
3808     $self->{nc}
3809     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3810     } else {
3811     $self->{set_nc}->($self);
3812     }
3813    
3814     redo A;
3815     } elsif ($self->{nc} == 0x003B) { # ;
3816    
3817    
3818     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3819     $self->{line_prev} = $self->{line};
3820     $self->{column_prev} = $self->{column};
3821     $self->{column}++;
3822     $self->{nc}
3823     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3824     } else {
3825     $self->{set_nc}->($self);
3826     }
3827    
3828     #
3829     } else {
3830    
3831     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no refc');
3832     ## Reconsume.
3833     #
3834     }
3835    
3836     my $code = $self->{s_kwd};
3837     my $l = $self->{line_prev};
3838     my $c = $self->{column_prev};
3839     if ($charref_map->{$code}) {
3840    
3841     $self->{parse_error}->(level => $self->{level}->{must}, type => 'invalid character reference',
3842     text => (sprintf 'U+%04X', $code),
3843     line => $l, column => $c);
3844     $code = $charref_map->{$code};
3845     } elsif ($code > 0x10FFFF) {
3846    
3847     $self->{parse_error}->(level => $self->{level}->{must}, type => 'invalid character reference',
3848     text => (sprintf 'U-%08X', $code),
3849     line => $l, column => $c);
3850     $code = 0xFFFD;
3851     }
3852    
3853     if ($self->{prev_state} == DATA_STATE) {
3854    
3855     $self->{state} = $self->{prev_state};
3856     ## Reconsume.
3857     return ({type => CHARACTER_TOKEN, data => chr $code,
3858     line => $l, column => $c,
3859     });
3860     redo A;
3861     } else {
3862    
3863     $self->{ca}->{value} .= chr $code;
3864     $self->{ca}->{has_reference} = 1;
3865     $self->{state} = $self->{prev_state};
3866     ## Reconsume.
3867     redo A;
3868     }
3869     } elsif ($self->{state} == HEXREF_X_STATE) {
3870     if ((0x0030 <= $self->{nc} and $self->{nc} <= 0x0039) or
3871     (0x0041 <= $self->{nc} and $self->{nc} <= 0x0046) or
3872     (0x0061 <= $self->{nc} and $self->{nc} <= 0x0066)) {
3873     # 0..9, A..F, a..f
3874    
3875     $self->{state} = HEXREF_HEX_STATE;
3876     $self->{s_kwd} = 0;
3877     ## Reconsume.
3878     redo A;
3879     } else {
3880     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare hcro',
3881     line => $self->{line_prev},
3882     column => $self->{column_prev} - 2);
3883    
3884     ## NOTE: According to the spec algorithm, nothing is returned,
3885     ## and then "&#" followed by "X" or "x" is appended to the parent
3886     ## element or the attribute value in the later processing.
3887    
3888     if ($self->{prev_state} == DATA_STATE) {
3889    
3890     $self->{state} = $self->{prev_state};
3891     ## Reconsume.
3892     return ({type => CHARACTER_TOKEN,
3893     data => '&' . $self->{s_kwd},
3894     line => $self->{line_prev},
3895     column => $self->{column_prev} - length $self->{s_kwd},
3896     });
3897     redo A;
3898     } else {
3899    
3900     $self->{ca}->{value} .= '&' . $self->{s_kwd};
3901     $self->{state} = $self->{prev_state};
3902     ## Reconsume.
3903     redo A;
3904     }
3905     }
3906     } elsif ($self->{state} == HEXREF_HEX_STATE) {
3907     if (0x0030 <= $self->{nc} and $self->{nc} <= 0x0039) {
3908     # 0..9
3909    
3910     $self->{s_kwd} *= 0x10;
3911     $self->{s_kwd} += $self->{nc} - 0x0030;
3912     ## Stay in the state.
3913    
3914     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3915     $self->{line_prev} = $self->{line};
3916     $self->{column_prev} = $self->{column};
3917     $self->{column}++;
3918     $self->{nc}
3919     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3920     } else {
3921     $self->{set_nc}->($self);
3922     }
3923    
3924     redo A;
3925     } elsif (0x0061 <= $self->{nc} and
3926     $self->{nc} <= 0x0066) { # a..f
3927    
3928     $self->{s_kwd} *= 0x10;
3929     $self->{s_kwd} += $self->{nc} - 0x0060 + 9;
3930     ## Stay in the state.
3931    
3932     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3933     $self->{line_prev} = $self->{line};
3934     $self->{column_prev} = $self->{column};
3935     $self->{column}++;
3936     $self->{nc}
3937     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3938     } else {
3939     $self->{set_nc}->($self);
3940     }
3941    
3942     redo A;
3943     } elsif (0x0041 <= $self->{nc} and
3944     $self->{nc} <= 0x0046) { # A..F
3945    
3946     $self->{s_kwd} *= 0x10;
3947     $self->{s_kwd} += $self->{nc} - 0x0040 + 9;
3948     ## Stay in the state.
3949    
3950     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3951     $self->{line_prev} = $self->{line};
3952     $self->{column_prev} = $self->{column};
3953     $self->{column}++;
3954     $self->{nc}
3955     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3956     } else {
3957     $self->{set_nc}->($self);
3958     }
3959    
3960     redo A;
3961     } elsif ($self->{nc} == 0x003B) { # ;
3962    
3963    
3964     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3965     $self->{line_prev} = $self->{line};
3966     $self->{column_prev} = $self->{column};
3967     $self->{column}++;
3968     $self->{nc}
3969     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3970     } else {
3971     $self->{set_nc}->($self);
3972     }
3973    
3974     #
3975     } else {
3976    
3977     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no refc',
3978     line => $self->{line},
3979     column => $self->{column});
3980     ## Reconsume.
3981     #
3982     }
3983    
3984     my $code = $self->{s_kwd};
3985     my $l = $self->{line_prev};
3986     my $c = $self->{column_prev};
3987     if ($charref_map->{$code}) {
3988    
3989     $self->{parse_error}->(level => $self->{level}->{must}, type => 'invalid character reference',
3990     text => (sprintf 'U+%04X', $code),
3991     line => $l, column => $c);
3992     $code = $charref_map->{$code};
3993     } elsif ($code > 0x10FFFF) {
3994    
3995     $self->{parse_error}->(level => $self->{level}->{must}, type => 'invalid character reference',
3996     text => (sprintf 'U-%08X', $code),
3997     line => $l, column => $c);
3998     $code = 0xFFFD;
3999     }
4000    
4001     if ($self->{prev_state} == DATA_STATE) {
4002    
4003     $self->{state} = $self->{prev_state};
4004     ## Reconsume.
4005     return ({type => CHARACTER_TOKEN, data => chr $code,
4006     line => $l, column => $c,
4007     });
4008     redo A;
4009     } else {
4010    
4011     $self->{ca}->{value} .= chr $code;
4012     $self->{ca}->{has_reference} = 1;
4013     $self->{state} = $self->{prev_state};
4014     ## Reconsume.
4015     redo A;
4016     }
4017     } elsif ($self->{state} == ENTITY_NAME_STATE) {
4018     if (length $self->{s_kwd} < 30 and
4019     ## NOTE: Some number greater than the maximum length of entity name
4020     ((0x0041 <= $self->{nc} and # A
4021     $self->{nc} <= 0x005A) or # Z
4022     (0x0061 <= $self->{nc} and # a
4023     $self->{nc} <= 0x007A) or # z
4024     (0x0030 <= $self->{nc} and # 0
4025     $self->{nc} <= 0x0039) or # 9
4026     $self->{nc} == 0x003B)) { # ;
4027     our $EntityChar;
4028     $self->{s_kwd} .= chr $self->{nc};
4029     if (defined $EntityChar->{$self->{s_kwd}}) {
4030     if ($self->{nc} == 0x003B) { # ;
4031    
4032     $self->{entity__value} = $EntityChar->{$self->{s_kwd}};
4033     $self->{entity__match} = 1;
4034    
4035     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4036     $self->{line_prev} = $self->{line};
4037     $self->{column_prev} = $self->{column};
4038     $self->{column}++;
4039     $self->{nc}
4040     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4041     } else {
4042     $self->{set_nc}->($self);
4043     }
4044    
4045     #
4046     } else {
4047    
4048     $self->{entity__value} = $EntityChar->{$self->{s_kwd}};
4049     $self->{entity__match} = -1;
4050     ## Stay in the state.
4051    
4052     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4053     $self->{line_prev} = $self->{line};
4054     $self->{column_prev} = $self->{column};
4055     $self->{column}++;
4056     $self->{nc}
4057     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4058     } else {
4059     $self->{set_nc}->($self);
4060     }
4061    
4062     redo A;
4063     }
4064     } else {
4065    
4066     $self->{entity__value} .= chr $self->{nc};
4067     $self->{entity__match} *= 2;
4068     ## Stay in the state.
4069    
4070     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4071     $self->{line_prev} = $self->{line};
4072     $self->{column_prev} = $self->{column};
4073     $self->{column}++;
4074     $self->{nc}
4075     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4076     } else {
4077     $self->{set_nc}->($self);
4078     }
4079    
4080     redo A;
4081     }
4082     }
4083    
4084     my $data;
4085     my $has_ref;
4086     if ($self->{entity__match} > 0) {
4087    
4088     $data = $self->{entity__value};
4089     $has_ref = 1;
4090     #
4091     } elsif ($self->{entity__match} < 0) {
4092     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no refc');
4093     if ($self->{prev_state} != DATA_STATE and # in attribute
4094     $self->{entity__match} < -1) {
4095    
4096     $data = '&' . $self->{s_kwd};
4097     #
4098     } else {
4099    
4100     $data = $self->{entity__value};
4101     $has_ref = 1;
4102     #
4103     }
4104     } else {
4105    
4106     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare ero',
4107     line => $self->{line_prev},
4108     column => $self->{column_prev} - length $self->{s_kwd});
4109     $data = '&' . $self->{s_kwd};
4110     #
4111     }
4112    
4113     ## NOTE: In these cases, when a character reference is found,
4114     ## it is consumed and a character token is returned, or, otherwise,
4115     ## nothing is consumed and returned, according to the spec algorithm.
4116     ## In this implementation, anything that has been examined by the
4117     ## tokenizer is appended to the parent element or the attribute value
4118     ## as string, either literal string when no character reference or
4119     ## entity-replaced string otherwise, in this stage, since any characters
4120     ## that would not be consumed are appended in the data state or in an
4121     ## appropriate attribute value state anyway.
4122    
4123     if ($self->{prev_state} == DATA_STATE) {
4124    
4125     $self->{state} = $self->{prev_state};
4126     ## Reconsume.
4127     return ({type => CHARACTER_TOKEN,
4128     data => $data,
4129     line => $self->{line_prev},
4130     column => $self->{column_prev} + 1 - length $self->{s_kwd},
4131     });
4132     redo A;
4133     } else {
4134    
4135     $self->{ca}->{value} .= $data;
4136     $self->{ca}->{has_reference} = 1 if $has_ref;
4137     $self->{state} = $self->{prev_state};
4138     ## Reconsume.
4139     redo A;
4140     }
4141     } else {
4142     die "$0: $self->{state}: Unknown state";
4143     }
4144     } # A
4145    
4146     die "$0: _get_next_token: unexpected case";
4147     } # _get_next_token
4148    
4149     1;
4150 wakaba 1.2 ## $Date: 2008/10/14 02:27:58 $

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24