/[suikacvs]/markup/html/whatpm/Whatpm/HTML/Tokenizer.pm
Suika

Contents of /markup/html/whatpm/Whatpm/HTML/Tokenizer.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.4 - (hide annotations) (download)
Tue Oct 14 11:46:57 2008 UTC (16 years, 9 months ago) by wakaba
Branch: MAIN
Changes since 1.3: +10 -8 lines
++ whatpm/t/ChangeLog	14 Oct 2008 11:46:38 -0000
	* XML-Parser.t: "xml/elements-1.dat" and "xml/doctypes-1.dat"
	added.

2008-10-14  Wakaba  <wakaba@suika.fam.cx>

++ whatpm/t/xml/ChangeLog	14 Oct 2008 11:46:52 -0000
	* elements-1.dat: New test data file.

	* doctypes-1.dat: New test data file.

	* attrs-1.dat: New test data on attribute name cases are added.

2008-10-14  Wakaba  <wakaba@suika.fam.cx>

++ whatpm/Whatpm/HTML/ChangeLog	14 Oct 2008 11:40:58 -0000
	* Tokenizer.pm.src: Support for case-insensitive XML attribute
	names.

2008-10-14  Wakaba  <wakaba@suika.fam.cx>

1 wakaba 1.1 package Whatpm::HTML::Tokenizer;
2     use strict;
3 wakaba 1.4 our $VERSION=do{my @r=(q$Revision: 1.3 $=~/\d+/g);sprintf "%d."."%02d" x $#r,@r};
4 wakaba 1.2
5     BEGIN {
6     require Exporter;
7     push our @ISA, 'Exporter';
8    
9     our @EXPORT_OK = qw(
10     DOCTYPE_TOKEN
11     COMMENT_TOKEN
12     START_TAG_TOKEN
13     END_TAG_TOKEN
14     END_OF_FILE_TOKEN
15     CHARACTER_TOKEN
16     PI_TOKEN
17     ABORT_TOKEN
18     );
19    
20     our %EXPORT_TAGS = (
21     token => [qw(
22     DOCTYPE_TOKEN
23     COMMENT_TOKEN
24     START_TAG_TOKEN
25     END_TAG_TOKEN
26     END_OF_FILE_TOKEN
27     CHARACTER_TOKEN
28     PI_TOKEN
29     ABORT_TOKEN
30     )],
31     );
32     }
33    
34     ## Token types
35    
36     sub DOCTYPE_TOKEN () { 1 }
37     sub COMMENT_TOKEN () { 2 }
38     sub START_TAG_TOKEN () { 3 }
39     sub END_TAG_TOKEN () { 4 }
40     sub END_OF_FILE_TOKEN () { 5 }
41     sub CHARACTER_TOKEN () { 6 }
42     sub PI_TOKEN () { 7 } # XML5
43     sub ABORT_TOKEN () { 8 } # Not a token actually
44 wakaba 1.1
45     package Whatpm::HTML;
46    
47 wakaba 1.2 BEGIN { Whatpm::HTML::Tokenizer->import (':token') }
48    
49 wakaba 1.1 ## Content model flags
50    
51     sub CM_ENTITY () { 0b001 } # & markup in data
52     sub CM_LIMITED_MARKUP () { 0b010 } # < markup in data (limited)
53     sub CM_FULL_MARKUP () { 0b100 } # < markup in data (any)
54    
55     sub PLAINTEXT_CONTENT_MODEL () { 0 }
56     sub CDATA_CONTENT_MODEL () { CM_LIMITED_MARKUP }
57     sub RCDATA_CONTENT_MODEL () { CM_ENTITY | CM_LIMITED_MARKUP }
58     sub PCDATA_CONTENT_MODEL () { CM_ENTITY | CM_FULL_MARKUP }
59    
60     ## Tokenizer states
61    
62     sub DATA_STATE () { 0 }
63     #sub ENTITY_DATA_STATE () { 1 }
64     sub TAG_OPEN_STATE () { 2 }
65     sub CLOSE_TAG_OPEN_STATE () { 3 }
66     sub TAG_NAME_STATE () { 4 }
67     sub BEFORE_ATTRIBUTE_NAME_STATE () { 5 }
68     sub ATTRIBUTE_NAME_STATE () { 6 }
69     sub AFTER_ATTRIBUTE_NAME_STATE () { 7 }
70     sub BEFORE_ATTRIBUTE_VALUE_STATE () { 8 }
71     sub ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE () { 9 }
72     sub ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE () { 10 }
73     sub ATTRIBUTE_VALUE_UNQUOTED_STATE () { 11 }
74     #sub ENTITY_IN_ATTRIBUTE_VALUE_STATE () { 12 }
75     sub MARKUP_DECLARATION_OPEN_STATE () { 13 }
76     sub COMMENT_START_STATE () { 14 }
77     sub COMMENT_START_DASH_STATE () { 15 }
78     sub COMMENT_STATE () { 16 }
79     sub COMMENT_END_STATE () { 17 }
80     sub COMMENT_END_DASH_STATE () { 18 }
81     sub BOGUS_COMMENT_STATE () { 19 }
82     sub DOCTYPE_STATE () { 20 }
83     sub BEFORE_DOCTYPE_NAME_STATE () { 21 }
84     sub DOCTYPE_NAME_STATE () { 22 }
85     sub AFTER_DOCTYPE_NAME_STATE () { 23 }
86     sub BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE () { 24 }
87     sub DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE () { 25 }
88     sub DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE () { 26 }
89     sub AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE () { 27 }
90     sub BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE () { 28 }
91     sub DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE () { 29 }
92     sub DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE () { 30 }
93     sub AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE () { 31 }
94     sub BOGUS_DOCTYPE_STATE () { 32 }
95     sub AFTER_ATTRIBUTE_VALUE_QUOTED_STATE () { 33 }
96     sub SELF_CLOSING_START_TAG_STATE () { 34 }
97     sub CDATA_SECTION_STATE () { 35 }
98     sub MD_HYPHEN_STATE () { 36 } # "markup declaration open state" in the spec
99     sub MD_DOCTYPE_STATE () { 37 } # "markup declaration open state" in the spec
100     sub MD_CDATA_STATE () { 38 } # "markup declaration open state" in the spec
101     sub CDATA_RCDATA_CLOSE_TAG_STATE () { 39 } # "close tag open state" in the spec
102     sub CDATA_SECTION_MSE1_STATE () { 40 } # "CDATA section state" in the spec
103     sub CDATA_SECTION_MSE2_STATE () { 41 } # "CDATA section state" in the spec
104     sub PUBLIC_STATE () { 42 } # "after DOCTYPE name state" in the spec
105     sub SYSTEM_STATE () { 43 } # "after DOCTYPE name state" in the spec
106     ## NOTE: "Entity data state", "entity in attribute value state", and
107     ## "consume a character reference" algorithm are jointly implemented
108     ## using the following six states:
109     sub ENTITY_STATE () { 44 }
110     sub ENTITY_HASH_STATE () { 45 }
111     sub NCR_NUM_STATE () { 46 }
112     sub HEXREF_X_STATE () { 47 }
113     sub HEXREF_HEX_STATE () { 48 }
114     sub ENTITY_NAME_STATE () { 49 }
115     sub PCDATA_STATE () { 50 } # "data state" in the spec
116    
117     ## Tree constructor state constants (see Whatpm::HTML for the full
118     ## list and descriptions)
119    
120     sub IN_FOREIGN_CONTENT_IM () { 0b100000000000 }
121     sub FOREIGN_EL () { 0b1_00000000000 }
122    
123     ## Character reference mappings
124    
125     my $charref_map = {
126     0x0D => 0x000A,
127     0x80 => 0x20AC,
128     0x81 => 0xFFFD,
129     0x82 => 0x201A,
130     0x83 => 0x0192,
131     0x84 => 0x201E,
132     0x85 => 0x2026,
133     0x86 => 0x2020,
134     0x87 => 0x2021,
135     0x88 => 0x02C6,
136     0x89 => 0x2030,
137     0x8A => 0x0160,
138     0x8B => 0x2039,
139     0x8C => 0x0152,
140     0x8D => 0xFFFD,
141     0x8E => 0x017D,
142     0x8F => 0xFFFD,
143     0x90 => 0xFFFD,
144     0x91 => 0x2018,
145     0x92 => 0x2019,
146     0x93 => 0x201C,
147     0x94 => 0x201D,
148     0x95 => 0x2022,
149     0x96 => 0x2013,
150     0x97 => 0x2014,
151     0x98 => 0x02DC,
152     0x99 => 0x2122,
153     0x9A => 0x0161,
154     0x9B => 0x203A,
155     0x9C => 0x0153,
156     0x9D => 0xFFFD,
157     0x9E => 0x017E,
158     0x9F => 0x0178,
159     }; # $charref_map
160     $charref_map->{$_} = 0xFFFD
161     for 0x0000..0x0008, 0x000B, 0x000E..0x001F, 0x007F,
162     0xD800..0xDFFF, 0xFDD0..0xFDDF, ## ISSUE: 0xFDEF
163     0xFFFE, 0xFFFF, 0x1FFFE, 0x1FFFF, 0x2FFFE, 0x2FFFF, 0x3FFFE, 0x3FFFF,
164     0x4FFFE, 0x4FFFF, 0x5FFFE, 0x5FFFF, 0x6FFFE, 0x6FFFF, 0x7FFFE,
165     0x7FFFF, 0x8FFFE, 0x8FFFF, 0x9FFFE, 0x9FFFF, 0xAFFFE, 0xAFFFF,
166     0xBFFFE, 0xBFFFF, 0xCFFFE, 0xCFFFF, 0xDFFFE, 0xDFFFF, 0xEFFFE,
167     0xEFFFF, 0xFFFFE, 0xFFFFF, 0x10FFFE, 0x10FFFF;
168    
169     ## Implementations MUST act as if state machine in the spec
170    
171     sub _initialize_tokenizer ($) {
172     my $self = shift;
173    
174     ## NOTE: Fields set by |new| constructor:
175     #$self->{level}
176     #$self->{set_nc}
177     #$self->{parse_error}
178 wakaba 1.3 #$self->{is_xml} (if XML)
179 wakaba 1.1
180     $self->{state} = DATA_STATE; # MUST
181     #$self->{s_kwd}; # state keyword - initialized when used
182     #$self->{entity__value}; # initialized when used
183     #$self->{entity__match}; # initialized when used
184     $self->{content_model} = PCDATA_CONTENT_MODEL; # be
185     undef $self->{ct}; # current token
186     undef $self->{ca}; # current attribute
187     undef $self->{last_stag_name}; # last emitted start tag name
188     #$self->{prev_state}; # initialized when used
189     delete $self->{self_closing};
190     $self->{char_buffer} = '';
191     $self->{char_buffer_pos} = 0;
192     $self->{nc} = -1; # next input character
193     #$self->{next_nc}
194    
195     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
196     $self->{line_prev} = $self->{line};
197     $self->{column_prev} = $self->{column};
198     $self->{column}++;
199     $self->{nc}
200     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
201     } else {
202     $self->{set_nc}->($self);
203     }
204    
205     $self->{token} = [];
206     # $self->{escape}
207     } # _initialize_tokenizer
208    
209     ## A token has:
210     ## ->{type} == DOCTYPE_TOKEN, START_TAG_TOKEN, END_TAG_TOKEN, COMMENT_TOKEN,
211     ## CHARACTER_TOKEN, or END_OF_FILE_TOKEN
212     ## ->{name} (DOCTYPE_TOKEN)
213     ## ->{tag_name} (START_TAG_TOKEN, END_TAG_TOKEN)
214     ## ->{pubid} (DOCTYPE_TOKEN)
215     ## ->{sysid} (DOCTYPE_TOKEN)
216     ## ->{quirks} == 1 or 0 (DOCTYPE_TOKEN): "force-quirks" flag
217     ## ->{attributes} isa HASH (START_TAG_TOKEN, END_TAG_TOKEN)
218     ## ->{name}
219     ## ->{value}
220     ## ->{has_reference} == 1 or 0
221     ## ->{data} (COMMENT_TOKEN, CHARACTER_TOKEN)
222     ## NOTE: The "self-closing flag" is hold as |$self->{self_closing}|.
223     ## |->{self_closing}| is used to save the value of |$self->{self_closing}|
224     ## while the token is pushed back to the stack.
225    
226     ## Emitted token MUST immediately be handled by the tree construction state.
227    
228     ## Before each step, UA MAY check to see if either one of the scripts in
229     ## "list of scripts that will execute as soon as possible" or the first
230     ## script in the "list of scripts that will execute asynchronously",
231     ## has completed loading. If one has, then it MUST be executed
232     ## and removed from the list.
233    
234     ## TODO: Polytheistic slash SHOULD NOT be used. (Applied only to atheists.)
235     ## (This requirement was dropped from HTML5 spec, unfortunately.)
236    
237     my $is_space = {
238     0x0009 => 1, # CHARACTER TABULATION (HT)
239     0x000A => 1, # LINE FEED (LF)
240     #0x000B => 0, # LINE TABULATION (VT)
241     0x000C => 1, # FORM FEED (FF)
242     #0x000D => 1, # CARRIAGE RETURN (CR)
243     0x0020 => 1, # SPACE (SP)
244     };
245    
246     sub _get_next_token ($) {
247     my $self = shift;
248    
249     if ($self->{self_closing}) {
250     $self->{parse_error}->(level => $self->{level}->{must}, type => 'nestc', token => $self->{ct});
251     ## NOTE: The |self_closing| flag is only set by start tag token.
252     ## In addition, when a start tag token is emitted, it is always set to
253     ## |ct|.
254     delete $self->{self_closing};
255     }
256    
257     if (@{$self->{token}}) {
258     $self->{self_closing} = $self->{token}->[0]->{self_closing};
259     return shift @{$self->{token}};
260     }
261    
262     A: {
263     if ($self->{state} == PCDATA_STATE) {
264     ## NOTE: Same as |DATA_STATE|, but only for |PCDATA| content model.
265    
266     if ($self->{nc} == 0x0026) { # &
267    
268     ## NOTE: In the spec, the tokenizer is switched to the
269     ## "entity data state". In this implementation, the tokenizer
270     ## is switched to the |ENTITY_STATE|, which is an implementation
271     ## of the "consume a character reference" algorithm.
272     $self->{entity_add} = -1;
273     $self->{prev_state} = DATA_STATE;
274     $self->{state} = ENTITY_STATE;
275    
276     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
277     $self->{line_prev} = $self->{line};
278     $self->{column_prev} = $self->{column};
279     $self->{column}++;
280     $self->{nc}
281     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
282     } else {
283     $self->{set_nc}->($self);
284     }
285    
286     redo A;
287     } elsif ($self->{nc} == 0x003C) { # <
288    
289     $self->{state} = TAG_OPEN_STATE;
290    
291     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
292     $self->{line_prev} = $self->{line};
293     $self->{column_prev} = $self->{column};
294     $self->{column}++;
295     $self->{nc}
296     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
297     } else {
298     $self->{set_nc}->($self);
299     }
300    
301     redo A;
302     } elsif ($self->{nc} == -1) {
303    
304     return ({type => END_OF_FILE_TOKEN,
305     line => $self->{line}, column => $self->{column}});
306     last A; ## TODO: ok?
307     } else {
308    
309     #
310     }
311    
312     # Anything else
313     my $token = {type => CHARACTER_TOKEN,
314     data => chr $self->{nc},
315     line => $self->{line}, column => $self->{column},
316     };
317     $self->{read_until}->($token->{data}, q[<&], length $token->{data});
318    
319     ## Stay in the state.
320    
321     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
322     $self->{line_prev} = $self->{line};
323     $self->{column_prev} = $self->{column};
324     $self->{column}++;
325     $self->{nc}
326     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
327     } else {
328     $self->{set_nc}->($self);
329     }
330    
331     return ($token);
332     redo A;
333     } elsif ($self->{state} == DATA_STATE) {
334     $self->{s_kwd} = '' unless defined $self->{s_kwd};
335     if ($self->{nc} == 0x0026) { # &
336     $self->{s_kwd} = '';
337     if ($self->{content_model} & CM_ENTITY and # PCDATA | RCDATA
338     not $self->{escape}) {
339    
340     ## NOTE: In the spec, the tokenizer is switched to the
341     ## "entity data state". In this implementation, the tokenizer
342     ## is switched to the |ENTITY_STATE|, which is an implementation
343     ## of the "consume a character reference" algorithm.
344     $self->{entity_add} = -1;
345     $self->{prev_state} = DATA_STATE;
346     $self->{state} = ENTITY_STATE;
347    
348     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
349     $self->{line_prev} = $self->{line};
350     $self->{column_prev} = $self->{column};
351     $self->{column}++;
352     $self->{nc}
353     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
354     } else {
355     $self->{set_nc}->($self);
356     }
357    
358     redo A;
359     } else {
360    
361     #
362     }
363     } elsif ($self->{nc} == 0x002D) { # -
364     if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
365     $self->{s_kwd} .= '-';
366    
367     if ($self->{s_kwd} eq '<!--') {
368    
369     $self->{escape} = 1; # unless $self->{escape};
370     $self->{s_kwd} = '--';
371     #
372     } elsif ($self->{s_kwd} eq '---') {
373    
374     $self->{s_kwd} = '--';
375     #
376     } else {
377    
378     #
379     }
380     }
381    
382     #
383     } elsif ($self->{nc} == 0x0021) { # !
384     if (length $self->{s_kwd}) {
385    
386     $self->{s_kwd} .= '!';
387     #
388     } else {
389    
390     #$self->{s_kwd} = '';
391     #
392     }
393     #
394     } elsif ($self->{nc} == 0x003C) { # <
395     if ($self->{content_model} & CM_FULL_MARKUP or # PCDATA
396     (($self->{content_model} & CM_LIMITED_MARKUP) and # CDATA | RCDATA
397     not $self->{escape})) {
398    
399     $self->{state} = TAG_OPEN_STATE;
400    
401     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
402     $self->{line_prev} = $self->{line};
403     $self->{column_prev} = $self->{column};
404     $self->{column}++;
405     $self->{nc}
406     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
407     } else {
408     $self->{set_nc}->($self);
409     }
410    
411     redo A;
412     } else {
413    
414     $self->{s_kwd} = '';
415     #
416     }
417     } elsif ($self->{nc} == 0x003E) { # >
418     if ($self->{escape} and
419     ($self->{content_model} & CM_LIMITED_MARKUP)) { # RCDATA | CDATA
420     if ($self->{s_kwd} eq '--') {
421    
422     delete $self->{escape};
423     } else {
424    
425     }
426     } else {
427    
428     }
429    
430     $self->{s_kwd} = '';
431     #
432     } elsif ($self->{nc} == -1) {
433    
434     $self->{s_kwd} = '';
435     return ({type => END_OF_FILE_TOKEN,
436     line => $self->{line}, column => $self->{column}});
437     last A; ## TODO: ok?
438     } else {
439    
440     $self->{s_kwd} = '';
441     #
442     }
443    
444     # Anything else
445     my $token = {type => CHARACTER_TOKEN,
446     data => chr $self->{nc},
447     line => $self->{line}, column => $self->{column},
448     };
449     if ($self->{read_until}->($token->{data}, q[-!<>&],
450     length $token->{data})) {
451     $self->{s_kwd} = '';
452     }
453    
454     ## Stay in the data state.
455     if ($self->{content_model} == PCDATA_CONTENT_MODEL) {
456    
457     $self->{state} = PCDATA_STATE;
458     } else {
459    
460     ## Stay in the state.
461     }
462    
463     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
464     $self->{line_prev} = $self->{line};
465     $self->{column_prev} = $self->{column};
466     $self->{column}++;
467     $self->{nc}
468     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
469     } else {
470     $self->{set_nc}->($self);
471     }
472    
473     return ($token);
474     redo A;
475     } elsif ($self->{state} == TAG_OPEN_STATE) {
476     if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
477     if ($self->{nc} == 0x002F) { # /
478    
479    
480     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
481     $self->{line_prev} = $self->{line};
482     $self->{column_prev} = $self->{column};
483     $self->{column}++;
484     $self->{nc}
485     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
486     } else {
487     $self->{set_nc}->($self);
488     }
489    
490     $self->{state} = CLOSE_TAG_OPEN_STATE;
491     redo A;
492     } elsif ($self->{nc} == 0x0021) { # !
493    
494     $self->{s_kwd} = '<' unless $self->{escape};
495     #
496     } else {
497    
498     #
499     }
500    
501     ## reconsume
502     $self->{state} = DATA_STATE;
503     return ({type => CHARACTER_TOKEN, data => '<',
504     line => $self->{line_prev},
505     column => $self->{column_prev},
506     });
507     redo A;
508     } elsif ($self->{content_model} & CM_FULL_MARKUP) { # PCDATA
509     if ($self->{nc} == 0x0021) { # !
510    
511     $self->{state} = MARKUP_DECLARATION_OPEN_STATE;
512    
513     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
514     $self->{line_prev} = $self->{line};
515     $self->{column_prev} = $self->{column};
516     $self->{column}++;
517     $self->{nc}
518     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
519     } else {
520     $self->{set_nc}->($self);
521     }
522    
523     redo A;
524     } elsif ($self->{nc} == 0x002F) { # /
525    
526     $self->{state} = CLOSE_TAG_OPEN_STATE;
527    
528     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
529     $self->{line_prev} = $self->{line};
530     $self->{column_prev} = $self->{column};
531     $self->{column}++;
532     $self->{nc}
533     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
534     } else {
535     $self->{set_nc}->($self);
536     }
537    
538     redo A;
539     } elsif (0x0041 <= $self->{nc} and
540     $self->{nc} <= 0x005A) { # A..Z
541    
542     $self->{ct}
543     = {type => START_TAG_TOKEN,
544 wakaba 1.4 tag_name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
545 wakaba 1.1 line => $self->{line_prev},
546     column => $self->{column_prev}};
547     $self->{state} = TAG_NAME_STATE;
548    
549     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
550     $self->{line_prev} = $self->{line};
551     $self->{column_prev} = $self->{column};
552     $self->{column}++;
553     $self->{nc}
554     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
555     } else {
556     $self->{set_nc}->($self);
557     }
558    
559     redo A;
560     } elsif (0x0061 <= $self->{nc} and
561     $self->{nc} <= 0x007A) { # a..z
562    
563     $self->{ct} = {type => START_TAG_TOKEN,
564     tag_name => chr ($self->{nc}),
565     line => $self->{line_prev},
566     column => $self->{column_prev}};
567     $self->{state} = TAG_NAME_STATE;
568    
569     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
570     $self->{line_prev} = $self->{line};
571     $self->{column_prev} = $self->{column};
572     $self->{column}++;
573     $self->{nc}
574     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
575     } else {
576     $self->{set_nc}->($self);
577     }
578    
579     redo A;
580     } elsif ($self->{nc} == 0x003E) { # >
581    
582     $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty start tag',
583     line => $self->{line_prev},
584     column => $self->{column_prev});
585     $self->{state} = DATA_STATE;
586    
587     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
588     $self->{line_prev} = $self->{line};
589     $self->{column_prev} = $self->{column};
590     $self->{column}++;
591     $self->{nc}
592     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
593     } else {
594     $self->{set_nc}->($self);
595     }
596    
597    
598     return ({type => CHARACTER_TOKEN, data => '<>',
599     line => $self->{line_prev},
600     column => $self->{column_prev},
601     });
602    
603     redo A;
604     } elsif ($self->{nc} == 0x003F) { # ?
605    
606     $self->{parse_error}->(level => $self->{level}->{must}, type => 'pio',
607     line => $self->{line_prev},
608     column => $self->{column_prev});
609     $self->{state} = BOGUS_COMMENT_STATE;
610     $self->{ct} = {type => COMMENT_TOKEN, data => '',
611     line => $self->{line_prev},
612     column => $self->{column_prev},
613     };
614     ## $self->{nc} is intentionally left as is
615     redo A;
616     } else {
617    
618     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare stago',
619     line => $self->{line_prev},
620     column => $self->{column_prev});
621     $self->{state} = DATA_STATE;
622     ## reconsume
623    
624     return ({type => CHARACTER_TOKEN, data => '<',
625     line => $self->{line_prev},
626     column => $self->{column_prev},
627     });
628    
629     redo A;
630     }
631     } else {
632     die "$0: $self->{content_model} in tag open";
633     }
634     } elsif ($self->{state} == CLOSE_TAG_OPEN_STATE) {
635     ## NOTE: The "close tag open state" in the spec is implemented as
636     ## |CLOSE_TAG_OPEN_STATE| and |CDATA_RCDATA_CLOSE_TAG_STATE|.
637    
638     my ($l, $c) = ($self->{line_prev}, $self->{column_prev} - 1); # "<"of"</"
639     if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
640     if (defined $self->{last_stag_name}) {
641     $self->{state} = CDATA_RCDATA_CLOSE_TAG_STATE;
642     $self->{s_kwd} = '';
643     ## Reconsume.
644     redo A;
645     } else {
646     ## No start tag token has ever been emitted
647     ## NOTE: See <http://krijnhoetmer.nl/irc-logs/whatwg/20070626#l-564>.
648    
649     $self->{state} = DATA_STATE;
650     ## Reconsume.
651     return ({type => CHARACTER_TOKEN, data => '</',
652     line => $l, column => $c,
653     });
654     redo A;
655     }
656     }
657    
658     if (0x0041 <= $self->{nc} and
659     $self->{nc} <= 0x005A) { # A..Z
660    
661     $self->{ct}
662     = {type => END_TAG_TOKEN,
663 wakaba 1.4 tag_name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
664 wakaba 1.1 line => $l, column => $c};
665     $self->{state} = TAG_NAME_STATE;
666    
667     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
668     $self->{line_prev} = $self->{line};
669     $self->{column_prev} = $self->{column};
670     $self->{column}++;
671     $self->{nc}
672     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
673     } else {
674     $self->{set_nc}->($self);
675     }
676    
677     redo A;
678     } elsif (0x0061 <= $self->{nc} and
679     $self->{nc} <= 0x007A) { # a..z
680    
681     $self->{ct} = {type => END_TAG_TOKEN,
682     tag_name => chr ($self->{nc}),
683     line => $l, column => $c};
684     $self->{state} = TAG_NAME_STATE;
685    
686     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
687     $self->{line_prev} = $self->{line};
688     $self->{column_prev} = $self->{column};
689     $self->{column}++;
690     $self->{nc}
691     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
692     } else {
693     $self->{set_nc}->($self);
694     }
695    
696     redo A;
697     } elsif ($self->{nc} == 0x003E) { # >
698    
699     $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty end tag',
700     line => $self->{line_prev}, ## "<" in "</>"
701     column => $self->{column_prev} - 1);
702     $self->{state} = DATA_STATE;
703    
704     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
705     $self->{line_prev} = $self->{line};
706     $self->{column_prev} = $self->{column};
707     $self->{column}++;
708     $self->{nc}
709     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
710     } else {
711     $self->{set_nc}->($self);
712     }
713    
714     redo A;
715     } elsif ($self->{nc} == -1) {
716    
717     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare etago');
718     $self->{state} = DATA_STATE;
719     # reconsume
720    
721     return ({type => CHARACTER_TOKEN, data => '</',
722     line => $l, column => $c,
723     });
724    
725     redo A;
726     } else {
727    
728     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus end tag');
729     $self->{state} = BOGUS_COMMENT_STATE;
730     $self->{ct} = {type => COMMENT_TOKEN, data => '',
731     line => $self->{line_prev}, # "<" of "</"
732     column => $self->{column_prev} - 1,
733     };
734     ## NOTE: $self->{nc} is intentionally left as is.
735     ## Although the "anything else" case of the spec not explicitly
736     ## states that the next input character is to be reconsumed,
737     ## it will be included to the |data| of the comment token
738     ## generated from the bogus end tag, as defined in the
739     ## "bogus comment state" entry.
740     redo A;
741     }
742     } elsif ($self->{state} == CDATA_RCDATA_CLOSE_TAG_STATE) {
743     my $ch = substr $self->{last_stag_name}, length $self->{s_kwd}, 1;
744     if (length $ch) {
745     my $CH = $ch;
746     $ch =~ tr/a-z/A-Z/;
747     my $nch = chr $self->{nc};
748     if ($nch eq $ch or $nch eq $CH) {
749    
750     ## Stay in the state.
751     $self->{s_kwd} .= $nch;
752    
753     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
754     $self->{line_prev} = $self->{line};
755     $self->{column_prev} = $self->{column};
756     $self->{column}++;
757     $self->{nc}
758     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
759     } else {
760     $self->{set_nc}->($self);
761     }
762    
763     redo A;
764     } else {
765    
766     $self->{state} = DATA_STATE;
767     ## Reconsume.
768     return ({type => CHARACTER_TOKEN,
769     data => '</' . $self->{s_kwd},
770     line => $self->{line_prev},
771     column => $self->{column_prev} - 1 - length $self->{s_kwd},
772     });
773     redo A;
774     }
775     } else { # after "<{tag-name}"
776     unless ($is_space->{$self->{nc}} or
777     {
778     0x003E => 1, # >
779     0x002F => 1, # /
780     -1 => 1, # EOF
781     }->{$self->{nc}}) {
782    
783     ## Reconsume.
784     $self->{state} = DATA_STATE;
785     return ({type => CHARACTER_TOKEN,
786     data => '</' . $self->{s_kwd},
787     line => $self->{line_prev},
788     column => $self->{column_prev} - 1 - length $self->{s_kwd},
789     });
790     redo A;
791     } else {
792    
793     $self->{ct}
794     = {type => END_TAG_TOKEN,
795     tag_name => $self->{last_stag_name},
796     line => $self->{line_prev},
797     column => $self->{column_prev} - 1 - length $self->{s_kwd}};
798     $self->{state} = TAG_NAME_STATE;
799     ## Reconsume.
800     redo A;
801     }
802     }
803     } elsif ($self->{state} == TAG_NAME_STATE) {
804     if ($is_space->{$self->{nc}}) {
805    
806     $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
807    
808     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
809     $self->{line_prev} = $self->{line};
810     $self->{column_prev} = $self->{column};
811     $self->{column}++;
812     $self->{nc}
813     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
814     } else {
815     $self->{set_nc}->($self);
816     }
817    
818     redo A;
819     } elsif ($self->{nc} == 0x003E) { # >
820     if ($self->{ct}->{type} == START_TAG_TOKEN) {
821    
822     $self->{last_stag_name} = $self->{ct}->{tag_name};
823     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
824     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
825     #if ($self->{ct}->{attributes}) {
826     # ## NOTE: This should never be reached.
827     # !!! cp (36);
828     # !!! parse-error (type => 'end tag attribute');
829     #} else {
830    
831     #}
832     } else {
833     die "$0: $self->{ct}->{type}: Unknown token type";
834     }
835     $self->{state} = DATA_STATE;
836    
837     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
838     $self->{line_prev} = $self->{line};
839     $self->{column_prev} = $self->{column};
840     $self->{column}++;
841     $self->{nc}
842     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
843     } else {
844     $self->{set_nc}->($self);
845     }
846    
847    
848     return ($self->{ct}); # start tag or end tag
849    
850     redo A;
851     } elsif (0x0041 <= $self->{nc} and
852     $self->{nc} <= 0x005A) { # A..Z
853    
854 wakaba 1.4 $self->{ct}->{tag_name}
855     .= chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020));
856 wakaba 1.1 # start tag or end tag
857     ## Stay in this state
858    
859     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
860     $self->{line_prev} = $self->{line};
861     $self->{column_prev} = $self->{column};
862     $self->{column}++;
863     $self->{nc}
864     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
865     } else {
866     $self->{set_nc}->($self);
867     }
868    
869     redo A;
870     } elsif ($self->{nc} == -1) {
871     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
872     if ($self->{ct}->{type} == START_TAG_TOKEN) {
873    
874     $self->{last_stag_name} = $self->{ct}->{tag_name};
875     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
876     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
877     #if ($self->{ct}->{attributes}) {
878     # ## NOTE: This state should never be reached.
879     # !!! cp (40);
880     # !!! parse-error (type => 'end tag attribute');
881     #} else {
882    
883     #}
884     } else {
885     die "$0: $self->{ct}->{type}: Unknown token type";
886     }
887     $self->{state} = DATA_STATE;
888     # reconsume
889    
890     return ($self->{ct}); # start tag or end tag
891    
892     redo A;
893     } elsif ($self->{nc} == 0x002F) { # /
894    
895     $self->{state} = SELF_CLOSING_START_TAG_STATE;
896    
897     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
898     $self->{line_prev} = $self->{line};
899     $self->{column_prev} = $self->{column};
900     $self->{column}++;
901     $self->{nc}
902     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
903     } else {
904     $self->{set_nc}->($self);
905     }
906    
907     redo A;
908     } else {
909    
910     $self->{ct}->{tag_name} .= chr $self->{nc};
911     # start tag or end tag
912     ## Stay in the state
913    
914     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
915     $self->{line_prev} = $self->{line};
916     $self->{column_prev} = $self->{column};
917     $self->{column}++;
918     $self->{nc}
919     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
920     } else {
921     $self->{set_nc}->($self);
922     }
923    
924     redo A;
925     }
926     } elsif ($self->{state} == BEFORE_ATTRIBUTE_NAME_STATE) {
927     if ($is_space->{$self->{nc}}) {
928    
929     ## Stay in the state
930    
931     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
932     $self->{line_prev} = $self->{line};
933     $self->{column_prev} = $self->{column};
934     $self->{column}++;
935     $self->{nc}
936     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
937     } else {
938     $self->{set_nc}->($self);
939     }
940    
941     redo A;
942     } elsif ($self->{nc} == 0x003E) { # >
943     if ($self->{ct}->{type} == START_TAG_TOKEN) {
944    
945     $self->{last_stag_name} = $self->{ct}->{tag_name};
946     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
947     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
948     if ($self->{ct}->{attributes}) {
949    
950     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
951     } else {
952    
953     }
954     } else {
955     die "$0: $self->{ct}->{type}: Unknown token type";
956     }
957     $self->{state} = DATA_STATE;
958    
959     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
960     $self->{line_prev} = $self->{line};
961     $self->{column_prev} = $self->{column};
962     $self->{column}++;
963     $self->{nc}
964     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
965     } else {
966     $self->{set_nc}->($self);
967     }
968    
969    
970     return ($self->{ct}); # start tag or end tag
971    
972     redo A;
973     } elsif (0x0041 <= $self->{nc} and
974     $self->{nc} <= 0x005A) { # A..Z
975    
976     $self->{ca}
977 wakaba 1.4 = {name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
978 wakaba 1.1 value => '',
979     line => $self->{line}, column => $self->{column}};
980     $self->{state} = ATTRIBUTE_NAME_STATE;
981    
982     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
983     $self->{line_prev} = $self->{line};
984     $self->{column_prev} = $self->{column};
985     $self->{column}++;
986     $self->{nc}
987     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
988     } else {
989     $self->{set_nc}->($self);
990     }
991    
992     redo A;
993     } elsif ($self->{nc} == 0x002F) { # /
994    
995     $self->{state} = SELF_CLOSING_START_TAG_STATE;
996    
997     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
998     $self->{line_prev} = $self->{line};
999     $self->{column_prev} = $self->{column};
1000     $self->{column}++;
1001     $self->{nc}
1002     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1003     } else {
1004     $self->{set_nc}->($self);
1005     }
1006    
1007     redo A;
1008     } elsif ($self->{nc} == -1) {
1009     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1010     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1011    
1012     $self->{last_stag_name} = $self->{ct}->{tag_name};
1013     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1014     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1015     if ($self->{ct}->{attributes}) {
1016    
1017     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1018     } else {
1019    
1020     }
1021     } else {
1022     die "$0: $self->{ct}->{type}: Unknown token type";
1023     }
1024     $self->{state} = DATA_STATE;
1025     # reconsume
1026    
1027     return ($self->{ct}); # start tag or end tag
1028    
1029     redo A;
1030     } else {
1031     if ({
1032     0x0022 => 1, # "
1033     0x0027 => 1, # '
1034     0x003D => 1, # =
1035     }->{$self->{nc}}) {
1036    
1037     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name');
1038     } else {
1039    
1040     }
1041     $self->{ca}
1042     = {name => chr ($self->{nc}),
1043     value => '',
1044     line => $self->{line}, column => $self->{column}};
1045     $self->{state} = ATTRIBUTE_NAME_STATE;
1046    
1047     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1048     $self->{line_prev} = $self->{line};
1049     $self->{column_prev} = $self->{column};
1050     $self->{column}++;
1051     $self->{nc}
1052     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1053     } else {
1054     $self->{set_nc}->($self);
1055     }
1056    
1057     redo A;
1058     }
1059     } elsif ($self->{state} == ATTRIBUTE_NAME_STATE) {
1060     my $before_leave = sub {
1061     if (exists $self->{ct}->{attributes} # start tag or end tag
1062     ->{$self->{ca}->{name}}) { # MUST
1063    
1064     $self->{parse_error}->(level => $self->{level}->{must}, type => 'duplicate attribute', text => $self->{ca}->{name}, line => $self->{ca}->{line}, column => $self->{ca}->{column});
1065     ## Discard $self->{ca} # MUST
1066     } else {
1067    
1068     $self->{ct}->{attributes}->{$self->{ca}->{name}}
1069     = $self->{ca};
1070     }
1071     }; # $before_leave
1072    
1073     if ($is_space->{$self->{nc}}) {
1074    
1075     $before_leave->();
1076     $self->{state} = AFTER_ATTRIBUTE_NAME_STATE;
1077    
1078     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1079     $self->{line_prev} = $self->{line};
1080     $self->{column_prev} = $self->{column};
1081     $self->{column}++;
1082     $self->{nc}
1083     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1084     } else {
1085     $self->{set_nc}->($self);
1086     }
1087    
1088     redo A;
1089     } elsif ($self->{nc} == 0x003D) { # =
1090    
1091     $before_leave->();
1092     $self->{state} = BEFORE_ATTRIBUTE_VALUE_STATE;
1093    
1094     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1095     $self->{line_prev} = $self->{line};
1096     $self->{column_prev} = $self->{column};
1097     $self->{column}++;
1098     $self->{nc}
1099     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1100     } else {
1101     $self->{set_nc}->($self);
1102     }
1103    
1104     redo A;
1105     } elsif ($self->{nc} == 0x003E) { # >
1106     $before_leave->();
1107     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1108    
1109     $self->{last_stag_name} = $self->{ct}->{tag_name};
1110     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1111    
1112     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1113     if ($self->{ct}->{attributes}) {
1114     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1115     }
1116     } else {
1117     die "$0: $self->{ct}->{type}: Unknown token type";
1118     }
1119     $self->{state} = DATA_STATE;
1120    
1121     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1122     $self->{line_prev} = $self->{line};
1123     $self->{column_prev} = $self->{column};
1124     $self->{column}++;
1125     $self->{nc}
1126     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1127     } else {
1128     $self->{set_nc}->($self);
1129     }
1130    
1131    
1132     return ($self->{ct}); # start tag or end tag
1133    
1134     redo A;
1135     } elsif (0x0041 <= $self->{nc} and
1136     $self->{nc} <= 0x005A) { # A..Z
1137    
1138 wakaba 1.4 $self->{ca}->{name}
1139     .= chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020));
1140 wakaba 1.1 ## Stay in the state
1141    
1142     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1143     $self->{line_prev} = $self->{line};
1144     $self->{column_prev} = $self->{column};
1145     $self->{column}++;
1146     $self->{nc}
1147     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1148     } else {
1149     $self->{set_nc}->($self);
1150     }
1151    
1152     redo A;
1153     } elsif ($self->{nc} == 0x002F) { # /
1154    
1155     $before_leave->();
1156     $self->{state} = SELF_CLOSING_START_TAG_STATE;
1157    
1158     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1159     $self->{line_prev} = $self->{line};
1160     $self->{column_prev} = $self->{column};
1161     $self->{column}++;
1162     $self->{nc}
1163     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1164     } else {
1165     $self->{set_nc}->($self);
1166     }
1167    
1168     redo A;
1169     } elsif ($self->{nc} == -1) {
1170     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1171     $before_leave->();
1172     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1173    
1174     $self->{last_stag_name} = $self->{ct}->{tag_name};
1175     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1176     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1177     if ($self->{ct}->{attributes}) {
1178    
1179     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1180     } else {
1181     ## NOTE: This state should never be reached.
1182    
1183     }
1184     } else {
1185     die "$0: $self->{ct}->{type}: Unknown token type";
1186     }
1187     $self->{state} = DATA_STATE;
1188     # reconsume
1189    
1190     return ($self->{ct}); # start tag or end tag
1191    
1192     redo A;
1193     } else {
1194     if ($self->{nc} == 0x0022 or # "
1195     $self->{nc} == 0x0027) { # '
1196    
1197     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name');
1198     } else {
1199    
1200     }
1201     $self->{ca}->{name} .= chr ($self->{nc});
1202     ## Stay in the state
1203    
1204     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1205     $self->{line_prev} = $self->{line};
1206     $self->{column_prev} = $self->{column};
1207     $self->{column}++;
1208     $self->{nc}
1209     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1210     } else {
1211     $self->{set_nc}->($self);
1212     }
1213    
1214     redo A;
1215     }
1216     } elsif ($self->{state} == AFTER_ATTRIBUTE_NAME_STATE) {
1217     if ($is_space->{$self->{nc}}) {
1218    
1219     ## Stay in the state
1220    
1221     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1222     $self->{line_prev} = $self->{line};
1223     $self->{column_prev} = $self->{column};
1224     $self->{column}++;
1225     $self->{nc}
1226     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1227     } else {
1228     $self->{set_nc}->($self);
1229     }
1230    
1231     redo A;
1232     } elsif ($self->{nc} == 0x003D) { # =
1233    
1234     $self->{state} = BEFORE_ATTRIBUTE_VALUE_STATE;
1235    
1236     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1237     $self->{line_prev} = $self->{line};
1238     $self->{column_prev} = $self->{column};
1239     $self->{column}++;
1240     $self->{nc}
1241     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1242     } else {
1243     $self->{set_nc}->($self);
1244     }
1245    
1246     redo A;
1247     } elsif ($self->{nc} == 0x003E) { # >
1248     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1249    
1250     $self->{last_stag_name} = $self->{ct}->{tag_name};
1251     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1252     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1253     if ($self->{ct}->{attributes}) {
1254    
1255     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1256     } else {
1257     ## NOTE: This state should never be reached.
1258    
1259     }
1260     } else {
1261     die "$0: $self->{ct}->{type}: Unknown token type";
1262     }
1263     $self->{state} = DATA_STATE;
1264    
1265     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1266     $self->{line_prev} = $self->{line};
1267     $self->{column_prev} = $self->{column};
1268     $self->{column}++;
1269     $self->{nc}
1270     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1271     } else {
1272     $self->{set_nc}->($self);
1273     }
1274    
1275    
1276     return ($self->{ct}); # start tag or end tag
1277    
1278     redo A;
1279     } elsif (0x0041 <= $self->{nc} and
1280     $self->{nc} <= 0x005A) { # A..Z
1281    
1282     $self->{ca}
1283 wakaba 1.4 = {name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
1284 wakaba 1.1 value => '',
1285     line => $self->{line}, column => $self->{column}};
1286     $self->{state} = ATTRIBUTE_NAME_STATE;
1287    
1288     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1289     $self->{line_prev} = $self->{line};
1290     $self->{column_prev} = $self->{column};
1291     $self->{column}++;
1292     $self->{nc}
1293     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1294     } else {
1295     $self->{set_nc}->($self);
1296     }
1297    
1298     redo A;
1299     } elsif ($self->{nc} == 0x002F) { # /
1300    
1301     $self->{state} = SELF_CLOSING_START_TAG_STATE;
1302    
1303     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1304     $self->{line_prev} = $self->{line};
1305     $self->{column_prev} = $self->{column};
1306     $self->{column}++;
1307     $self->{nc}
1308     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1309     } else {
1310     $self->{set_nc}->($self);
1311     }
1312    
1313     redo A;
1314     } elsif ($self->{nc} == -1) {
1315     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1316     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1317    
1318     $self->{last_stag_name} = $self->{ct}->{tag_name};
1319     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1320     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1321     if ($self->{ct}->{attributes}) {
1322    
1323     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1324     } else {
1325     ## NOTE: This state should never be reached.
1326    
1327     }
1328     } else {
1329     die "$0: $self->{ct}->{type}: Unknown token type";
1330     }
1331     $self->{state} = DATA_STATE;
1332     # reconsume
1333    
1334     return ($self->{ct}); # start tag or end tag
1335    
1336     redo A;
1337     } else {
1338     if ($self->{nc} == 0x0022 or # "
1339     $self->{nc} == 0x0027) { # '
1340    
1341     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name');
1342     } else {
1343    
1344     }
1345     $self->{ca}
1346     = {name => chr ($self->{nc}),
1347     value => '',
1348     line => $self->{line}, column => $self->{column}};
1349     $self->{state} = ATTRIBUTE_NAME_STATE;
1350    
1351     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1352     $self->{line_prev} = $self->{line};
1353     $self->{column_prev} = $self->{column};
1354     $self->{column}++;
1355     $self->{nc}
1356     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1357     } else {
1358     $self->{set_nc}->($self);
1359     }
1360    
1361     redo A;
1362     }
1363     } elsif ($self->{state} == BEFORE_ATTRIBUTE_VALUE_STATE) {
1364     if ($is_space->{$self->{nc}}) {
1365    
1366     ## Stay in the state
1367    
1368     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1369     $self->{line_prev} = $self->{line};
1370     $self->{column_prev} = $self->{column};
1371     $self->{column}++;
1372     $self->{nc}
1373     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1374     } else {
1375     $self->{set_nc}->($self);
1376     }
1377    
1378     redo A;
1379     } elsif ($self->{nc} == 0x0022) { # "
1380    
1381     $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
1382    
1383     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1384     $self->{line_prev} = $self->{line};
1385     $self->{column_prev} = $self->{column};
1386     $self->{column}++;
1387     $self->{nc}
1388     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1389     } else {
1390     $self->{set_nc}->($self);
1391     }
1392    
1393     redo A;
1394     } elsif ($self->{nc} == 0x0026) { # &
1395    
1396     $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
1397     ## reconsume
1398     redo A;
1399     } elsif ($self->{nc} == 0x0027) { # '
1400    
1401     $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
1402    
1403     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1404     $self->{line_prev} = $self->{line};
1405     $self->{column_prev} = $self->{column};
1406     $self->{column}++;
1407     $self->{nc}
1408     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1409     } else {
1410     $self->{set_nc}->($self);
1411     }
1412    
1413     redo A;
1414     } elsif ($self->{nc} == 0x003E) { # >
1415     $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty unquoted attribute value');
1416     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1417    
1418     $self->{last_stag_name} = $self->{ct}->{tag_name};
1419     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1420     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1421     if ($self->{ct}->{attributes}) {
1422    
1423     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1424     } else {
1425     ## NOTE: This state should never be reached.
1426    
1427     }
1428     } else {
1429     die "$0: $self->{ct}->{type}: Unknown token type";
1430     }
1431     $self->{state} = DATA_STATE;
1432    
1433     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1434     $self->{line_prev} = $self->{line};
1435     $self->{column_prev} = $self->{column};
1436     $self->{column}++;
1437     $self->{nc}
1438     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1439     } else {
1440     $self->{set_nc}->($self);
1441     }
1442    
1443    
1444     return ($self->{ct}); # start tag or end tag
1445    
1446     redo A;
1447     } elsif ($self->{nc} == -1) {
1448     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1449     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1450    
1451     $self->{last_stag_name} = $self->{ct}->{tag_name};
1452     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1453     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1454     if ($self->{ct}->{attributes}) {
1455    
1456     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1457     } else {
1458     ## NOTE: This state should never be reached.
1459    
1460     }
1461     } else {
1462     die "$0: $self->{ct}->{type}: Unknown token type";
1463     }
1464     $self->{state} = DATA_STATE;
1465     ## reconsume
1466    
1467     return ($self->{ct}); # start tag or end tag
1468    
1469     redo A;
1470     } else {
1471     if ($self->{nc} == 0x003D) { # =
1472    
1473     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute value');
1474     } else {
1475    
1476     }
1477     $self->{ca}->{value} .= chr ($self->{nc});
1478     $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
1479    
1480     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1481     $self->{line_prev} = $self->{line};
1482     $self->{column_prev} = $self->{column};
1483     $self->{column}++;
1484     $self->{nc}
1485     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1486     } else {
1487     $self->{set_nc}->($self);
1488     }
1489    
1490     redo A;
1491     }
1492     } elsif ($self->{state} == ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE) {
1493     if ($self->{nc} == 0x0022) { # "
1494    
1495     $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1496    
1497     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1498     $self->{line_prev} = $self->{line};
1499     $self->{column_prev} = $self->{column};
1500     $self->{column}++;
1501     $self->{nc}
1502     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1503     } else {
1504     $self->{set_nc}->($self);
1505     }
1506    
1507     redo A;
1508     } elsif ($self->{nc} == 0x0026) { # &
1509    
1510     ## NOTE: In the spec, the tokenizer is switched to the
1511     ## "entity in attribute value state". In this implementation, the
1512     ## tokenizer is switched to the |ENTITY_STATE|, which is an
1513     ## implementation of the "consume a character reference" algorithm.
1514     $self->{prev_state} = $self->{state};
1515     $self->{entity_add} = 0x0022; # "
1516     $self->{state} = ENTITY_STATE;
1517    
1518     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1519     $self->{line_prev} = $self->{line};
1520     $self->{column_prev} = $self->{column};
1521     $self->{column}++;
1522     $self->{nc}
1523     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1524     } else {
1525     $self->{set_nc}->($self);
1526     }
1527    
1528     redo A;
1529     } elsif ($self->{nc} == -1) {
1530     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed attribute value');
1531     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1532    
1533     $self->{last_stag_name} = $self->{ct}->{tag_name};
1534     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1535     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1536     if ($self->{ct}->{attributes}) {
1537    
1538     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1539     } else {
1540     ## NOTE: This state should never be reached.
1541    
1542     }
1543     } else {
1544     die "$0: $self->{ct}->{type}: Unknown token type";
1545     }
1546     $self->{state} = DATA_STATE;
1547     ## reconsume
1548    
1549     return ($self->{ct}); # start tag or end tag
1550    
1551     redo A;
1552     } else {
1553    
1554     $self->{ca}->{value} .= chr ($self->{nc});
1555     $self->{read_until}->($self->{ca}->{value},
1556     q["&],
1557     length $self->{ca}->{value});
1558    
1559     ## Stay in the state
1560    
1561     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1562     $self->{line_prev} = $self->{line};
1563     $self->{column_prev} = $self->{column};
1564     $self->{column}++;
1565     $self->{nc}
1566     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1567     } else {
1568     $self->{set_nc}->($self);
1569     }
1570    
1571     redo A;
1572     }
1573     } elsif ($self->{state} == ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE) {
1574     if ($self->{nc} == 0x0027) { # '
1575    
1576     $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1577    
1578     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1579     $self->{line_prev} = $self->{line};
1580     $self->{column_prev} = $self->{column};
1581     $self->{column}++;
1582     $self->{nc}
1583     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1584     } else {
1585     $self->{set_nc}->($self);
1586     }
1587    
1588     redo A;
1589     } elsif ($self->{nc} == 0x0026) { # &
1590    
1591     ## NOTE: In the spec, the tokenizer is switched to the
1592     ## "entity in attribute value state". In this implementation, the
1593     ## tokenizer is switched to the |ENTITY_STATE|, which is an
1594     ## implementation of the "consume a character reference" algorithm.
1595     $self->{entity_add} = 0x0027; # '
1596     $self->{prev_state} = $self->{state};
1597     $self->{state} = ENTITY_STATE;
1598    
1599     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1600     $self->{line_prev} = $self->{line};
1601     $self->{column_prev} = $self->{column};
1602     $self->{column}++;
1603     $self->{nc}
1604     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1605     } else {
1606     $self->{set_nc}->($self);
1607     }
1608    
1609     redo A;
1610     } elsif ($self->{nc} == -1) {
1611     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed attribute value');
1612     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1613    
1614     $self->{last_stag_name} = $self->{ct}->{tag_name};
1615     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1616     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1617     if ($self->{ct}->{attributes}) {
1618    
1619     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1620     } else {
1621     ## NOTE: This state should never be reached.
1622    
1623     }
1624     } else {
1625     die "$0: $self->{ct}->{type}: Unknown token type";
1626     }
1627     $self->{state} = DATA_STATE;
1628     ## reconsume
1629    
1630     return ($self->{ct}); # start tag or end tag
1631    
1632     redo A;
1633     } else {
1634    
1635     $self->{ca}->{value} .= chr ($self->{nc});
1636     $self->{read_until}->($self->{ca}->{value},
1637     q['&],
1638     length $self->{ca}->{value});
1639    
1640     ## Stay in the state
1641    
1642     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1643     $self->{line_prev} = $self->{line};
1644     $self->{column_prev} = $self->{column};
1645     $self->{column}++;
1646     $self->{nc}
1647     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1648     } else {
1649     $self->{set_nc}->($self);
1650     }
1651    
1652     redo A;
1653     }
1654     } elsif ($self->{state} == ATTRIBUTE_VALUE_UNQUOTED_STATE) {
1655     if ($is_space->{$self->{nc}}) {
1656    
1657     $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1658    
1659     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1660     $self->{line_prev} = $self->{line};
1661     $self->{column_prev} = $self->{column};
1662     $self->{column}++;
1663     $self->{nc}
1664     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1665     } else {
1666     $self->{set_nc}->($self);
1667     }
1668    
1669     redo A;
1670     } elsif ($self->{nc} == 0x0026) { # &
1671    
1672     ## NOTE: In the spec, the tokenizer is switched to the
1673     ## "entity in attribute value state". In this implementation, the
1674     ## tokenizer is switched to the |ENTITY_STATE|, which is an
1675     ## implementation of the "consume a character reference" algorithm.
1676     $self->{entity_add} = -1;
1677     $self->{prev_state} = $self->{state};
1678     $self->{state} = ENTITY_STATE;
1679    
1680     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1681     $self->{line_prev} = $self->{line};
1682     $self->{column_prev} = $self->{column};
1683     $self->{column}++;
1684     $self->{nc}
1685     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1686     } else {
1687     $self->{set_nc}->($self);
1688     }
1689    
1690     redo A;
1691     } elsif ($self->{nc} == 0x003E) { # >
1692     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1693    
1694     $self->{last_stag_name} = $self->{ct}->{tag_name};
1695     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1696     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1697     if ($self->{ct}->{attributes}) {
1698    
1699     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1700     } else {
1701     ## NOTE: This state should never be reached.
1702    
1703     }
1704     } else {
1705     die "$0: $self->{ct}->{type}: Unknown token type";
1706     }
1707     $self->{state} = DATA_STATE;
1708    
1709     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1710     $self->{line_prev} = $self->{line};
1711     $self->{column_prev} = $self->{column};
1712     $self->{column}++;
1713     $self->{nc}
1714     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1715     } else {
1716     $self->{set_nc}->($self);
1717     }
1718    
1719    
1720     return ($self->{ct}); # start tag or end tag
1721    
1722     redo A;
1723     } elsif ($self->{nc} == -1) {
1724     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1725     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1726    
1727     $self->{last_stag_name} = $self->{ct}->{tag_name};
1728     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1729     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1730     if ($self->{ct}->{attributes}) {
1731    
1732     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1733     } else {
1734     ## NOTE: This state should never be reached.
1735    
1736     }
1737     } else {
1738     die "$0: $self->{ct}->{type}: Unknown token type";
1739     }
1740     $self->{state} = DATA_STATE;
1741     ## reconsume
1742    
1743     return ($self->{ct}); # start tag or end tag
1744    
1745     redo A;
1746     } else {
1747     if ({
1748     0x0022 => 1, # "
1749     0x0027 => 1, # '
1750     0x003D => 1, # =
1751     }->{$self->{nc}}) {
1752    
1753     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute value');
1754     } else {
1755    
1756     }
1757     $self->{ca}->{value} .= chr ($self->{nc});
1758     $self->{read_until}->($self->{ca}->{value},
1759     q["'=& >],
1760     length $self->{ca}->{value});
1761    
1762     ## Stay in the state
1763    
1764     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1765     $self->{line_prev} = $self->{line};
1766     $self->{column_prev} = $self->{column};
1767     $self->{column}++;
1768     $self->{nc}
1769     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1770     } else {
1771     $self->{set_nc}->($self);
1772     }
1773    
1774     redo A;
1775     }
1776     } elsif ($self->{state} == AFTER_ATTRIBUTE_VALUE_QUOTED_STATE) {
1777     if ($is_space->{$self->{nc}}) {
1778    
1779     $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1780    
1781     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1782     $self->{line_prev} = $self->{line};
1783     $self->{column_prev} = $self->{column};
1784     $self->{column}++;
1785     $self->{nc}
1786     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1787     } else {
1788     $self->{set_nc}->($self);
1789     }
1790    
1791     redo A;
1792     } elsif ($self->{nc} == 0x003E) { # >
1793     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1794    
1795     $self->{last_stag_name} = $self->{ct}->{tag_name};
1796     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1797     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1798     if ($self->{ct}->{attributes}) {
1799    
1800     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1801     } else {
1802     ## NOTE: This state should never be reached.
1803    
1804     }
1805     } else {
1806     die "$0: $self->{ct}->{type}: Unknown token type";
1807     }
1808     $self->{state} = DATA_STATE;
1809    
1810     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1811     $self->{line_prev} = $self->{line};
1812     $self->{column_prev} = $self->{column};
1813     $self->{column}++;
1814     $self->{nc}
1815     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1816     } else {
1817     $self->{set_nc}->($self);
1818     }
1819    
1820    
1821     return ($self->{ct}); # start tag or end tag
1822    
1823     redo A;
1824     } elsif ($self->{nc} == 0x002F) { # /
1825    
1826     $self->{state} = SELF_CLOSING_START_TAG_STATE;
1827    
1828     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1829     $self->{line_prev} = $self->{line};
1830     $self->{column_prev} = $self->{column};
1831     $self->{column}++;
1832     $self->{nc}
1833     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1834     } else {
1835     $self->{set_nc}->($self);
1836     }
1837    
1838     redo A;
1839     } elsif ($self->{nc} == -1) {
1840     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1841     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1842    
1843     $self->{last_stag_name} = $self->{ct}->{tag_name};
1844     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1845     if ($self->{ct}->{attributes}) {
1846    
1847     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1848     } else {
1849     ## NOTE: This state should never be reached.
1850    
1851     }
1852     } else {
1853     die "$0: $self->{ct}->{type}: Unknown token type";
1854     }
1855     $self->{state} = DATA_STATE;
1856     ## Reconsume.
1857     return ($self->{ct}); # start tag or end tag
1858     redo A;
1859     } else {
1860    
1861     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space between attributes');
1862     $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1863     ## reconsume
1864     redo A;
1865     }
1866     } elsif ($self->{state} == SELF_CLOSING_START_TAG_STATE) {
1867     if ($self->{nc} == 0x003E) { # >
1868     if ($self->{ct}->{type} == END_TAG_TOKEN) {
1869    
1870     $self->{parse_error}->(level => $self->{level}->{must}, type => 'nestc', token => $self->{ct});
1871     ## TODO: Different type than slash in start tag
1872     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1873     if ($self->{ct}->{attributes}) {
1874    
1875     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1876     } else {
1877    
1878     }
1879     ## TODO: Test |<title></title/>|
1880     } else {
1881    
1882     $self->{self_closing} = 1;
1883     }
1884    
1885     $self->{state} = DATA_STATE;
1886    
1887     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1888     $self->{line_prev} = $self->{line};
1889     $self->{column_prev} = $self->{column};
1890     $self->{column}++;
1891     $self->{nc}
1892     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1893     } else {
1894     $self->{set_nc}->($self);
1895     }
1896    
1897    
1898     return ($self->{ct}); # start tag or end tag
1899    
1900     redo A;
1901     } elsif ($self->{nc} == -1) {
1902     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1903     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1904    
1905     $self->{last_stag_name} = $self->{ct}->{tag_name};
1906     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1907     if ($self->{ct}->{attributes}) {
1908    
1909     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1910     } else {
1911     ## NOTE: This state should never be reached.
1912    
1913     }
1914     } else {
1915     die "$0: $self->{ct}->{type}: Unknown token type";
1916     }
1917     $self->{state} = DATA_STATE;
1918     ## Reconsume.
1919     return ($self->{ct}); # start tag or end tag
1920     redo A;
1921     } else {
1922    
1923     $self->{parse_error}->(level => $self->{level}->{must}, type => 'nestc');
1924     ## TODO: This error type is wrong.
1925     $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1926     ## Reconsume.
1927     redo A;
1928     }
1929     } elsif ($self->{state} == BOGUS_COMMENT_STATE) {
1930     ## (only happen if PCDATA state)
1931    
1932     ## NOTE: Unlike spec's "bogus comment state", this implementation
1933     ## consumes characters one-by-one basis.
1934    
1935     if ($self->{nc} == 0x003E) { # >
1936    
1937     $self->{state} = DATA_STATE;
1938    
1939     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1940     $self->{line_prev} = $self->{line};
1941     $self->{column_prev} = $self->{column};
1942     $self->{column}++;
1943     $self->{nc}
1944     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1945     } else {
1946     $self->{set_nc}->($self);
1947     }
1948    
1949    
1950     return ($self->{ct}); # comment
1951     redo A;
1952     } elsif ($self->{nc} == -1) {
1953    
1954     $self->{state} = DATA_STATE;
1955     ## reconsume
1956    
1957     return ($self->{ct}); # comment
1958     redo A;
1959     } else {
1960    
1961     $self->{ct}->{data} .= chr ($self->{nc}); # comment
1962     $self->{read_until}->($self->{ct}->{data},
1963     q[>],
1964     length $self->{ct}->{data});
1965    
1966     ## Stay in the state.
1967    
1968     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1969     $self->{line_prev} = $self->{line};
1970     $self->{column_prev} = $self->{column};
1971     $self->{column}++;
1972     $self->{nc}
1973     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1974     } else {
1975     $self->{set_nc}->($self);
1976     }
1977    
1978     redo A;
1979     }
1980     } elsif ($self->{state} == MARKUP_DECLARATION_OPEN_STATE) {
1981     ## (only happen if PCDATA state)
1982    
1983     if ($self->{nc} == 0x002D) { # -
1984    
1985     $self->{state} = MD_HYPHEN_STATE;
1986    
1987     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1988     $self->{line_prev} = $self->{line};
1989     $self->{column_prev} = $self->{column};
1990     $self->{column}++;
1991     $self->{nc}
1992     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1993     } else {
1994     $self->{set_nc}->($self);
1995     }
1996    
1997     redo A;
1998     } elsif ($self->{nc} == 0x0044 or # D
1999     $self->{nc} == 0x0064) { # d
2000     ## ASCII case-insensitive.
2001    
2002     $self->{state} = MD_DOCTYPE_STATE;
2003     $self->{s_kwd} = chr $self->{nc};
2004    
2005     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2006     $self->{line_prev} = $self->{line};
2007     $self->{column_prev} = $self->{column};
2008     $self->{column}++;
2009     $self->{nc}
2010     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2011     } else {
2012     $self->{set_nc}->($self);
2013     }
2014    
2015     redo A;
2016 wakaba 1.3 } elsif ((($self->{insertion_mode} & IN_FOREIGN_CONTENT_IM and
2017     $self->{open_elements}->[-1]->[1] & FOREIGN_EL) or
2018     $self->{is_xml}) and
2019 wakaba 1.1 $self->{nc} == 0x005B) { # [
2020    
2021     $self->{state} = MD_CDATA_STATE;
2022     $self->{s_kwd} = '[';
2023    
2024     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2025     $self->{line_prev} = $self->{line};
2026     $self->{column_prev} = $self->{column};
2027     $self->{column}++;
2028     $self->{nc}
2029     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2030     } else {
2031     $self->{set_nc}->($self);
2032     }
2033    
2034     redo A;
2035     } else {
2036    
2037     }
2038    
2039     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
2040     line => $self->{line_prev},
2041     column => $self->{column_prev} - 1);
2042     ## Reconsume.
2043     $self->{state} = BOGUS_COMMENT_STATE;
2044     $self->{ct} = {type => COMMENT_TOKEN, data => '',
2045     line => $self->{line_prev},
2046     column => $self->{column_prev} - 1,
2047     };
2048     redo A;
2049     } elsif ($self->{state} == MD_HYPHEN_STATE) {
2050     if ($self->{nc} == 0x002D) { # -
2051    
2052     $self->{ct} = {type => COMMENT_TOKEN, data => '',
2053     line => $self->{line_prev},
2054     column => $self->{column_prev} - 2,
2055     };
2056     $self->{state} = COMMENT_START_STATE;
2057    
2058     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2059     $self->{line_prev} = $self->{line};
2060     $self->{column_prev} = $self->{column};
2061     $self->{column}++;
2062     $self->{nc}
2063     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2064     } else {
2065     $self->{set_nc}->($self);
2066     }
2067    
2068     redo A;
2069     } else {
2070    
2071     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
2072     line => $self->{line_prev},
2073     column => $self->{column_prev} - 2);
2074     $self->{state} = BOGUS_COMMENT_STATE;
2075     ## Reconsume.
2076     $self->{ct} = {type => COMMENT_TOKEN,
2077     data => '-',
2078     line => $self->{line_prev},
2079     column => $self->{column_prev} - 2,
2080     };
2081     redo A;
2082     }
2083     } elsif ($self->{state} == MD_DOCTYPE_STATE) {
2084     ## ASCII case-insensitive.
2085     if ($self->{nc} == [
2086     undef,
2087     0x004F, # O
2088     0x0043, # C
2089     0x0054, # T
2090     0x0059, # Y
2091     0x0050, # P
2092     ]->[length $self->{s_kwd}] or
2093     $self->{nc} == [
2094     undef,
2095     0x006F, # o
2096     0x0063, # c
2097     0x0074, # t
2098     0x0079, # y
2099     0x0070, # p
2100     ]->[length $self->{s_kwd}]) {
2101    
2102     ## Stay in the state.
2103     $self->{s_kwd} .= chr $self->{nc};
2104    
2105     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2106     $self->{line_prev} = $self->{line};
2107     $self->{column_prev} = $self->{column};
2108     $self->{column}++;
2109     $self->{nc}
2110     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2111     } else {
2112     $self->{set_nc}->($self);
2113     }
2114    
2115     redo A;
2116     } elsif ((length $self->{s_kwd}) == 6 and
2117     ($self->{nc} == 0x0045 or # E
2118     $self->{nc} == 0x0065)) { # e
2119    
2120     $self->{state} = DOCTYPE_STATE;
2121     $self->{ct} = {type => DOCTYPE_TOKEN,
2122     quirks => 1,
2123     line => $self->{line_prev},
2124     column => $self->{column_prev} - 7,
2125     };
2126    
2127     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2128     $self->{line_prev} = $self->{line};
2129     $self->{column_prev} = $self->{column};
2130     $self->{column}++;
2131     $self->{nc}
2132     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2133     } else {
2134     $self->{set_nc}->($self);
2135     }
2136    
2137     redo A;
2138     } else {
2139    
2140     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
2141     line => $self->{line_prev},
2142     column => $self->{column_prev} - 1 - length $self->{s_kwd});
2143     $self->{state} = BOGUS_COMMENT_STATE;
2144     ## Reconsume.
2145     $self->{ct} = {type => COMMENT_TOKEN,
2146     data => $self->{s_kwd},
2147     line => $self->{line_prev},
2148     column => $self->{column_prev} - 1 - length $self->{s_kwd},
2149     };
2150     redo A;
2151     }
2152     } elsif ($self->{state} == MD_CDATA_STATE) {
2153     if ($self->{nc} == {
2154     '[' => 0x0043, # C
2155     '[C' => 0x0044, # D
2156     '[CD' => 0x0041, # A
2157     '[CDA' => 0x0054, # T
2158     '[CDAT' => 0x0041, # A
2159     }->{$self->{s_kwd}}) {
2160    
2161     ## Stay in the state.
2162     $self->{s_kwd} .= chr $self->{nc};
2163    
2164     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2165     $self->{line_prev} = $self->{line};
2166     $self->{column_prev} = $self->{column};
2167     $self->{column}++;
2168     $self->{nc}
2169     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2170     } else {
2171     $self->{set_nc}->($self);
2172     }
2173    
2174     redo A;
2175     } elsif ($self->{s_kwd} eq '[CDATA' and
2176     $self->{nc} == 0x005B) { # [
2177    
2178     $self->{ct} = {type => CHARACTER_TOKEN,
2179     data => '',
2180     line => $self->{line_prev},
2181     column => $self->{column_prev} - 7};
2182     $self->{state} = CDATA_SECTION_STATE;
2183    
2184     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2185     $self->{line_prev} = $self->{line};
2186     $self->{column_prev} = $self->{column};
2187     $self->{column}++;
2188     $self->{nc}
2189     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2190     } else {
2191     $self->{set_nc}->($self);
2192     }
2193    
2194     redo A;
2195     } else {
2196    
2197     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
2198     line => $self->{line_prev},
2199     column => $self->{column_prev} - 1 - length $self->{s_kwd});
2200     $self->{state} = BOGUS_COMMENT_STATE;
2201     ## Reconsume.
2202     $self->{ct} = {type => COMMENT_TOKEN,
2203     data => $self->{s_kwd},
2204     line => $self->{line_prev},
2205     column => $self->{column_prev} - 1 - length $self->{s_kwd},
2206     };
2207     redo A;
2208     }
2209     } elsif ($self->{state} == COMMENT_START_STATE) {
2210     if ($self->{nc} == 0x002D) { # -
2211    
2212     $self->{state} = COMMENT_START_DASH_STATE;
2213    
2214     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2215     $self->{line_prev} = $self->{line};
2216     $self->{column_prev} = $self->{column};
2217     $self->{column}++;
2218     $self->{nc}
2219     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2220     } else {
2221     $self->{set_nc}->($self);
2222     }
2223    
2224     redo A;
2225     } elsif ($self->{nc} == 0x003E) { # >
2226    
2227     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment');
2228     $self->{state} = DATA_STATE;
2229    
2230     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2231     $self->{line_prev} = $self->{line};
2232     $self->{column_prev} = $self->{column};
2233     $self->{column}++;
2234     $self->{nc}
2235     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2236     } else {
2237     $self->{set_nc}->($self);
2238     }
2239    
2240    
2241     return ($self->{ct}); # comment
2242    
2243     redo A;
2244     } elsif ($self->{nc} == -1) {
2245    
2246     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2247     $self->{state} = DATA_STATE;
2248     ## reconsume
2249    
2250     return ($self->{ct}); # comment
2251    
2252     redo A;
2253     } else {
2254    
2255     $self->{ct}->{data} # comment
2256     .= chr ($self->{nc});
2257     $self->{state} = COMMENT_STATE;
2258    
2259     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2260     $self->{line_prev} = $self->{line};
2261     $self->{column_prev} = $self->{column};
2262     $self->{column}++;
2263     $self->{nc}
2264     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2265     } else {
2266     $self->{set_nc}->($self);
2267     }
2268    
2269     redo A;
2270     }
2271     } elsif ($self->{state} == COMMENT_START_DASH_STATE) {
2272     if ($self->{nc} == 0x002D) { # -
2273    
2274     $self->{state} = COMMENT_END_STATE;
2275    
2276     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2277     $self->{line_prev} = $self->{line};
2278     $self->{column_prev} = $self->{column};
2279     $self->{column}++;
2280     $self->{nc}
2281     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2282     } else {
2283     $self->{set_nc}->($self);
2284     }
2285    
2286     redo A;
2287     } elsif ($self->{nc} == 0x003E) { # >
2288    
2289     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment');
2290     $self->{state} = DATA_STATE;
2291    
2292     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2293     $self->{line_prev} = $self->{line};
2294     $self->{column_prev} = $self->{column};
2295     $self->{column}++;
2296     $self->{nc}
2297     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2298     } else {
2299     $self->{set_nc}->($self);
2300     }
2301    
2302    
2303     return ($self->{ct}); # comment
2304    
2305     redo A;
2306     } elsif ($self->{nc} == -1) {
2307    
2308     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2309     $self->{state} = DATA_STATE;
2310     ## reconsume
2311    
2312     return ($self->{ct}); # comment
2313    
2314     redo A;
2315     } else {
2316    
2317     $self->{ct}->{data} # comment
2318     .= '-' . chr ($self->{nc});
2319     $self->{state} = COMMENT_STATE;
2320    
2321     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2322     $self->{line_prev} = $self->{line};
2323     $self->{column_prev} = $self->{column};
2324     $self->{column}++;
2325     $self->{nc}
2326     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2327     } else {
2328     $self->{set_nc}->($self);
2329     }
2330    
2331     redo A;
2332     }
2333     } elsif ($self->{state} == COMMENT_STATE) {
2334     if ($self->{nc} == 0x002D) { # -
2335    
2336     $self->{state} = COMMENT_END_DASH_STATE;
2337    
2338     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2339     $self->{line_prev} = $self->{line};
2340     $self->{column_prev} = $self->{column};
2341     $self->{column}++;
2342     $self->{nc}
2343     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2344     } else {
2345     $self->{set_nc}->($self);
2346     }
2347    
2348     redo A;
2349     } elsif ($self->{nc} == -1) {
2350    
2351     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2352     $self->{state} = DATA_STATE;
2353     ## reconsume
2354    
2355     return ($self->{ct}); # comment
2356    
2357     redo A;
2358     } else {
2359    
2360     $self->{ct}->{data} .= chr ($self->{nc}); # comment
2361     $self->{read_until}->($self->{ct}->{data},
2362     q[-],
2363     length $self->{ct}->{data});
2364    
2365     ## Stay in the state
2366    
2367     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2368     $self->{line_prev} = $self->{line};
2369     $self->{column_prev} = $self->{column};
2370     $self->{column}++;
2371     $self->{nc}
2372     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2373     } else {
2374     $self->{set_nc}->($self);
2375     }
2376    
2377     redo A;
2378     }
2379     } elsif ($self->{state} == COMMENT_END_DASH_STATE) {
2380     if ($self->{nc} == 0x002D) { # -
2381    
2382     $self->{state} = COMMENT_END_STATE;
2383    
2384     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2385     $self->{line_prev} = $self->{line};
2386     $self->{column_prev} = $self->{column};
2387     $self->{column}++;
2388     $self->{nc}
2389     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2390     } else {
2391     $self->{set_nc}->($self);
2392     }
2393    
2394     redo A;
2395     } elsif ($self->{nc} == -1) {
2396    
2397     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2398     $self->{state} = DATA_STATE;
2399     ## reconsume
2400    
2401     return ($self->{ct}); # comment
2402    
2403     redo A;
2404     } else {
2405    
2406     $self->{ct}->{data} .= '-' . chr ($self->{nc}); # comment
2407     $self->{state} = COMMENT_STATE;
2408    
2409     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2410     $self->{line_prev} = $self->{line};
2411     $self->{column_prev} = $self->{column};
2412     $self->{column}++;
2413     $self->{nc}
2414     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2415     } else {
2416     $self->{set_nc}->($self);
2417     }
2418    
2419     redo A;
2420     }
2421     } elsif ($self->{state} == COMMENT_END_STATE) {
2422     if ($self->{nc} == 0x003E) { # >
2423    
2424     $self->{state} = DATA_STATE;
2425    
2426     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2427     $self->{line_prev} = $self->{line};
2428     $self->{column_prev} = $self->{column};
2429     $self->{column}++;
2430     $self->{nc}
2431     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2432     } else {
2433     $self->{set_nc}->($self);
2434     }
2435    
2436    
2437     return ($self->{ct}); # comment
2438    
2439     redo A;
2440     } elsif ($self->{nc} == 0x002D) { # -
2441    
2442     $self->{parse_error}->(level => $self->{level}->{must}, type => 'dash in comment',
2443     line => $self->{line_prev},
2444     column => $self->{column_prev});
2445     $self->{ct}->{data} .= '-'; # comment
2446     ## Stay in the state
2447    
2448     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2449     $self->{line_prev} = $self->{line};
2450     $self->{column_prev} = $self->{column};
2451     $self->{column}++;
2452     $self->{nc}
2453     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2454     } else {
2455     $self->{set_nc}->($self);
2456     }
2457    
2458     redo A;
2459     } elsif ($self->{nc} == -1) {
2460    
2461     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2462     $self->{state} = DATA_STATE;
2463     ## reconsume
2464    
2465     return ($self->{ct}); # comment
2466    
2467     redo A;
2468     } else {
2469    
2470     $self->{parse_error}->(level => $self->{level}->{must}, type => 'dash in comment',
2471     line => $self->{line_prev},
2472     column => $self->{column_prev});
2473     $self->{ct}->{data} .= '--' . chr ($self->{nc}); # comment
2474     $self->{state} = COMMENT_STATE;
2475    
2476     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2477     $self->{line_prev} = $self->{line};
2478     $self->{column_prev} = $self->{column};
2479     $self->{column}++;
2480     $self->{nc}
2481     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2482     } else {
2483     $self->{set_nc}->($self);
2484     }
2485    
2486     redo A;
2487     }
2488     } elsif ($self->{state} == DOCTYPE_STATE) {
2489     if ($is_space->{$self->{nc}}) {
2490    
2491     $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
2492    
2493     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2494     $self->{line_prev} = $self->{line};
2495     $self->{column_prev} = $self->{column};
2496     $self->{column}++;
2497     $self->{nc}
2498     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2499     } else {
2500     $self->{set_nc}->($self);
2501     }
2502    
2503     redo A;
2504     } else {
2505    
2506     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before DOCTYPE name');
2507     $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
2508     ## reconsume
2509     redo A;
2510     }
2511     } elsif ($self->{state} == BEFORE_DOCTYPE_NAME_STATE) {
2512     if ($is_space->{$self->{nc}}) {
2513    
2514     ## Stay in the state
2515    
2516     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2517     $self->{line_prev} = $self->{line};
2518     $self->{column_prev} = $self->{column};
2519     $self->{column}++;
2520     $self->{nc}
2521     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2522     } else {
2523     $self->{set_nc}->($self);
2524     }
2525    
2526     redo A;
2527     } elsif ($self->{nc} == 0x003E) { # >
2528    
2529     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no DOCTYPE name');
2530     $self->{state} = DATA_STATE;
2531    
2532     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2533     $self->{line_prev} = $self->{line};
2534     $self->{column_prev} = $self->{column};
2535     $self->{column}++;
2536     $self->{nc}
2537     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2538     } else {
2539     $self->{set_nc}->($self);
2540     }
2541    
2542    
2543     return ($self->{ct}); # DOCTYPE (quirks)
2544    
2545     redo A;
2546     } elsif ($self->{nc} == -1) {
2547    
2548     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no DOCTYPE name');
2549     $self->{state} = DATA_STATE;
2550     ## reconsume
2551    
2552     return ($self->{ct}); # DOCTYPE (quirks)
2553    
2554     redo A;
2555     } else {
2556    
2557     $self->{ct}->{name} = chr $self->{nc};
2558     delete $self->{ct}->{quirks};
2559     $self->{state} = DOCTYPE_NAME_STATE;
2560    
2561     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2562     $self->{line_prev} = $self->{line};
2563     $self->{column_prev} = $self->{column};
2564     $self->{column}++;
2565     $self->{nc}
2566     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2567     } else {
2568     $self->{set_nc}->($self);
2569     }
2570    
2571     redo A;
2572     }
2573     } elsif ($self->{state} == DOCTYPE_NAME_STATE) {
2574     ## ISSUE: Redundant "First," in the spec.
2575     if ($is_space->{$self->{nc}}) {
2576    
2577     $self->{state} = AFTER_DOCTYPE_NAME_STATE;
2578    
2579     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2580     $self->{line_prev} = $self->{line};
2581     $self->{column_prev} = $self->{column};
2582     $self->{column}++;
2583     $self->{nc}
2584     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2585     } else {
2586     $self->{set_nc}->($self);
2587     }
2588    
2589     redo A;
2590     } elsif ($self->{nc} == 0x003E) { # >
2591    
2592     $self->{state} = DATA_STATE;
2593    
2594     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2595     $self->{line_prev} = $self->{line};
2596     $self->{column_prev} = $self->{column};
2597     $self->{column}++;
2598     $self->{nc}
2599     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2600     } else {
2601     $self->{set_nc}->($self);
2602     }
2603    
2604    
2605     return ($self->{ct}); # DOCTYPE
2606    
2607     redo A;
2608     } elsif ($self->{nc} == -1) {
2609    
2610     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
2611     $self->{state} = DATA_STATE;
2612     ## reconsume
2613    
2614     $self->{ct}->{quirks} = 1;
2615     return ($self->{ct}); # DOCTYPE
2616    
2617     redo A;
2618     } else {
2619    
2620     $self->{ct}->{name}
2621     .= chr ($self->{nc}); # DOCTYPE
2622     ## Stay in the state
2623    
2624     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2625     $self->{line_prev} = $self->{line};
2626     $self->{column_prev} = $self->{column};
2627     $self->{column}++;
2628     $self->{nc}
2629     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2630     } else {
2631     $self->{set_nc}->($self);
2632     }
2633    
2634     redo A;
2635     }
2636     } elsif ($self->{state} == AFTER_DOCTYPE_NAME_STATE) {
2637     if ($is_space->{$self->{nc}}) {
2638    
2639     ## Stay in the state
2640    
2641     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2642     $self->{line_prev} = $self->{line};
2643     $self->{column_prev} = $self->{column};
2644     $self->{column}++;
2645     $self->{nc}
2646     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2647     } else {
2648     $self->{set_nc}->($self);
2649     }
2650    
2651     redo A;
2652     } elsif ($self->{nc} == 0x003E) { # >
2653    
2654     $self->{state} = DATA_STATE;
2655    
2656     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2657     $self->{line_prev} = $self->{line};
2658     $self->{column_prev} = $self->{column};
2659     $self->{column}++;
2660     $self->{nc}
2661     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2662     } else {
2663     $self->{set_nc}->($self);
2664     }
2665    
2666    
2667     return ($self->{ct}); # DOCTYPE
2668    
2669     redo A;
2670     } elsif ($self->{nc} == -1) {
2671    
2672     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
2673     $self->{state} = DATA_STATE;
2674     ## reconsume
2675    
2676     $self->{ct}->{quirks} = 1;
2677     return ($self->{ct}); # DOCTYPE
2678    
2679     redo A;
2680     } elsif ($self->{nc} == 0x0050 or # P
2681     $self->{nc} == 0x0070) { # p
2682     $self->{state} = PUBLIC_STATE;
2683     $self->{s_kwd} = chr $self->{nc};
2684    
2685     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2686     $self->{line_prev} = $self->{line};
2687     $self->{column_prev} = $self->{column};
2688     $self->{column}++;
2689     $self->{nc}
2690     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2691     } else {
2692     $self->{set_nc}->($self);
2693     }
2694    
2695     redo A;
2696     } elsif ($self->{nc} == 0x0053 or # S
2697     $self->{nc} == 0x0073) { # s
2698     $self->{state} = SYSTEM_STATE;
2699     $self->{s_kwd} = chr $self->{nc};
2700    
2701     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2702     $self->{line_prev} = $self->{line};
2703     $self->{column_prev} = $self->{column};
2704     $self->{column}++;
2705     $self->{nc}
2706     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2707     } else {
2708     $self->{set_nc}->($self);
2709     }
2710    
2711     redo A;
2712     } else {
2713    
2714     $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after DOCTYPE name');
2715     $self->{ct}->{quirks} = 1;
2716    
2717     $self->{state} = BOGUS_DOCTYPE_STATE;
2718    
2719     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2720     $self->{line_prev} = $self->{line};
2721     $self->{column_prev} = $self->{column};
2722     $self->{column}++;
2723     $self->{nc}
2724     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2725     } else {
2726     $self->{set_nc}->($self);
2727     }
2728    
2729     redo A;
2730     }
2731     } elsif ($self->{state} == PUBLIC_STATE) {
2732     ## ASCII case-insensitive
2733     if ($self->{nc} == [
2734     undef,
2735     0x0055, # U
2736     0x0042, # B
2737     0x004C, # L
2738     0x0049, # I
2739     ]->[length $self->{s_kwd}] or
2740     $self->{nc} == [
2741     undef,
2742     0x0075, # u
2743     0x0062, # b
2744     0x006C, # l
2745     0x0069, # i
2746     ]->[length $self->{s_kwd}]) {
2747    
2748     ## Stay in the state.
2749     $self->{s_kwd} .= chr $self->{nc};
2750    
2751     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2752     $self->{line_prev} = $self->{line};
2753     $self->{column_prev} = $self->{column};
2754     $self->{column}++;
2755     $self->{nc}
2756     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2757     } else {
2758     $self->{set_nc}->($self);
2759     }
2760    
2761     redo A;
2762     } elsif ((length $self->{s_kwd}) == 5 and
2763     ($self->{nc} == 0x0043 or # C
2764     $self->{nc} == 0x0063)) { # c
2765    
2766     $self->{state} = BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
2767    
2768     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2769     $self->{line_prev} = $self->{line};
2770     $self->{column_prev} = $self->{column};
2771     $self->{column}++;
2772     $self->{nc}
2773     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2774     } else {
2775     $self->{set_nc}->($self);
2776     }
2777    
2778     redo A;
2779     } else {
2780    
2781     $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after DOCTYPE name',
2782     line => $self->{line_prev},
2783     column => $self->{column_prev} + 1 - length $self->{s_kwd});
2784     $self->{ct}->{quirks} = 1;
2785    
2786     $self->{state} = BOGUS_DOCTYPE_STATE;
2787     ## Reconsume.
2788     redo A;
2789     }
2790     } elsif ($self->{state} == SYSTEM_STATE) {
2791     ## ASCII case-insensitive
2792     if ($self->{nc} == [
2793     undef,
2794     0x0059, # Y
2795     0x0053, # S
2796     0x0054, # T
2797     0x0045, # E
2798     ]->[length $self->{s_kwd}] or
2799     $self->{nc} == [
2800     undef,
2801     0x0079, # y
2802     0x0073, # s
2803     0x0074, # t
2804     0x0065, # e
2805     ]->[length $self->{s_kwd}]) {
2806    
2807     ## Stay in the state.
2808     $self->{s_kwd} .= chr $self->{nc};
2809    
2810     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2811     $self->{line_prev} = $self->{line};
2812     $self->{column_prev} = $self->{column};
2813     $self->{column}++;
2814     $self->{nc}
2815     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2816     } else {
2817     $self->{set_nc}->($self);
2818     }
2819    
2820     redo A;
2821     } elsif ((length $self->{s_kwd}) == 5 and
2822     ($self->{nc} == 0x004D or # M
2823     $self->{nc} == 0x006D)) { # m
2824    
2825     $self->{state} = BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
2826    
2827     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2828     $self->{line_prev} = $self->{line};
2829     $self->{column_prev} = $self->{column};
2830     $self->{column}++;
2831     $self->{nc}
2832     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2833     } else {
2834     $self->{set_nc}->($self);
2835     }
2836    
2837     redo A;
2838     } else {
2839    
2840     $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after DOCTYPE name',
2841     line => $self->{line_prev},
2842     column => $self->{column_prev} + 1 - length $self->{s_kwd});
2843     $self->{ct}->{quirks} = 1;
2844    
2845     $self->{state} = BOGUS_DOCTYPE_STATE;
2846     ## Reconsume.
2847     redo A;
2848     }
2849     } elsif ($self->{state} == BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE) {
2850     if ($is_space->{$self->{nc}}) {
2851    
2852     ## Stay in the state
2853    
2854     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2855     $self->{line_prev} = $self->{line};
2856     $self->{column_prev} = $self->{column};
2857     $self->{column}++;
2858     $self->{nc}
2859     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2860     } else {
2861     $self->{set_nc}->($self);
2862     }
2863    
2864     redo A;
2865     } elsif ($self->{nc} eq 0x0022) { # "
2866    
2867     $self->{ct}->{pubid} = ''; # DOCTYPE
2868     $self->{state} = DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE;
2869    
2870     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2871     $self->{line_prev} = $self->{line};
2872     $self->{column_prev} = $self->{column};
2873     $self->{column}++;
2874     $self->{nc}
2875     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2876     } else {
2877     $self->{set_nc}->($self);
2878     }
2879    
2880     redo A;
2881     } elsif ($self->{nc} eq 0x0027) { # '
2882    
2883     $self->{ct}->{pubid} = ''; # DOCTYPE
2884     $self->{state} = DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE;
2885    
2886     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2887     $self->{line_prev} = $self->{line};
2888     $self->{column_prev} = $self->{column};
2889     $self->{column}++;
2890     $self->{nc}
2891     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2892     } else {
2893     $self->{set_nc}->($self);
2894     }
2895    
2896     redo A;
2897     } elsif ($self->{nc} eq 0x003E) { # >
2898    
2899     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no PUBLIC literal');
2900    
2901     $self->{state} = DATA_STATE;
2902    
2903     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2904     $self->{line_prev} = $self->{line};
2905     $self->{column_prev} = $self->{column};
2906     $self->{column}++;
2907     $self->{nc}
2908     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2909     } else {
2910     $self->{set_nc}->($self);
2911     }
2912    
2913    
2914     $self->{ct}->{quirks} = 1;
2915     return ($self->{ct}); # DOCTYPE
2916    
2917     redo A;
2918     } elsif ($self->{nc} == -1) {
2919    
2920     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
2921    
2922     $self->{state} = DATA_STATE;
2923     ## reconsume
2924    
2925     $self->{ct}->{quirks} = 1;
2926     return ($self->{ct}); # DOCTYPE
2927    
2928     redo A;
2929     } else {
2930    
2931     $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after PUBLIC');
2932     $self->{ct}->{quirks} = 1;
2933    
2934     $self->{state} = BOGUS_DOCTYPE_STATE;
2935    
2936     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2937     $self->{line_prev} = $self->{line};
2938     $self->{column_prev} = $self->{column};
2939     $self->{column}++;
2940     $self->{nc}
2941     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2942     } else {
2943     $self->{set_nc}->($self);
2944     }
2945    
2946     redo A;
2947     }
2948     } elsif ($self->{state} == DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE) {
2949     if ($self->{nc} == 0x0022) { # "
2950    
2951     $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
2952    
2953     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2954     $self->{line_prev} = $self->{line};
2955     $self->{column_prev} = $self->{column};
2956     $self->{column}++;
2957     $self->{nc}
2958     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2959     } else {
2960     $self->{set_nc}->($self);
2961     }
2962    
2963     redo A;
2964     } elsif ($self->{nc} == 0x003E) { # >
2965    
2966     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');
2967    
2968     $self->{state} = DATA_STATE;
2969    
2970     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2971     $self->{line_prev} = $self->{line};
2972     $self->{column_prev} = $self->{column};
2973     $self->{column}++;
2974     $self->{nc}
2975     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2976     } else {
2977     $self->{set_nc}->($self);
2978     }
2979    
2980    
2981     $self->{ct}->{quirks} = 1;
2982     return ($self->{ct}); # DOCTYPE
2983    
2984     redo A;
2985     } elsif ($self->{nc} == -1) {
2986    
2987     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');
2988    
2989     $self->{state} = DATA_STATE;
2990     ## reconsume
2991    
2992     $self->{ct}->{quirks} = 1;
2993     return ($self->{ct}); # DOCTYPE
2994    
2995     redo A;
2996     } else {
2997    
2998     $self->{ct}->{pubid} # DOCTYPE
2999     .= chr $self->{nc};
3000     $self->{read_until}->($self->{ct}->{pubid}, q[">],
3001     length $self->{ct}->{pubid});
3002    
3003     ## Stay in the state
3004    
3005     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3006     $self->{line_prev} = $self->{line};
3007     $self->{column_prev} = $self->{column};
3008     $self->{column}++;
3009     $self->{nc}
3010     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3011     } else {
3012     $self->{set_nc}->($self);
3013     }
3014    
3015     redo A;
3016     }
3017     } elsif ($self->{state} == DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE) {
3018     if ($self->{nc} == 0x0027) { # '
3019    
3020     $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
3021    
3022     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3023     $self->{line_prev} = $self->{line};
3024     $self->{column_prev} = $self->{column};
3025     $self->{column}++;
3026     $self->{nc}
3027     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3028     } else {
3029     $self->{set_nc}->($self);
3030     }
3031    
3032     redo A;
3033     } elsif ($self->{nc} == 0x003E) { # >
3034    
3035     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');
3036    
3037     $self->{state} = DATA_STATE;
3038    
3039     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3040     $self->{line_prev} = $self->{line};
3041     $self->{column_prev} = $self->{column};
3042     $self->{column}++;
3043     $self->{nc}
3044     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3045     } else {
3046     $self->{set_nc}->($self);
3047     }
3048    
3049    
3050     $self->{ct}->{quirks} = 1;
3051     return ($self->{ct}); # DOCTYPE
3052    
3053     redo A;
3054     } elsif ($self->{nc} == -1) {
3055    
3056     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');
3057    
3058     $self->{state} = DATA_STATE;
3059     ## reconsume
3060    
3061     $self->{ct}->{quirks} = 1;
3062     return ($self->{ct}); # DOCTYPE
3063    
3064     redo A;
3065     } else {
3066    
3067     $self->{ct}->{pubid} # DOCTYPE
3068     .= chr $self->{nc};
3069     $self->{read_until}->($self->{ct}->{pubid}, q['>],
3070     length $self->{ct}->{pubid});
3071    
3072     ## Stay in the state
3073    
3074     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3075     $self->{line_prev} = $self->{line};
3076     $self->{column_prev} = $self->{column};
3077     $self->{column}++;
3078     $self->{nc}
3079     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3080     } else {
3081     $self->{set_nc}->($self);
3082     }
3083    
3084     redo A;
3085     }
3086     } elsif ($self->{state} == AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE) {
3087     if ($is_space->{$self->{nc}}) {
3088    
3089     ## Stay in the state
3090    
3091     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3092     $self->{line_prev} = $self->{line};
3093     $self->{column_prev} = $self->{column};
3094     $self->{column}++;
3095     $self->{nc}
3096     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3097     } else {
3098     $self->{set_nc}->($self);
3099     }
3100    
3101     redo A;
3102     } elsif ($self->{nc} == 0x0022) { # "
3103    
3104     $self->{ct}->{sysid} = ''; # DOCTYPE
3105     $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
3106    
3107     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3108     $self->{line_prev} = $self->{line};
3109     $self->{column_prev} = $self->{column};
3110     $self->{column}++;
3111     $self->{nc}
3112     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3113     } else {
3114     $self->{set_nc}->($self);
3115     }
3116    
3117     redo A;
3118     } elsif ($self->{nc} == 0x0027) { # '
3119    
3120     $self->{ct}->{sysid} = ''; # DOCTYPE
3121     $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
3122    
3123     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3124     $self->{line_prev} = $self->{line};
3125     $self->{column_prev} = $self->{column};
3126     $self->{column}++;
3127     $self->{nc}
3128     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3129     } else {
3130     $self->{set_nc}->($self);
3131     }
3132    
3133     redo A;
3134     } elsif ($self->{nc} == 0x003E) { # >
3135    
3136     $self->{state} = DATA_STATE;
3137    
3138     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3139     $self->{line_prev} = $self->{line};
3140     $self->{column_prev} = $self->{column};
3141     $self->{column}++;
3142     $self->{nc}
3143     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3144     } else {
3145     $self->{set_nc}->($self);
3146     }
3147    
3148    
3149     return ($self->{ct}); # DOCTYPE
3150    
3151     redo A;
3152     } elsif ($self->{nc} == -1) {
3153    
3154     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
3155    
3156     $self->{state} = DATA_STATE;
3157     ## reconsume
3158    
3159     $self->{ct}->{quirks} = 1;
3160     return ($self->{ct}); # DOCTYPE
3161    
3162     redo A;
3163     } else {
3164    
3165     $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after PUBLIC literal');
3166     $self->{ct}->{quirks} = 1;
3167    
3168     $self->{state} = BOGUS_DOCTYPE_STATE;
3169    
3170     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3171     $self->{line_prev} = $self->{line};
3172     $self->{column_prev} = $self->{column};
3173     $self->{column}++;
3174     $self->{nc}
3175     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3176     } else {
3177     $self->{set_nc}->($self);
3178     }
3179    
3180     redo A;
3181     }
3182     } elsif ($self->{state} == BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
3183     if ($is_space->{$self->{nc}}) {
3184    
3185     ## Stay in the state
3186    
3187     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3188     $self->{line_prev} = $self->{line};
3189     $self->{column_prev} = $self->{column};
3190     $self->{column}++;
3191     $self->{nc}
3192     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3193     } else {
3194     $self->{set_nc}->($self);
3195     }
3196    
3197     redo A;
3198     } elsif ($self->{nc} == 0x0022) { # "
3199    
3200     $self->{ct}->{sysid} = ''; # DOCTYPE
3201     $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
3202    
3203     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3204     $self->{line_prev} = $self->{line};
3205     $self->{column_prev} = $self->{column};
3206     $self->{column}++;
3207     $self->{nc}
3208     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3209     } else {
3210     $self->{set_nc}->($self);
3211     }
3212    
3213     redo A;
3214     } elsif ($self->{nc} == 0x0027) { # '
3215    
3216     $self->{ct}->{sysid} = ''; # DOCTYPE
3217     $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
3218    
3219     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3220     $self->{line_prev} = $self->{line};
3221     $self->{column_prev} = $self->{column};
3222     $self->{column}++;
3223     $self->{nc}
3224     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3225     } else {
3226     $self->{set_nc}->($self);
3227     }
3228    
3229     redo A;
3230     } elsif ($self->{nc} == 0x003E) { # >
3231    
3232     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal');
3233     $self->{state} = DATA_STATE;
3234    
3235     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3236     $self->{line_prev} = $self->{line};
3237     $self->{column_prev} = $self->{column};
3238     $self->{column}++;
3239     $self->{nc}
3240     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3241     } else {
3242     $self->{set_nc}->($self);
3243     }
3244    
3245    
3246     $self->{ct}->{quirks} = 1;
3247     return ($self->{ct}); # DOCTYPE
3248    
3249     redo A;
3250     } elsif ($self->{nc} == -1) {
3251    
3252     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
3253    
3254     $self->{state} = DATA_STATE;
3255     ## reconsume
3256    
3257     $self->{ct}->{quirks} = 1;
3258     return ($self->{ct}); # DOCTYPE
3259    
3260     redo A;
3261     } else {
3262    
3263     $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after SYSTEM');
3264     $self->{ct}->{quirks} = 1;
3265    
3266     $self->{state} = BOGUS_DOCTYPE_STATE;
3267    
3268     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3269     $self->{line_prev} = $self->{line};
3270     $self->{column_prev} = $self->{column};
3271     $self->{column}++;
3272     $self->{nc}
3273     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3274     } else {
3275     $self->{set_nc}->($self);
3276     }
3277    
3278     redo A;
3279     }
3280     } elsif ($self->{state} == DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE) {
3281     if ($self->{nc} == 0x0022) { # "
3282    
3283     $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
3284    
3285     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3286     $self->{line_prev} = $self->{line};
3287     $self->{column_prev} = $self->{column};
3288     $self->{column}++;
3289     $self->{nc}
3290     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3291     } else {
3292     $self->{set_nc}->($self);
3293     }
3294    
3295     redo A;
3296     } elsif ($self->{nc} == 0x003E) { # >
3297    
3298     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');
3299    
3300     $self->{state} = DATA_STATE;
3301    
3302     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3303     $self->{line_prev} = $self->{line};
3304     $self->{column_prev} = $self->{column};
3305     $self->{column}++;
3306     $self->{nc}
3307     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3308     } else {
3309     $self->{set_nc}->($self);
3310     }
3311    
3312    
3313     $self->{ct}->{quirks} = 1;
3314     return ($self->{ct}); # DOCTYPE
3315    
3316     redo A;
3317     } elsif ($self->{nc} == -1) {
3318    
3319     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');
3320    
3321     $self->{state} = DATA_STATE;
3322     ## reconsume
3323    
3324     $self->{ct}->{quirks} = 1;
3325     return ($self->{ct}); # DOCTYPE
3326    
3327     redo A;
3328     } else {
3329    
3330     $self->{ct}->{sysid} # DOCTYPE
3331     .= chr $self->{nc};
3332     $self->{read_until}->($self->{ct}->{sysid}, q[">],
3333     length $self->{ct}->{sysid});
3334    
3335     ## Stay in the state
3336    
3337     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3338     $self->{line_prev} = $self->{line};
3339     $self->{column_prev} = $self->{column};
3340     $self->{column}++;
3341     $self->{nc}
3342     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3343     } else {
3344     $self->{set_nc}->($self);
3345     }
3346    
3347     redo A;
3348     }
3349     } elsif ($self->{state} == DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE) {
3350     if ($self->{nc} == 0x0027) { # '
3351    
3352     $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
3353    
3354     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3355     $self->{line_prev} = $self->{line};
3356     $self->{column_prev} = $self->{column};
3357     $self->{column}++;
3358     $self->{nc}
3359     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3360     } else {
3361     $self->{set_nc}->($self);
3362     }
3363    
3364     redo A;
3365     } elsif ($self->{nc} == 0x003E) { # >
3366    
3367     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');
3368    
3369     $self->{state} = DATA_STATE;
3370    
3371     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3372     $self->{line_prev} = $self->{line};
3373     $self->{column_prev} = $self->{column};
3374     $self->{column}++;
3375     $self->{nc}
3376     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3377     } else {
3378     $self->{set_nc}->($self);
3379     }
3380    
3381    
3382     $self->{ct}->{quirks} = 1;
3383     return ($self->{ct}); # DOCTYPE
3384    
3385     redo A;
3386     } elsif ($self->{nc} == -1) {
3387    
3388     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');
3389    
3390     $self->{state} = DATA_STATE;
3391     ## reconsume
3392    
3393     $self->{ct}->{quirks} = 1;
3394     return ($self->{ct}); # DOCTYPE
3395    
3396     redo A;
3397     } else {
3398    
3399     $self->{ct}->{sysid} # DOCTYPE
3400     .= chr $self->{nc};
3401     $self->{read_until}->($self->{ct}->{sysid}, q['>],
3402     length $self->{ct}->{sysid});
3403    
3404     ## Stay in the state
3405    
3406     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3407     $self->{line_prev} = $self->{line};
3408     $self->{column_prev} = $self->{column};
3409     $self->{column}++;
3410     $self->{nc}
3411     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3412     } else {
3413     $self->{set_nc}->($self);
3414     }
3415    
3416     redo A;
3417     }
3418     } elsif ($self->{state} == AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
3419     if ($is_space->{$self->{nc}}) {
3420    
3421     ## Stay in the state
3422    
3423     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3424     $self->{line_prev} = $self->{line};
3425     $self->{column_prev} = $self->{column};
3426     $self->{column}++;
3427     $self->{nc}
3428     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3429     } else {
3430     $self->{set_nc}->($self);
3431     }
3432    
3433     redo A;
3434     } elsif ($self->{nc} == 0x003E) { # >
3435    
3436     $self->{state} = DATA_STATE;
3437    
3438     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3439     $self->{line_prev} = $self->{line};
3440     $self->{column_prev} = $self->{column};
3441     $self->{column}++;
3442     $self->{nc}
3443     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3444     } else {
3445     $self->{set_nc}->($self);
3446     }
3447    
3448    
3449     return ($self->{ct}); # DOCTYPE
3450    
3451     redo A;
3452     } elsif ($self->{nc} == -1) {
3453    
3454     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
3455     $self->{state} = DATA_STATE;
3456     ## reconsume
3457    
3458     $self->{ct}->{quirks} = 1;
3459     return ($self->{ct}); # DOCTYPE
3460    
3461     redo A;
3462     } else {
3463    
3464     $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after SYSTEM literal');
3465     #$self->{ct}->{quirks} = 1;
3466    
3467     $self->{state} = BOGUS_DOCTYPE_STATE;
3468    
3469     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3470     $self->{line_prev} = $self->{line};
3471     $self->{column_prev} = $self->{column};
3472     $self->{column}++;
3473     $self->{nc}
3474     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3475     } else {
3476     $self->{set_nc}->($self);
3477     }
3478    
3479     redo A;
3480     }
3481     } elsif ($self->{state} == BOGUS_DOCTYPE_STATE) {
3482     if ($self->{nc} == 0x003E) { # >
3483    
3484     $self->{state} = DATA_STATE;
3485    
3486     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3487     $self->{line_prev} = $self->{line};
3488     $self->{column_prev} = $self->{column};
3489     $self->{column}++;
3490     $self->{nc}
3491     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3492     } else {
3493     $self->{set_nc}->($self);
3494     }
3495    
3496    
3497     return ($self->{ct}); # DOCTYPE
3498    
3499     redo A;
3500     } elsif ($self->{nc} == -1) {
3501    
3502     $self->{state} = DATA_STATE;
3503     ## reconsume
3504    
3505     return ($self->{ct}); # DOCTYPE
3506    
3507     redo A;
3508     } else {
3509    
3510     my $s = '';
3511     $self->{read_until}->($s, q[>], 0);
3512    
3513     ## Stay in the state
3514    
3515     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3516     $self->{line_prev} = $self->{line};
3517     $self->{column_prev} = $self->{column};
3518     $self->{column}++;
3519     $self->{nc}
3520     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3521     } else {
3522     $self->{set_nc}->($self);
3523     }
3524    
3525     redo A;
3526     }
3527     } elsif ($self->{state} == CDATA_SECTION_STATE) {
3528     ## NOTE: "CDATA section state" in the state is jointly implemented
3529     ## by three states, |CDATA_SECTION_STATE|, |CDATA_SECTION_MSE1_STATE|,
3530     ## and |CDATA_SECTION_MSE2_STATE|.
3531    
3532     if ($self->{nc} == 0x005D) { # ]
3533    
3534     $self->{state} = CDATA_SECTION_MSE1_STATE;
3535    
3536     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3537     $self->{line_prev} = $self->{line};
3538     $self->{column_prev} = $self->{column};
3539     $self->{column}++;
3540     $self->{nc}
3541     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3542     } else {
3543     $self->{set_nc}->($self);
3544     }
3545    
3546     redo A;
3547     } elsif ($self->{nc} == -1) {
3548     $self->{state} = DATA_STATE;
3549    
3550     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3551     $self->{line_prev} = $self->{line};
3552     $self->{column_prev} = $self->{column};
3553     $self->{column}++;
3554     $self->{nc}
3555     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3556     } else {
3557     $self->{set_nc}->($self);
3558     }
3559    
3560     if (length $self->{ct}->{data}) { # character
3561    
3562     return ($self->{ct}); # character
3563     } else {
3564    
3565     ## No token to emit. $self->{ct} is discarded.
3566     }
3567     redo A;
3568     } else {
3569    
3570     $self->{ct}->{data} .= chr $self->{nc};
3571     $self->{read_until}->($self->{ct}->{data},
3572     q<]>,
3573     length $self->{ct}->{data});
3574    
3575     ## Stay in the state.
3576    
3577     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3578     $self->{line_prev} = $self->{line};
3579     $self->{column_prev} = $self->{column};
3580     $self->{column}++;
3581     $self->{nc}
3582     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3583     } else {
3584     $self->{set_nc}->($self);
3585     }
3586    
3587     redo A;
3588     }
3589    
3590     ## ISSUE: "text tokens" in spec.
3591     } elsif ($self->{state} == CDATA_SECTION_MSE1_STATE) {
3592     if ($self->{nc} == 0x005D) { # ]
3593    
3594     $self->{state} = CDATA_SECTION_MSE2_STATE;
3595    
3596     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3597     $self->{line_prev} = $self->{line};
3598     $self->{column_prev} = $self->{column};
3599     $self->{column}++;
3600     $self->{nc}
3601     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3602     } else {
3603     $self->{set_nc}->($self);
3604     }
3605    
3606     redo A;
3607     } else {
3608    
3609     $self->{ct}->{data} .= ']';
3610     $self->{state} = CDATA_SECTION_STATE;
3611     ## Reconsume.
3612     redo A;
3613     }
3614     } elsif ($self->{state} == CDATA_SECTION_MSE2_STATE) {
3615     if ($self->{nc} == 0x003E) { # >
3616     $self->{state} = DATA_STATE;
3617    
3618     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3619     $self->{line_prev} = $self->{line};
3620     $self->{column_prev} = $self->{column};
3621     $self->{column}++;
3622     $self->{nc}
3623     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3624     } else {
3625     $self->{set_nc}->($self);
3626     }
3627    
3628     if (length $self->{ct}->{data}) { # character
3629    
3630     return ($self->{ct}); # character
3631     } else {
3632    
3633     ## No token to emit. $self->{ct} is discarded.
3634     }
3635     redo A;
3636     } elsif ($self->{nc} == 0x005D) { # ]
3637     # character
3638     $self->{ct}->{data} .= ']'; ## Add first "]" of "]]]".
3639     ## Stay in the state.
3640    
3641     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3642     $self->{line_prev} = $self->{line};
3643     $self->{column_prev} = $self->{column};
3644     $self->{column}++;
3645     $self->{nc}
3646     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3647     } else {
3648     $self->{set_nc}->($self);
3649     }
3650    
3651     redo A;
3652     } else {
3653    
3654     $self->{ct}->{data} .= ']]'; # character
3655     $self->{state} = CDATA_SECTION_STATE;
3656     ## Reconsume.
3657     redo A;
3658     }
3659     } elsif ($self->{state} == ENTITY_STATE) {
3660     if ($is_space->{$self->{nc}} or
3661     {
3662     0x003C => 1, 0x0026 => 1, -1 => 1, # <, &
3663     $self->{entity_add} => 1,
3664     }->{$self->{nc}}) {
3665    
3666     ## Don't consume
3667     ## No error
3668     ## Return nothing.
3669     #
3670     } elsif ($self->{nc} == 0x0023) { # #
3671    
3672     $self->{state} = ENTITY_HASH_STATE;
3673     $self->{s_kwd} = '#';
3674    
3675     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3676     $self->{line_prev} = $self->{line};
3677     $self->{column_prev} = $self->{column};
3678     $self->{column}++;
3679     $self->{nc}
3680     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3681     } else {
3682     $self->{set_nc}->($self);
3683     }
3684    
3685     redo A;
3686     } elsif ((0x0041 <= $self->{nc} and
3687     $self->{nc} <= 0x005A) or # A..Z
3688     (0x0061 <= $self->{nc} and
3689     $self->{nc} <= 0x007A)) { # a..z
3690    
3691     require Whatpm::_NamedEntityList;
3692     $self->{state} = ENTITY_NAME_STATE;
3693     $self->{s_kwd} = chr $self->{nc};
3694     $self->{entity__value} = $self->{s_kwd};
3695     $self->{entity__match} = 0;
3696    
3697     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3698     $self->{line_prev} = $self->{line};
3699     $self->{column_prev} = $self->{column};
3700     $self->{column}++;
3701     $self->{nc}
3702     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3703     } else {
3704     $self->{set_nc}->($self);
3705     }
3706    
3707     redo A;
3708     } else {
3709    
3710     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare ero');
3711     ## Return nothing.
3712     #
3713     }
3714    
3715     ## NOTE: No character is consumed by the "consume a character
3716     ## reference" algorithm. In other word, there is an "&" character
3717     ## that does not introduce a character reference, which would be
3718     ## appended to the parent element or the attribute value in later
3719     ## process of the tokenizer.
3720    
3721     if ($self->{prev_state} == DATA_STATE) {
3722    
3723     $self->{state} = $self->{prev_state};
3724     ## Reconsume.
3725     return ({type => CHARACTER_TOKEN, data => '&',
3726     line => $self->{line_prev},
3727     column => $self->{column_prev},
3728     });
3729     redo A;
3730     } else {
3731    
3732     $self->{ca}->{value} .= '&';
3733     $self->{state} = $self->{prev_state};
3734     ## Reconsume.
3735     redo A;
3736     }
3737     } elsif ($self->{state} == ENTITY_HASH_STATE) {
3738     if ($self->{nc} == 0x0078 or # x
3739     $self->{nc} == 0x0058) { # X
3740    
3741     $self->{state} = HEXREF_X_STATE;
3742     $self->{s_kwd} .= chr $self->{nc};
3743    
3744     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3745     $self->{line_prev} = $self->{line};
3746     $self->{column_prev} = $self->{column};
3747     $self->{column}++;
3748     $self->{nc}
3749     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3750     } else {
3751     $self->{set_nc}->($self);
3752     }
3753    
3754     redo A;
3755     } elsif (0x0030 <= $self->{nc} and
3756     $self->{nc} <= 0x0039) { # 0..9
3757    
3758     $self->{state} = NCR_NUM_STATE;
3759     $self->{s_kwd} = $self->{nc} - 0x0030;
3760    
3761     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3762     $self->{line_prev} = $self->{line};
3763     $self->{column_prev} = $self->{column};
3764     $self->{column}++;
3765     $self->{nc}
3766     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3767     } else {
3768     $self->{set_nc}->($self);
3769     }
3770    
3771     redo A;
3772     } else {
3773     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare nero',
3774     line => $self->{line_prev},
3775     column => $self->{column_prev} - 1);
3776    
3777     ## NOTE: According to the spec algorithm, nothing is returned,
3778     ## and then "&#" is appended to the parent element or the attribute
3779     ## value in the later processing.
3780    
3781     if ($self->{prev_state} == DATA_STATE) {
3782    
3783     $self->{state} = $self->{prev_state};
3784     ## Reconsume.
3785     return ({type => CHARACTER_TOKEN,
3786     data => '&#',
3787     line => $self->{line_prev},
3788     column => $self->{column_prev} - 1,
3789     });
3790     redo A;
3791     } else {
3792    
3793     $self->{ca}->{value} .= '&#';
3794     $self->{state} = $self->{prev_state};
3795     ## Reconsume.
3796     redo A;
3797     }
3798     }
3799     } elsif ($self->{state} == NCR_NUM_STATE) {
3800     if (0x0030 <= $self->{nc} and
3801     $self->{nc} <= 0x0039) { # 0..9
3802    
3803     $self->{s_kwd} *= 10;
3804     $self->{s_kwd} += $self->{nc} - 0x0030;
3805    
3806     ## Stay in the state.
3807    
3808     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3809     $self->{line_prev} = $self->{line};
3810     $self->{column_prev} = $self->{column};
3811     $self->{column}++;
3812     $self->{nc}
3813     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3814     } else {
3815     $self->{set_nc}->($self);
3816     }
3817    
3818     redo A;
3819     } elsif ($self->{nc} == 0x003B) { # ;
3820    
3821    
3822     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3823     $self->{line_prev} = $self->{line};
3824     $self->{column_prev} = $self->{column};
3825     $self->{column}++;
3826     $self->{nc}
3827     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3828     } else {
3829     $self->{set_nc}->($self);
3830     }
3831    
3832     #
3833     } else {
3834    
3835     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no refc');
3836     ## Reconsume.
3837     #
3838     }
3839    
3840     my $code = $self->{s_kwd};
3841     my $l = $self->{line_prev};
3842     my $c = $self->{column_prev};
3843     if ($charref_map->{$code}) {
3844    
3845     $self->{parse_error}->(level => $self->{level}->{must}, type => 'invalid character reference',
3846     text => (sprintf 'U+%04X', $code),
3847     line => $l, column => $c);
3848     $code = $charref_map->{$code};
3849     } elsif ($code > 0x10FFFF) {
3850    
3851     $self->{parse_error}->(level => $self->{level}->{must}, type => 'invalid character reference',
3852     text => (sprintf 'U-%08X', $code),
3853     line => $l, column => $c);
3854     $code = 0xFFFD;
3855     }
3856    
3857     if ($self->{prev_state} == DATA_STATE) {
3858    
3859     $self->{state} = $self->{prev_state};
3860     ## Reconsume.
3861     return ({type => CHARACTER_TOKEN, data => chr $code,
3862     line => $l, column => $c,
3863     });
3864     redo A;
3865     } else {
3866    
3867     $self->{ca}->{value} .= chr $code;
3868     $self->{ca}->{has_reference} = 1;
3869     $self->{state} = $self->{prev_state};
3870     ## Reconsume.
3871     redo A;
3872     }
3873     } elsif ($self->{state} == HEXREF_X_STATE) {
3874     if ((0x0030 <= $self->{nc} and $self->{nc} <= 0x0039) or
3875     (0x0041 <= $self->{nc} and $self->{nc} <= 0x0046) or
3876     (0x0061 <= $self->{nc} and $self->{nc} <= 0x0066)) {
3877     # 0..9, A..F, a..f
3878    
3879     $self->{state} = HEXREF_HEX_STATE;
3880     $self->{s_kwd} = 0;
3881     ## Reconsume.
3882     redo A;
3883     } else {
3884     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare hcro',
3885     line => $self->{line_prev},
3886     column => $self->{column_prev} - 2);
3887    
3888     ## NOTE: According to the spec algorithm, nothing is returned,
3889     ## and then "&#" followed by "X" or "x" is appended to the parent
3890     ## element or the attribute value in the later processing.
3891    
3892     if ($self->{prev_state} == DATA_STATE) {
3893    
3894     $self->{state} = $self->{prev_state};
3895     ## Reconsume.
3896     return ({type => CHARACTER_TOKEN,
3897     data => '&' . $self->{s_kwd},
3898     line => $self->{line_prev},
3899     column => $self->{column_prev} - length $self->{s_kwd},
3900     });
3901     redo A;
3902     } else {
3903    
3904     $self->{ca}->{value} .= '&' . $self->{s_kwd};
3905     $self->{state} = $self->{prev_state};
3906     ## Reconsume.
3907     redo A;
3908     }
3909     }
3910     } elsif ($self->{state} == HEXREF_HEX_STATE) {
3911     if (0x0030 <= $self->{nc} and $self->{nc} <= 0x0039) {
3912     # 0..9
3913    
3914     $self->{s_kwd} *= 0x10;
3915     $self->{s_kwd} += $self->{nc} - 0x0030;
3916     ## Stay in the state.
3917    
3918     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3919     $self->{line_prev} = $self->{line};
3920     $self->{column_prev} = $self->{column};
3921     $self->{column}++;
3922     $self->{nc}
3923     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3924     } else {
3925     $self->{set_nc}->($self);
3926     }
3927    
3928     redo A;
3929     } elsif (0x0061 <= $self->{nc} and
3930     $self->{nc} <= 0x0066) { # a..f
3931    
3932     $self->{s_kwd} *= 0x10;
3933     $self->{s_kwd} += $self->{nc} - 0x0060 + 9;
3934     ## Stay in the state.
3935    
3936     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3937     $self->{line_prev} = $self->{line};
3938     $self->{column_prev} = $self->{column};
3939     $self->{column}++;
3940     $self->{nc}
3941     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3942     } else {
3943     $self->{set_nc}->($self);
3944     }
3945    
3946     redo A;
3947     } elsif (0x0041 <= $self->{nc} and
3948     $self->{nc} <= 0x0046) { # A..F
3949    
3950     $self->{s_kwd} *= 0x10;
3951     $self->{s_kwd} += $self->{nc} - 0x0040 + 9;
3952     ## Stay in the state.
3953    
3954     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3955     $self->{line_prev} = $self->{line};
3956     $self->{column_prev} = $self->{column};
3957     $self->{column}++;
3958     $self->{nc}
3959     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3960     } else {
3961     $self->{set_nc}->($self);
3962     }
3963    
3964     redo A;
3965     } elsif ($self->{nc} == 0x003B) { # ;
3966    
3967    
3968     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3969     $self->{line_prev} = $self->{line};
3970     $self->{column_prev} = $self->{column};
3971     $self->{column}++;
3972     $self->{nc}
3973     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3974     } else {
3975     $self->{set_nc}->($self);
3976     }
3977    
3978     #
3979     } else {
3980    
3981     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no refc',
3982     line => $self->{line},
3983     column => $self->{column});
3984     ## Reconsume.
3985     #
3986     }
3987    
3988     my $code = $self->{s_kwd};
3989     my $l = $self->{line_prev};
3990     my $c = $self->{column_prev};
3991     if ($charref_map->{$code}) {
3992    
3993     $self->{parse_error}->(level => $self->{level}->{must}, type => 'invalid character reference',
3994     text => (sprintf 'U+%04X', $code),
3995     line => $l, column => $c);
3996     $code = $charref_map->{$code};
3997     } elsif ($code > 0x10FFFF) {
3998    
3999     $self->{parse_error}->(level => $self->{level}->{must}, type => 'invalid character reference',
4000     text => (sprintf 'U-%08X', $code),
4001     line => $l, column => $c);
4002     $code = 0xFFFD;
4003     }
4004    
4005     if ($self->{prev_state} == DATA_STATE) {
4006    
4007     $self->{state} = $self->{prev_state};
4008     ## Reconsume.
4009     return ({type => CHARACTER_TOKEN, data => chr $code,
4010     line => $l, column => $c,
4011     });
4012     redo A;
4013     } else {
4014    
4015     $self->{ca}->{value} .= chr $code;
4016     $self->{ca}->{has_reference} = 1;
4017     $self->{state} = $self->{prev_state};
4018     ## Reconsume.
4019     redo A;
4020     }
4021     } elsif ($self->{state} == ENTITY_NAME_STATE) {
4022     if (length $self->{s_kwd} < 30 and
4023     ## NOTE: Some number greater than the maximum length of entity name
4024     ((0x0041 <= $self->{nc} and # a
4025     $self->{nc} <= 0x005A) or # x
4026     (0x0061 <= $self->{nc} and # a
4027     $self->{nc} <= 0x007A) or # z
4028     (0x0030 <= $self->{nc} and # 0
4029     $self->{nc} <= 0x0039) or # 9
4030     $self->{nc} == 0x003B)) { # ;
4031     our $EntityChar;
4032     $self->{s_kwd} .= chr $self->{nc};
4033     if (defined $EntityChar->{$self->{s_kwd}}) {
4034     if ($self->{nc} == 0x003B) { # ;
4035    
4036     $self->{entity__value} = $EntityChar->{$self->{s_kwd}};
4037     $self->{entity__match} = 1;
4038    
4039     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4040     $self->{line_prev} = $self->{line};
4041     $self->{column_prev} = $self->{column};
4042     $self->{column}++;
4043     $self->{nc}
4044     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4045     } else {
4046     $self->{set_nc}->($self);
4047     }
4048    
4049     #
4050     } else {
4051    
4052     $self->{entity__value} = $EntityChar->{$self->{s_kwd}};
4053     $self->{entity__match} = -1;
4054     ## Stay in the state.
4055    
4056     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4057     $self->{line_prev} = $self->{line};
4058     $self->{column_prev} = $self->{column};
4059     $self->{column}++;
4060     $self->{nc}
4061     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4062     } else {
4063     $self->{set_nc}->($self);
4064     }
4065    
4066     redo A;
4067     }
4068     } else {
4069    
4070     $self->{entity__value} .= chr $self->{nc};
4071     $self->{entity__match} *= 2;
4072     ## Stay in the state.
4073    
4074     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4075     $self->{line_prev} = $self->{line};
4076     $self->{column_prev} = $self->{column};
4077     $self->{column}++;
4078     $self->{nc}
4079     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4080     } else {
4081     $self->{set_nc}->($self);
4082     }
4083    
4084     redo A;
4085     }
4086     }
4087    
4088     my $data;
4089     my $has_ref;
4090     if ($self->{entity__match} > 0) {
4091    
4092     $data = $self->{entity__value};
4093     $has_ref = 1;
4094     #
4095     } elsif ($self->{entity__match} < 0) {
4096     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no refc');
4097     if ($self->{prev_state} != DATA_STATE and # in attribute
4098     $self->{entity__match} < -1) {
4099    
4100     $data = '&' . $self->{s_kwd};
4101     #
4102     } else {
4103    
4104     $data = $self->{entity__value};
4105     $has_ref = 1;
4106     #
4107     }
4108     } else {
4109    
4110     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare ero',
4111     line => $self->{line_prev},
4112     column => $self->{column_prev} - length $self->{s_kwd});
4113     $data = '&' . $self->{s_kwd};
4114     #
4115     }
4116    
4117     ## NOTE: In these cases, when a character reference is found,
4118     ## it is consumed and a character token is returned, or, otherwise,
4119     ## nothing is consumed and returned, according to the spec algorithm.
4120     ## In this implementation, anything that has been examined by the
4121     ## tokenizer is appended to the parent element or the attribute value
4122     ## as string, either literal string when no character reference or
4123     ## entity-replaced string otherwise, in this stage, since any characters
4124     ## that would not be consumed are appended in the data state or in an
4125     ## appropriate attribute value state anyway.
4126    
4127     if ($self->{prev_state} == DATA_STATE) {
4128    
4129     $self->{state} = $self->{prev_state};
4130     ## Reconsume.
4131     return ({type => CHARACTER_TOKEN,
4132     data => $data,
4133     line => $self->{line_prev},
4134     column => $self->{column_prev} + 1 - length $self->{s_kwd},
4135     });
4136     redo A;
4137     } else {
4138    
4139     $self->{ca}->{value} .= $data;
4140     $self->{ca}->{has_reference} = 1 if $has_ref;
4141     $self->{state} = $self->{prev_state};
4142     ## Reconsume.
4143     redo A;
4144     }
4145     } else {
4146     die "$0: $self->{state}: Unknown state";
4147     }
4148     } # A
4149    
4150     die "$0: _get_next_token: unexpected case";
4151     } # _get_next_token
4152    
4153     1;
4154 wakaba 1.4 ## $Date: 2008/10/14 05:34:05 $

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24