/[suikacvs]/markup/html/whatpm/Whatpm/HTML/Tokenizer.pm
Suika

Contents of /markup/html/whatpm/Whatpm/HTML/Tokenizer.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.5 - (hide annotations) (download)
Tue Oct 14 14:38:59 2008 UTC (16 years ago) by wakaba
Branch: MAIN
Changes since 1.4: +109 -9 lines
++ whatpm/t/ChangeLog	14 Oct 2008 14:21:51 -0000
	* XML-Parser.t: "xml/texts-1.dat" added.

	* tokenizer-test-2.dat: Test for ]]> are added.

2008-10-14  Wakaba  <wakaba@suika.fam.cx>

++ whatpm/t/xml/ChangeLog	14 Oct 2008 14:38:34 -0000
	* doctypes-1.dat: Wrong results fixed.

	* texts-1.dat: New test data file.

2008-10-14  Wakaba  <wakaba@suika.fam.cx>

++ whatpm/Whatpm/HTML/ChangeLog	14 Oct 2008 14:22:43 -0000
	* Tokenizer.pm.src: Raise a parse error for XML "]]>" other than
	CDATA section end.

2008-10-14  Wakaba  <wakaba@suika.fam.cx>

1 wakaba 1.1 package Whatpm::HTML::Tokenizer;
2     use strict;
3 wakaba 1.5 our $VERSION=do{my @r=(q$Revision: 1.4 $=~/\d+/g);sprintf "%d."."%02d" x $#r,@r};
4 wakaba 1.2
5     BEGIN {
6     require Exporter;
7     push our @ISA, 'Exporter';
8    
9     our @EXPORT_OK = qw(
10     DOCTYPE_TOKEN
11     COMMENT_TOKEN
12     START_TAG_TOKEN
13     END_TAG_TOKEN
14     END_OF_FILE_TOKEN
15     CHARACTER_TOKEN
16     PI_TOKEN
17     ABORT_TOKEN
18     );
19    
20     our %EXPORT_TAGS = (
21     token => [qw(
22     DOCTYPE_TOKEN
23     COMMENT_TOKEN
24     START_TAG_TOKEN
25     END_TAG_TOKEN
26     END_OF_FILE_TOKEN
27     CHARACTER_TOKEN
28     PI_TOKEN
29     ABORT_TOKEN
30     )],
31     );
32     }
33    
34     ## Token types
35    
36     sub DOCTYPE_TOKEN () { 1 }
37     sub COMMENT_TOKEN () { 2 }
38     sub START_TAG_TOKEN () { 3 }
39     sub END_TAG_TOKEN () { 4 }
40     sub END_OF_FILE_TOKEN () { 5 }
41     sub CHARACTER_TOKEN () { 6 }
42     sub PI_TOKEN () { 7 } # XML5
43     sub ABORT_TOKEN () { 8 } # Not a token actually
44 wakaba 1.1
45     package Whatpm::HTML;
46    
47 wakaba 1.2 BEGIN { Whatpm::HTML::Tokenizer->import (':token') }
48    
49 wakaba 1.1 ## Content model flags
50    
51     sub CM_ENTITY () { 0b001 } # & markup in data
52     sub CM_LIMITED_MARKUP () { 0b010 } # < markup in data (limited)
53     sub CM_FULL_MARKUP () { 0b100 } # < markup in data (any)
54    
55     sub PLAINTEXT_CONTENT_MODEL () { 0 }
56     sub CDATA_CONTENT_MODEL () { CM_LIMITED_MARKUP }
57     sub RCDATA_CONTENT_MODEL () { CM_ENTITY | CM_LIMITED_MARKUP }
58     sub PCDATA_CONTENT_MODEL () { CM_ENTITY | CM_FULL_MARKUP }
59    
60     ## Tokenizer states
61    
62     sub DATA_STATE () { 0 }
63     #sub ENTITY_DATA_STATE () { 1 }
64     sub TAG_OPEN_STATE () { 2 }
65     sub CLOSE_TAG_OPEN_STATE () { 3 }
66     sub TAG_NAME_STATE () { 4 }
67     sub BEFORE_ATTRIBUTE_NAME_STATE () { 5 }
68     sub ATTRIBUTE_NAME_STATE () { 6 }
69     sub AFTER_ATTRIBUTE_NAME_STATE () { 7 }
70     sub BEFORE_ATTRIBUTE_VALUE_STATE () { 8 }
71     sub ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE () { 9 }
72     sub ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE () { 10 }
73     sub ATTRIBUTE_VALUE_UNQUOTED_STATE () { 11 }
74     #sub ENTITY_IN_ATTRIBUTE_VALUE_STATE () { 12 }
75     sub MARKUP_DECLARATION_OPEN_STATE () { 13 }
76     sub COMMENT_START_STATE () { 14 }
77     sub COMMENT_START_DASH_STATE () { 15 }
78     sub COMMENT_STATE () { 16 }
79     sub COMMENT_END_STATE () { 17 }
80     sub COMMENT_END_DASH_STATE () { 18 }
81     sub BOGUS_COMMENT_STATE () { 19 }
82     sub DOCTYPE_STATE () { 20 }
83     sub BEFORE_DOCTYPE_NAME_STATE () { 21 }
84     sub DOCTYPE_NAME_STATE () { 22 }
85     sub AFTER_DOCTYPE_NAME_STATE () { 23 }
86     sub BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE () { 24 }
87     sub DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE () { 25 }
88     sub DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE () { 26 }
89     sub AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE () { 27 }
90     sub BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE () { 28 }
91     sub DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE () { 29 }
92     sub DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE () { 30 }
93     sub AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE () { 31 }
94     sub BOGUS_DOCTYPE_STATE () { 32 }
95     sub AFTER_ATTRIBUTE_VALUE_QUOTED_STATE () { 33 }
96     sub SELF_CLOSING_START_TAG_STATE () { 34 }
97     sub CDATA_SECTION_STATE () { 35 }
98     sub MD_HYPHEN_STATE () { 36 } # "markup declaration open state" in the spec
99     sub MD_DOCTYPE_STATE () { 37 } # "markup declaration open state" in the spec
100     sub MD_CDATA_STATE () { 38 } # "markup declaration open state" in the spec
101     sub CDATA_RCDATA_CLOSE_TAG_STATE () { 39 } # "close tag open state" in the spec
102     sub CDATA_SECTION_MSE1_STATE () { 40 } # "CDATA section state" in the spec
103     sub CDATA_SECTION_MSE2_STATE () { 41 } # "CDATA section state" in the spec
104     sub PUBLIC_STATE () { 42 } # "after DOCTYPE name state" in the spec
105     sub SYSTEM_STATE () { 43 } # "after DOCTYPE name state" in the spec
106     ## NOTE: "Entity data state", "entity in attribute value state", and
107     ## "consume a character reference" algorithm are jointly implemented
108     ## using the following six states:
109     sub ENTITY_STATE () { 44 }
110     sub ENTITY_HASH_STATE () { 45 }
111     sub NCR_NUM_STATE () { 46 }
112     sub HEXREF_X_STATE () { 47 }
113     sub HEXREF_HEX_STATE () { 48 }
114     sub ENTITY_NAME_STATE () { 49 }
115     sub PCDATA_STATE () { 50 } # "data state" in the spec
116    
117     ## Tree constructor state constants (see Whatpm::HTML for the full
118     ## list and descriptions)
119    
120     sub IN_FOREIGN_CONTENT_IM () { 0b100000000000 }
121     sub FOREIGN_EL () { 0b1_00000000000 }
122    
123     ## Character reference mappings
124    
125     my $charref_map = {
126     0x0D => 0x000A,
127     0x80 => 0x20AC,
128     0x81 => 0xFFFD,
129     0x82 => 0x201A,
130     0x83 => 0x0192,
131     0x84 => 0x201E,
132     0x85 => 0x2026,
133     0x86 => 0x2020,
134     0x87 => 0x2021,
135     0x88 => 0x02C6,
136     0x89 => 0x2030,
137     0x8A => 0x0160,
138     0x8B => 0x2039,
139     0x8C => 0x0152,
140     0x8D => 0xFFFD,
141     0x8E => 0x017D,
142     0x8F => 0xFFFD,
143     0x90 => 0xFFFD,
144     0x91 => 0x2018,
145     0x92 => 0x2019,
146     0x93 => 0x201C,
147     0x94 => 0x201D,
148     0x95 => 0x2022,
149     0x96 => 0x2013,
150     0x97 => 0x2014,
151     0x98 => 0x02DC,
152     0x99 => 0x2122,
153     0x9A => 0x0161,
154     0x9B => 0x203A,
155     0x9C => 0x0153,
156     0x9D => 0xFFFD,
157     0x9E => 0x017E,
158     0x9F => 0x0178,
159     }; # $charref_map
160     $charref_map->{$_} = 0xFFFD
161     for 0x0000..0x0008, 0x000B, 0x000E..0x001F, 0x007F,
162     0xD800..0xDFFF, 0xFDD0..0xFDDF, ## ISSUE: 0xFDEF
163     0xFFFE, 0xFFFF, 0x1FFFE, 0x1FFFF, 0x2FFFE, 0x2FFFF, 0x3FFFE, 0x3FFFF,
164     0x4FFFE, 0x4FFFF, 0x5FFFE, 0x5FFFF, 0x6FFFE, 0x6FFFF, 0x7FFFE,
165     0x7FFFF, 0x8FFFE, 0x8FFFF, 0x9FFFE, 0x9FFFF, 0xAFFFE, 0xAFFFF,
166     0xBFFFE, 0xBFFFF, 0xCFFFE, 0xCFFFF, 0xDFFFE, 0xDFFFF, 0xEFFFE,
167     0xEFFFF, 0xFFFFE, 0xFFFFF, 0x10FFFE, 0x10FFFF;
168    
169     ## Implementations MUST act as if state machine in the spec
170    
171     sub _initialize_tokenizer ($) {
172     my $self = shift;
173    
174     ## NOTE: Fields set by |new| constructor:
175     #$self->{level}
176     #$self->{set_nc}
177     #$self->{parse_error}
178 wakaba 1.3 #$self->{is_xml} (if XML)
179 wakaba 1.1
180     $self->{state} = DATA_STATE; # MUST
181 wakaba 1.5 $self->{s_kwd} = ''; # state keyword
182 wakaba 1.1 #$self->{entity__value}; # initialized when used
183     #$self->{entity__match}; # initialized when used
184     $self->{content_model} = PCDATA_CONTENT_MODEL; # be
185     undef $self->{ct}; # current token
186     undef $self->{ca}; # current attribute
187     undef $self->{last_stag_name}; # last emitted start tag name
188     #$self->{prev_state}; # initialized when used
189     delete $self->{self_closing};
190     $self->{char_buffer} = '';
191     $self->{char_buffer_pos} = 0;
192     $self->{nc} = -1; # next input character
193     #$self->{next_nc}
194    
195     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
196     $self->{line_prev} = $self->{line};
197     $self->{column_prev} = $self->{column};
198     $self->{column}++;
199     $self->{nc}
200     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
201     } else {
202     $self->{set_nc}->($self);
203     }
204    
205     $self->{token} = [];
206     # $self->{escape}
207     } # _initialize_tokenizer
208    
209     ## A token has:
210     ## ->{type} == DOCTYPE_TOKEN, START_TAG_TOKEN, END_TAG_TOKEN, COMMENT_TOKEN,
211     ## CHARACTER_TOKEN, or END_OF_FILE_TOKEN
212     ## ->{name} (DOCTYPE_TOKEN)
213     ## ->{tag_name} (START_TAG_TOKEN, END_TAG_TOKEN)
214     ## ->{pubid} (DOCTYPE_TOKEN)
215     ## ->{sysid} (DOCTYPE_TOKEN)
216     ## ->{quirks} == 1 or 0 (DOCTYPE_TOKEN): "force-quirks" flag
217     ## ->{attributes} isa HASH (START_TAG_TOKEN, END_TAG_TOKEN)
218     ## ->{name}
219     ## ->{value}
220     ## ->{has_reference} == 1 or 0
221     ## ->{data} (COMMENT_TOKEN, CHARACTER_TOKEN)
222     ## NOTE: The "self-closing flag" is hold as |$self->{self_closing}|.
223     ## |->{self_closing}| is used to save the value of |$self->{self_closing}|
224     ## while the token is pushed back to the stack.
225    
226     ## Emitted token MUST immediately be handled by the tree construction state.
227    
228     ## Before each step, UA MAY check to see if either one of the scripts in
229     ## "list of scripts that will execute as soon as possible" or the first
230     ## script in the "list of scripts that will execute asynchronously",
231     ## has completed loading. If one has, then it MUST be executed
232     ## and removed from the list.
233    
234     ## TODO: Polytheistic slash SHOULD NOT be used. (Applied only to atheists.)
235     ## (This requirement was dropped from HTML5 spec, unfortunately.)
236    
237     my $is_space = {
238     0x0009 => 1, # CHARACTER TABULATION (HT)
239     0x000A => 1, # LINE FEED (LF)
240     #0x000B => 0, # LINE TABULATION (VT)
241     0x000C => 1, # FORM FEED (FF)
242     #0x000D => 1, # CARRIAGE RETURN (CR)
243     0x0020 => 1, # SPACE (SP)
244     };
245    
246     sub _get_next_token ($) {
247     my $self = shift;
248    
249     if ($self->{self_closing}) {
250     $self->{parse_error}->(level => $self->{level}->{must}, type => 'nestc', token => $self->{ct});
251     ## NOTE: The |self_closing| flag is only set by start tag token.
252     ## In addition, when a start tag token is emitted, it is always set to
253     ## |ct|.
254     delete $self->{self_closing};
255     }
256    
257     if (@{$self->{token}}) {
258     $self->{self_closing} = $self->{token}->[0]->{self_closing};
259     return shift @{$self->{token}};
260     }
261    
262     A: {
263     if ($self->{state} == PCDATA_STATE) {
264     ## NOTE: Same as |DATA_STATE|, but only for |PCDATA| content model.
265    
266     if ($self->{nc} == 0x0026) { # &
267    
268     ## NOTE: In the spec, the tokenizer is switched to the
269     ## "entity data state". In this implementation, the tokenizer
270     ## is switched to the |ENTITY_STATE|, which is an implementation
271     ## of the "consume a character reference" algorithm.
272     $self->{entity_add} = -1;
273     $self->{prev_state} = DATA_STATE;
274     $self->{state} = ENTITY_STATE;
275    
276     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
277     $self->{line_prev} = $self->{line};
278     $self->{column_prev} = $self->{column};
279     $self->{column}++;
280     $self->{nc}
281     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
282     } else {
283     $self->{set_nc}->($self);
284     }
285    
286     redo A;
287     } elsif ($self->{nc} == 0x003C) { # <
288    
289     $self->{state} = TAG_OPEN_STATE;
290    
291     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
292     $self->{line_prev} = $self->{line};
293     $self->{column_prev} = $self->{column};
294     $self->{column}++;
295     $self->{nc}
296     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
297     } else {
298     $self->{set_nc}->($self);
299     }
300    
301     redo A;
302     } elsif ($self->{nc} == -1) {
303    
304     return ({type => END_OF_FILE_TOKEN,
305     line => $self->{line}, column => $self->{column}});
306     last A; ## TODO: ok?
307     } else {
308    
309     #
310     }
311    
312     # Anything else
313     my $token = {type => CHARACTER_TOKEN,
314     data => chr $self->{nc},
315     line => $self->{line}, column => $self->{column},
316     };
317     $self->{read_until}->($token->{data}, q[<&], length $token->{data});
318    
319     ## Stay in the state.
320    
321     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
322     $self->{line_prev} = $self->{line};
323     $self->{column_prev} = $self->{column};
324     $self->{column}++;
325     $self->{nc}
326     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
327     } else {
328     $self->{set_nc}->($self);
329     }
330    
331     return ($token);
332     redo A;
333     } elsif ($self->{state} == DATA_STATE) {
334     $self->{s_kwd} = '' unless defined $self->{s_kwd};
335     if ($self->{nc} == 0x0026) { # &
336     $self->{s_kwd} = '';
337     if ($self->{content_model} & CM_ENTITY and # PCDATA | RCDATA
338     not $self->{escape}) {
339    
340     ## NOTE: In the spec, the tokenizer is switched to the
341     ## "entity data state". In this implementation, the tokenizer
342     ## is switched to the |ENTITY_STATE|, which is an implementation
343     ## of the "consume a character reference" algorithm.
344     $self->{entity_add} = -1;
345     $self->{prev_state} = DATA_STATE;
346     $self->{state} = ENTITY_STATE;
347    
348     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
349     $self->{line_prev} = $self->{line};
350     $self->{column_prev} = $self->{column};
351     $self->{column}++;
352     $self->{nc}
353     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
354     } else {
355     $self->{set_nc}->($self);
356     }
357    
358     redo A;
359     } else {
360    
361     #
362     }
363     } elsif ($self->{nc} == 0x002D) { # -
364     if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
365 wakaba 1.5 if ($self->{s_kwd} eq '<!-') {
366 wakaba 1.1
367     $self->{escape} = 1; # unless $self->{escape};
368     $self->{s_kwd} = '--';
369     #
370 wakaba 1.5 } elsif ($self->{s_kwd} eq '-') {
371 wakaba 1.1
372     $self->{s_kwd} = '--';
373     #
374 wakaba 1.5 } elsif ($self->{s_kwd} eq '<!' or $self->{s_kwd} eq '-') {
375    
376     $self->{s_kwd} .= '-';
377     #
378 wakaba 1.1 } else {
379    
380 wakaba 1.5 $self->{s_kwd} = '-';
381 wakaba 1.1 #
382     }
383     }
384    
385     #
386     } elsif ($self->{nc} == 0x0021) { # !
387     if (length $self->{s_kwd}) {
388    
389     $self->{s_kwd} .= '!';
390     #
391     } else {
392    
393     #$self->{s_kwd} = '';
394     #
395     }
396     #
397     } elsif ($self->{nc} == 0x003C) { # <
398     if ($self->{content_model} & CM_FULL_MARKUP or # PCDATA
399     (($self->{content_model} & CM_LIMITED_MARKUP) and # CDATA | RCDATA
400     not $self->{escape})) {
401    
402     $self->{state} = TAG_OPEN_STATE;
403    
404     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
405     $self->{line_prev} = $self->{line};
406     $self->{column_prev} = $self->{column};
407     $self->{column}++;
408     $self->{nc}
409     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
410     } else {
411     $self->{set_nc}->($self);
412     }
413    
414     redo A;
415     } else {
416    
417     $self->{s_kwd} = '';
418     #
419     }
420     } elsif ($self->{nc} == 0x003E) { # >
421     if ($self->{escape} and
422     ($self->{content_model} & CM_LIMITED_MARKUP)) { # RCDATA | CDATA
423     if ($self->{s_kwd} eq '--') {
424    
425     delete $self->{escape};
426 wakaba 1.5 #
427 wakaba 1.1 } else {
428    
429 wakaba 1.5 #
430 wakaba 1.1 }
431 wakaba 1.5 } elsif ($self->{is_xml} and $self->{s_kwd} eq ']]') {
432    
433     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unmatched mse', ## TODO: type
434     line => $self->{line_prev},
435     column => $self->{column_prev} - 1);
436     #
437 wakaba 1.1 } else {
438    
439 wakaba 1.5 #
440 wakaba 1.1 }
441    
442     $self->{s_kwd} = '';
443     #
444 wakaba 1.5 } elsif ($self->{nc} == 0x005D) { # ]
445     if ($self->{s_kwd} eq ']' or $self->{s_kwd} eq '') {
446    
447     $self->{s_kwd} .= ']';
448     } elsif ($self->{s_kwd} eq ']]') {
449    
450     #
451     } else {
452    
453     $self->{s_kwd} = '';
454     }
455     #
456 wakaba 1.1 } elsif ($self->{nc} == -1) {
457    
458     $self->{s_kwd} = '';
459     return ({type => END_OF_FILE_TOKEN,
460     line => $self->{line}, column => $self->{column}});
461     last A; ## TODO: ok?
462     } else {
463    
464     $self->{s_kwd} = '';
465     #
466     }
467    
468     # Anything else
469     my $token = {type => CHARACTER_TOKEN,
470     data => chr $self->{nc},
471     line => $self->{line}, column => $self->{column},
472     };
473 wakaba 1.5 if ($self->{read_until}->($token->{data}, q{-!<>&\]},
474 wakaba 1.1 length $token->{data})) {
475     $self->{s_kwd} = '';
476     }
477    
478     ## Stay in the data state.
479 wakaba 1.5 if (not $self->{is_xml} and
480     $self->{content_model} == PCDATA_CONTENT_MODEL) {
481 wakaba 1.1
482     $self->{state} = PCDATA_STATE;
483     } else {
484    
485     ## Stay in the state.
486     }
487    
488     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
489     $self->{line_prev} = $self->{line};
490     $self->{column_prev} = $self->{column};
491     $self->{column}++;
492     $self->{nc}
493     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
494     } else {
495     $self->{set_nc}->($self);
496     }
497    
498     return ($token);
499     redo A;
500     } elsif ($self->{state} == TAG_OPEN_STATE) {
501     if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
502     if ($self->{nc} == 0x002F) { # /
503    
504    
505     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
506     $self->{line_prev} = $self->{line};
507     $self->{column_prev} = $self->{column};
508     $self->{column}++;
509     $self->{nc}
510     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
511     } else {
512     $self->{set_nc}->($self);
513     }
514    
515     $self->{state} = CLOSE_TAG_OPEN_STATE;
516     redo A;
517     } elsif ($self->{nc} == 0x0021) { # !
518    
519     $self->{s_kwd} = '<' unless $self->{escape};
520     #
521     } else {
522    
523     #
524     }
525    
526     ## reconsume
527     $self->{state} = DATA_STATE;
528 wakaba 1.5 $self->{s_kwd} = '';
529 wakaba 1.1 return ({type => CHARACTER_TOKEN, data => '<',
530     line => $self->{line_prev},
531     column => $self->{column_prev},
532     });
533     redo A;
534     } elsif ($self->{content_model} & CM_FULL_MARKUP) { # PCDATA
535     if ($self->{nc} == 0x0021) { # !
536    
537     $self->{state} = MARKUP_DECLARATION_OPEN_STATE;
538    
539     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
540     $self->{line_prev} = $self->{line};
541     $self->{column_prev} = $self->{column};
542     $self->{column}++;
543     $self->{nc}
544     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
545     } else {
546     $self->{set_nc}->($self);
547     }
548    
549     redo A;
550     } elsif ($self->{nc} == 0x002F) { # /
551    
552     $self->{state} = CLOSE_TAG_OPEN_STATE;
553    
554     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
555     $self->{line_prev} = $self->{line};
556     $self->{column_prev} = $self->{column};
557     $self->{column}++;
558     $self->{nc}
559     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
560     } else {
561     $self->{set_nc}->($self);
562     }
563    
564     redo A;
565     } elsif (0x0041 <= $self->{nc} and
566     $self->{nc} <= 0x005A) { # A..Z
567    
568     $self->{ct}
569     = {type => START_TAG_TOKEN,
570 wakaba 1.4 tag_name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
571 wakaba 1.1 line => $self->{line_prev},
572     column => $self->{column_prev}};
573     $self->{state} = TAG_NAME_STATE;
574    
575     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
576     $self->{line_prev} = $self->{line};
577     $self->{column_prev} = $self->{column};
578     $self->{column}++;
579     $self->{nc}
580     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
581     } else {
582     $self->{set_nc}->($self);
583     }
584    
585     redo A;
586     } elsif (0x0061 <= $self->{nc} and
587     $self->{nc} <= 0x007A) { # a..z
588    
589     $self->{ct} = {type => START_TAG_TOKEN,
590     tag_name => chr ($self->{nc}),
591     line => $self->{line_prev},
592     column => $self->{column_prev}};
593     $self->{state} = TAG_NAME_STATE;
594    
595     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
596     $self->{line_prev} = $self->{line};
597     $self->{column_prev} = $self->{column};
598     $self->{column}++;
599     $self->{nc}
600     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
601     } else {
602     $self->{set_nc}->($self);
603     }
604    
605     redo A;
606     } elsif ($self->{nc} == 0x003E) { # >
607    
608     $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty start tag',
609     line => $self->{line_prev},
610     column => $self->{column_prev});
611     $self->{state} = DATA_STATE;
612 wakaba 1.5 $self->{s_kwd} = '';
613 wakaba 1.1
614     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
615     $self->{line_prev} = $self->{line};
616     $self->{column_prev} = $self->{column};
617     $self->{column}++;
618     $self->{nc}
619     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
620     } else {
621     $self->{set_nc}->($self);
622     }
623    
624    
625     return ({type => CHARACTER_TOKEN, data => '<>',
626     line => $self->{line_prev},
627     column => $self->{column_prev},
628     });
629    
630     redo A;
631     } elsif ($self->{nc} == 0x003F) { # ?
632    
633     $self->{parse_error}->(level => $self->{level}->{must}, type => 'pio',
634     line => $self->{line_prev},
635     column => $self->{column_prev});
636     $self->{state} = BOGUS_COMMENT_STATE;
637     $self->{ct} = {type => COMMENT_TOKEN, data => '',
638     line => $self->{line_prev},
639     column => $self->{column_prev},
640     };
641     ## $self->{nc} is intentionally left as is
642     redo A;
643     } else {
644    
645     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare stago',
646     line => $self->{line_prev},
647     column => $self->{column_prev});
648     $self->{state} = DATA_STATE;
649 wakaba 1.5 $self->{s_kwd} = '';
650 wakaba 1.1 ## reconsume
651    
652     return ({type => CHARACTER_TOKEN, data => '<',
653     line => $self->{line_prev},
654     column => $self->{column_prev},
655     });
656    
657     redo A;
658     }
659     } else {
660     die "$0: $self->{content_model} in tag open";
661     }
662     } elsif ($self->{state} == CLOSE_TAG_OPEN_STATE) {
663     ## NOTE: The "close tag open state" in the spec is implemented as
664     ## |CLOSE_TAG_OPEN_STATE| and |CDATA_RCDATA_CLOSE_TAG_STATE|.
665    
666     my ($l, $c) = ($self->{line_prev}, $self->{column_prev} - 1); # "<"of"</"
667     if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
668     if (defined $self->{last_stag_name}) {
669     $self->{state} = CDATA_RCDATA_CLOSE_TAG_STATE;
670     $self->{s_kwd} = '';
671     ## Reconsume.
672     redo A;
673     } else {
674     ## No start tag token has ever been emitted
675     ## NOTE: See <http://krijnhoetmer.nl/irc-logs/whatwg/20070626#l-564>.
676    
677     $self->{state} = DATA_STATE;
678 wakaba 1.5 $self->{s_kwd} = '';
679 wakaba 1.1 ## Reconsume.
680     return ({type => CHARACTER_TOKEN, data => '</',
681     line => $l, column => $c,
682     });
683     redo A;
684     }
685     }
686    
687     if (0x0041 <= $self->{nc} and
688     $self->{nc} <= 0x005A) { # A..Z
689    
690     $self->{ct}
691     = {type => END_TAG_TOKEN,
692 wakaba 1.4 tag_name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
693 wakaba 1.1 line => $l, column => $c};
694     $self->{state} = TAG_NAME_STATE;
695    
696     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
697     $self->{line_prev} = $self->{line};
698     $self->{column_prev} = $self->{column};
699     $self->{column}++;
700     $self->{nc}
701     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
702     } else {
703     $self->{set_nc}->($self);
704     }
705    
706     redo A;
707     } elsif (0x0061 <= $self->{nc} and
708     $self->{nc} <= 0x007A) { # a..z
709    
710     $self->{ct} = {type => END_TAG_TOKEN,
711     tag_name => chr ($self->{nc}),
712     line => $l, column => $c};
713     $self->{state} = TAG_NAME_STATE;
714    
715     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
716     $self->{line_prev} = $self->{line};
717     $self->{column_prev} = $self->{column};
718     $self->{column}++;
719     $self->{nc}
720     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
721     } else {
722     $self->{set_nc}->($self);
723     }
724    
725     redo A;
726     } elsif ($self->{nc} == 0x003E) { # >
727    
728     $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty end tag',
729     line => $self->{line_prev}, ## "<" in "</>"
730     column => $self->{column_prev} - 1);
731     $self->{state} = DATA_STATE;
732 wakaba 1.5 $self->{s_kwd} = '';
733 wakaba 1.1
734     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
735     $self->{line_prev} = $self->{line};
736     $self->{column_prev} = $self->{column};
737     $self->{column}++;
738     $self->{nc}
739     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
740     } else {
741     $self->{set_nc}->($self);
742     }
743    
744     redo A;
745     } elsif ($self->{nc} == -1) {
746    
747     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare etago');
748 wakaba 1.5 $self->{s_kwd} = '';
749 wakaba 1.1 $self->{state} = DATA_STATE;
750     # reconsume
751    
752     return ({type => CHARACTER_TOKEN, data => '</',
753     line => $l, column => $c,
754     });
755    
756     redo A;
757     } else {
758    
759     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus end tag');
760     $self->{state} = BOGUS_COMMENT_STATE;
761     $self->{ct} = {type => COMMENT_TOKEN, data => '',
762     line => $self->{line_prev}, # "<" of "</"
763     column => $self->{column_prev} - 1,
764     };
765     ## NOTE: $self->{nc} is intentionally left as is.
766     ## Although the "anything else" case of the spec not explicitly
767     ## states that the next input character is to be reconsumed,
768     ## it will be included to the |data| of the comment token
769     ## generated from the bogus end tag, as defined in the
770     ## "bogus comment state" entry.
771     redo A;
772     }
773     } elsif ($self->{state} == CDATA_RCDATA_CLOSE_TAG_STATE) {
774     my $ch = substr $self->{last_stag_name}, length $self->{s_kwd}, 1;
775     if (length $ch) {
776     my $CH = $ch;
777     $ch =~ tr/a-z/A-Z/;
778     my $nch = chr $self->{nc};
779     if ($nch eq $ch or $nch eq $CH) {
780    
781     ## Stay in the state.
782     $self->{s_kwd} .= $nch;
783    
784     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
785     $self->{line_prev} = $self->{line};
786     $self->{column_prev} = $self->{column};
787     $self->{column}++;
788     $self->{nc}
789     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
790     } else {
791     $self->{set_nc}->($self);
792     }
793    
794     redo A;
795     } else {
796    
797     $self->{state} = DATA_STATE;
798 wakaba 1.5 $self->{s_kwd} = '';
799 wakaba 1.1 ## Reconsume.
800     return ({type => CHARACTER_TOKEN,
801     data => '</' . $self->{s_kwd},
802     line => $self->{line_prev},
803     column => $self->{column_prev} - 1 - length $self->{s_kwd},
804     });
805     redo A;
806     }
807     } else { # after "<{tag-name}"
808     unless ($is_space->{$self->{nc}} or
809     {
810     0x003E => 1, # >
811     0x002F => 1, # /
812     -1 => 1, # EOF
813     }->{$self->{nc}}) {
814    
815     ## Reconsume.
816     $self->{state} = DATA_STATE;
817 wakaba 1.5 $self->{s_kwd} = '';
818 wakaba 1.1 return ({type => CHARACTER_TOKEN,
819     data => '</' . $self->{s_kwd},
820     line => $self->{line_prev},
821     column => $self->{column_prev} - 1 - length $self->{s_kwd},
822     });
823     redo A;
824     } else {
825    
826     $self->{ct}
827     = {type => END_TAG_TOKEN,
828     tag_name => $self->{last_stag_name},
829     line => $self->{line_prev},
830     column => $self->{column_prev} - 1 - length $self->{s_kwd}};
831     $self->{state} = TAG_NAME_STATE;
832     ## Reconsume.
833     redo A;
834     }
835     }
836     } elsif ($self->{state} == TAG_NAME_STATE) {
837     if ($is_space->{$self->{nc}}) {
838    
839     $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
840    
841     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
842     $self->{line_prev} = $self->{line};
843     $self->{column_prev} = $self->{column};
844     $self->{column}++;
845     $self->{nc}
846     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
847     } else {
848     $self->{set_nc}->($self);
849     }
850    
851     redo A;
852     } elsif ($self->{nc} == 0x003E) { # >
853     if ($self->{ct}->{type} == START_TAG_TOKEN) {
854    
855     $self->{last_stag_name} = $self->{ct}->{tag_name};
856     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
857     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
858     #if ($self->{ct}->{attributes}) {
859     # ## NOTE: This should never be reached.
860     # !!! cp (36);
861     # !!! parse-error (type => 'end tag attribute');
862     #} else {
863    
864     #}
865     } else {
866     die "$0: $self->{ct}->{type}: Unknown token type";
867     }
868     $self->{state} = DATA_STATE;
869 wakaba 1.5 $self->{s_kwd} = '';
870 wakaba 1.1
871     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
872     $self->{line_prev} = $self->{line};
873     $self->{column_prev} = $self->{column};
874     $self->{column}++;
875     $self->{nc}
876     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
877     } else {
878     $self->{set_nc}->($self);
879     }
880    
881    
882     return ($self->{ct}); # start tag or end tag
883    
884     redo A;
885     } elsif (0x0041 <= $self->{nc} and
886     $self->{nc} <= 0x005A) { # A..Z
887    
888 wakaba 1.4 $self->{ct}->{tag_name}
889     .= chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020));
890 wakaba 1.1 # start tag or end tag
891     ## Stay in this state
892    
893     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
894     $self->{line_prev} = $self->{line};
895     $self->{column_prev} = $self->{column};
896     $self->{column}++;
897     $self->{nc}
898     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
899     } else {
900     $self->{set_nc}->($self);
901     }
902    
903     redo A;
904     } elsif ($self->{nc} == -1) {
905     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
906     if ($self->{ct}->{type} == START_TAG_TOKEN) {
907    
908     $self->{last_stag_name} = $self->{ct}->{tag_name};
909     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
910     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
911     #if ($self->{ct}->{attributes}) {
912     # ## NOTE: This state should never be reached.
913     # !!! cp (40);
914     # !!! parse-error (type => 'end tag attribute');
915     #} else {
916    
917     #}
918     } else {
919     die "$0: $self->{ct}->{type}: Unknown token type";
920     }
921     $self->{state} = DATA_STATE;
922 wakaba 1.5 $self->{s_kwd} = '';
923 wakaba 1.1 # reconsume
924    
925     return ($self->{ct}); # start tag or end tag
926    
927     redo A;
928     } elsif ($self->{nc} == 0x002F) { # /
929    
930     $self->{state} = SELF_CLOSING_START_TAG_STATE;
931    
932     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
933     $self->{line_prev} = $self->{line};
934     $self->{column_prev} = $self->{column};
935     $self->{column}++;
936     $self->{nc}
937     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
938     } else {
939     $self->{set_nc}->($self);
940     }
941    
942     redo A;
943     } else {
944    
945     $self->{ct}->{tag_name} .= chr $self->{nc};
946     # start tag or end tag
947     ## Stay in the state
948    
949     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
950     $self->{line_prev} = $self->{line};
951     $self->{column_prev} = $self->{column};
952     $self->{column}++;
953     $self->{nc}
954     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
955     } else {
956     $self->{set_nc}->($self);
957     }
958    
959     redo A;
960     }
961     } elsif ($self->{state} == BEFORE_ATTRIBUTE_NAME_STATE) {
962     if ($is_space->{$self->{nc}}) {
963    
964     ## Stay in the state
965    
966     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
967     $self->{line_prev} = $self->{line};
968     $self->{column_prev} = $self->{column};
969     $self->{column}++;
970     $self->{nc}
971     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
972     } else {
973     $self->{set_nc}->($self);
974     }
975    
976     redo A;
977     } elsif ($self->{nc} == 0x003E) { # >
978     if ($self->{ct}->{type} == START_TAG_TOKEN) {
979    
980     $self->{last_stag_name} = $self->{ct}->{tag_name};
981     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
982     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
983     if ($self->{ct}->{attributes}) {
984    
985     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
986     } else {
987    
988     }
989     } else {
990     die "$0: $self->{ct}->{type}: Unknown token type";
991     }
992     $self->{state} = DATA_STATE;
993 wakaba 1.5 $self->{s_kwd} = '';
994 wakaba 1.1
995     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
996     $self->{line_prev} = $self->{line};
997     $self->{column_prev} = $self->{column};
998     $self->{column}++;
999     $self->{nc}
1000     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1001     } else {
1002     $self->{set_nc}->($self);
1003     }
1004    
1005    
1006     return ($self->{ct}); # start tag or end tag
1007    
1008     redo A;
1009     } elsif (0x0041 <= $self->{nc} and
1010     $self->{nc} <= 0x005A) { # A..Z
1011    
1012     $self->{ca}
1013 wakaba 1.4 = {name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
1014 wakaba 1.1 value => '',
1015     line => $self->{line}, column => $self->{column}};
1016     $self->{state} = ATTRIBUTE_NAME_STATE;
1017    
1018     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1019     $self->{line_prev} = $self->{line};
1020     $self->{column_prev} = $self->{column};
1021     $self->{column}++;
1022     $self->{nc}
1023     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1024     } else {
1025     $self->{set_nc}->($self);
1026     }
1027    
1028     redo A;
1029     } elsif ($self->{nc} == 0x002F) { # /
1030    
1031     $self->{state} = SELF_CLOSING_START_TAG_STATE;
1032    
1033     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1034     $self->{line_prev} = $self->{line};
1035     $self->{column_prev} = $self->{column};
1036     $self->{column}++;
1037     $self->{nc}
1038     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1039     } else {
1040     $self->{set_nc}->($self);
1041     }
1042    
1043     redo A;
1044     } elsif ($self->{nc} == -1) {
1045     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1046     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1047    
1048     $self->{last_stag_name} = $self->{ct}->{tag_name};
1049     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1050     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1051     if ($self->{ct}->{attributes}) {
1052    
1053     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1054     } else {
1055    
1056     }
1057     } else {
1058     die "$0: $self->{ct}->{type}: Unknown token type";
1059     }
1060     $self->{state} = DATA_STATE;
1061 wakaba 1.5 $self->{s_kwd} = '';
1062 wakaba 1.1 # reconsume
1063    
1064     return ($self->{ct}); # start tag or end tag
1065    
1066     redo A;
1067     } else {
1068     if ({
1069     0x0022 => 1, # "
1070     0x0027 => 1, # '
1071     0x003D => 1, # =
1072     }->{$self->{nc}}) {
1073    
1074     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name');
1075     } else {
1076    
1077     }
1078     $self->{ca}
1079     = {name => chr ($self->{nc}),
1080     value => '',
1081     line => $self->{line}, column => $self->{column}};
1082     $self->{state} = ATTRIBUTE_NAME_STATE;
1083    
1084     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1085     $self->{line_prev} = $self->{line};
1086     $self->{column_prev} = $self->{column};
1087     $self->{column}++;
1088     $self->{nc}
1089     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1090     } else {
1091     $self->{set_nc}->($self);
1092     }
1093    
1094     redo A;
1095     }
1096     } elsif ($self->{state} == ATTRIBUTE_NAME_STATE) {
1097     my $before_leave = sub {
1098     if (exists $self->{ct}->{attributes} # start tag or end tag
1099     ->{$self->{ca}->{name}}) { # MUST
1100    
1101     $self->{parse_error}->(level => $self->{level}->{must}, type => 'duplicate attribute', text => $self->{ca}->{name}, line => $self->{ca}->{line}, column => $self->{ca}->{column});
1102     ## Discard $self->{ca} # MUST
1103     } else {
1104    
1105     $self->{ct}->{attributes}->{$self->{ca}->{name}}
1106     = $self->{ca};
1107     }
1108     }; # $before_leave
1109    
1110     if ($is_space->{$self->{nc}}) {
1111    
1112     $before_leave->();
1113     $self->{state} = AFTER_ATTRIBUTE_NAME_STATE;
1114    
1115     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1116     $self->{line_prev} = $self->{line};
1117     $self->{column_prev} = $self->{column};
1118     $self->{column}++;
1119     $self->{nc}
1120     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1121     } else {
1122     $self->{set_nc}->($self);
1123     }
1124    
1125     redo A;
1126     } elsif ($self->{nc} == 0x003D) { # =
1127    
1128     $before_leave->();
1129     $self->{state} = BEFORE_ATTRIBUTE_VALUE_STATE;
1130    
1131     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1132     $self->{line_prev} = $self->{line};
1133     $self->{column_prev} = $self->{column};
1134     $self->{column}++;
1135     $self->{nc}
1136     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1137     } else {
1138     $self->{set_nc}->($self);
1139     }
1140    
1141     redo A;
1142     } elsif ($self->{nc} == 0x003E) { # >
1143     $before_leave->();
1144     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1145    
1146     $self->{last_stag_name} = $self->{ct}->{tag_name};
1147     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1148    
1149     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1150     if ($self->{ct}->{attributes}) {
1151     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1152     }
1153     } else {
1154     die "$0: $self->{ct}->{type}: Unknown token type";
1155     }
1156     $self->{state} = DATA_STATE;
1157 wakaba 1.5 $self->{s_kwd} = '';
1158 wakaba 1.1
1159     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1160     $self->{line_prev} = $self->{line};
1161     $self->{column_prev} = $self->{column};
1162     $self->{column}++;
1163     $self->{nc}
1164     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1165     } else {
1166     $self->{set_nc}->($self);
1167     }
1168    
1169    
1170     return ($self->{ct}); # start tag or end tag
1171    
1172     redo A;
1173     } elsif (0x0041 <= $self->{nc} and
1174     $self->{nc} <= 0x005A) { # A..Z
1175    
1176 wakaba 1.4 $self->{ca}->{name}
1177     .= chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020));
1178 wakaba 1.1 ## Stay in the state
1179    
1180     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1181     $self->{line_prev} = $self->{line};
1182     $self->{column_prev} = $self->{column};
1183     $self->{column}++;
1184     $self->{nc}
1185     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1186     } else {
1187     $self->{set_nc}->($self);
1188     }
1189    
1190     redo A;
1191     } elsif ($self->{nc} == 0x002F) { # /
1192    
1193     $before_leave->();
1194     $self->{state} = SELF_CLOSING_START_TAG_STATE;
1195    
1196     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1197     $self->{line_prev} = $self->{line};
1198     $self->{column_prev} = $self->{column};
1199     $self->{column}++;
1200     $self->{nc}
1201     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1202     } else {
1203     $self->{set_nc}->($self);
1204     }
1205    
1206     redo A;
1207     } elsif ($self->{nc} == -1) {
1208     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1209     $before_leave->();
1210     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1211    
1212     $self->{last_stag_name} = $self->{ct}->{tag_name};
1213     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1214     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1215     if ($self->{ct}->{attributes}) {
1216    
1217     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1218     } else {
1219     ## NOTE: This state should never be reached.
1220    
1221     }
1222     } else {
1223     die "$0: $self->{ct}->{type}: Unknown token type";
1224     }
1225     $self->{state} = DATA_STATE;
1226 wakaba 1.5 $self->{s_kwd} = '';
1227 wakaba 1.1 # reconsume
1228    
1229     return ($self->{ct}); # start tag or end tag
1230    
1231     redo A;
1232     } else {
1233     if ($self->{nc} == 0x0022 or # "
1234     $self->{nc} == 0x0027) { # '
1235    
1236     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name');
1237     } else {
1238    
1239     }
1240     $self->{ca}->{name} .= chr ($self->{nc});
1241     ## Stay in the state
1242    
1243     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1244     $self->{line_prev} = $self->{line};
1245     $self->{column_prev} = $self->{column};
1246     $self->{column}++;
1247     $self->{nc}
1248     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1249     } else {
1250     $self->{set_nc}->($self);
1251     }
1252    
1253     redo A;
1254     }
1255     } elsif ($self->{state} == AFTER_ATTRIBUTE_NAME_STATE) {
1256     if ($is_space->{$self->{nc}}) {
1257    
1258     ## Stay in the state
1259    
1260     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1261     $self->{line_prev} = $self->{line};
1262     $self->{column_prev} = $self->{column};
1263     $self->{column}++;
1264     $self->{nc}
1265     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1266     } else {
1267     $self->{set_nc}->($self);
1268     }
1269    
1270     redo A;
1271     } elsif ($self->{nc} == 0x003D) { # =
1272    
1273     $self->{state} = BEFORE_ATTRIBUTE_VALUE_STATE;
1274    
1275     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1276     $self->{line_prev} = $self->{line};
1277     $self->{column_prev} = $self->{column};
1278     $self->{column}++;
1279     $self->{nc}
1280     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1281     } else {
1282     $self->{set_nc}->($self);
1283     }
1284    
1285     redo A;
1286     } elsif ($self->{nc} == 0x003E) { # >
1287     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1288    
1289     $self->{last_stag_name} = $self->{ct}->{tag_name};
1290     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1291     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1292     if ($self->{ct}->{attributes}) {
1293    
1294     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1295     } else {
1296     ## NOTE: This state should never be reached.
1297    
1298     }
1299     } else {
1300     die "$0: $self->{ct}->{type}: Unknown token type";
1301     }
1302     $self->{state} = DATA_STATE;
1303 wakaba 1.5 $self->{s_kwd} = '';
1304 wakaba 1.1
1305     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1306     $self->{line_prev} = $self->{line};
1307     $self->{column_prev} = $self->{column};
1308     $self->{column}++;
1309     $self->{nc}
1310     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1311     } else {
1312     $self->{set_nc}->($self);
1313     }
1314    
1315    
1316     return ($self->{ct}); # start tag or end tag
1317    
1318     redo A;
1319     } elsif (0x0041 <= $self->{nc} and
1320     $self->{nc} <= 0x005A) { # A..Z
1321    
1322     $self->{ca}
1323 wakaba 1.4 = {name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
1324 wakaba 1.1 value => '',
1325     line => $self->{line}, column => $self->{column}};
1326     $self->{state} = ATTRIBUTE_NAME_STATE;
1327    
1328     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1329     $self->{line_prev} = $self->{line};
1330     $self->{column_prev} = $self->{column};
1331     $self->{column}++;
1332     $self->{nc}
1333     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1334     } else {
1335     $self->{set_nc}->($self);
1336     }
1337    
1338     redo A;
1339     } elsif ($self->{nc} == 0x002F) { # /
1340    
1341     $self->{state} = SELF_CLOSING_START_TAG_STATE;
1342    
1343     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1344     $self->{line_prev} = $self->{line};
1345     $self->{column_prev} = $self->{column};
1346     $self->{column}++;
1347     $self->{nc}
1348     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1349     } else {
1350     $self->{set_nc}->($self);
1351     }
1352    
1353     redo A;
1354     } elsif ($self->{nc} == -1) {
1355     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1356     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1357    
1358     $self->{last_stag_name} = $self->{ct}->{tag_name};
1359     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1360     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1361     if ($self->{ct}->{attributes}) {
1362    
1363     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1364     } else {
1365     ## NOTE: This state should never be reached.
1366    
1367     }
1368     } else {
1369     die "$0: $self->{ct}->{type}: Unknown token type";
1370     }
1371 wakaba 1.5 $self->{s_kwd} = '';
1372 wakaba 1.1 $self->{state} = DATA_STATE;
1373     # reconsume
1374    
1375     return ($self->{ct}); # start tag or end tag
1376    
1377     redo A;
1378     } else {
1379     if ($self->{nc} == 0x0022 or # "
1380     $self->{nc} == 0x0027) { # '
1381    
1382     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name');
1383     } else {
1384    
1385     }
1386     $self->{ca}
1387     = {name => chr ($self->{nc}),
1388     value => '',
1389     line => $self->{line}, column => $self->{column}};
1390     $self->{state} = ATTRIBUTE_NAME_STATE;
1391    
1392     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1393     $self->{line_prev} = $self->{line};
1394     $self->{column_prev} = $self->{column};
1395     $self->{column}++;
1396     $self->{nc}
1397     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1398     } else {
1399     $self->{set_nc}->($self);
1400     }
1401    
1402     redo A;
1403     }
1404     } elsif ($self->{state} == BEFORE_ATTRIBUTE_VALUE_STATE) {
1405     if ($is_space->{$self->{nc}}) {
1406    
1407     ## Stay in the state
1408    
1409     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1410     $self->{line_prev} = $self->{line};
1411     $self->{column_prev} = $self->{column};
1412     $self->{column}++;
1413     $self->{nc}
1414     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1415     } else {
1416     $self->{set_nc}->($self);
1417     }
1418    
1419     redo A;
1420     } elsif ($self->{nc} == 0x0022) { # "
1421    
1422     $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
1423    
1424     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1425     $self->{line_prev} = $self->{line};
1426     $self->{column_prev} = $self->{column};
1427     $self->{column}++;
1428     $self->{nc}
1429     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1430     } else {
1431     $self->{set_nc}->($self);
1432     }
1433    
1434     redo A;
1435     } elsif ($self->{nc} == 0x0026) { # &
1436    
1437     $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
1438     ## reconsume
1439     redo A;
1440     } elsif ($self->{nc} == 0x0027) { # '
1441    
1442     $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
1443    
1444     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1445     $self->{line_prev} = $self->{line};
1446     $self->{column_prev} = $self->{column};
1447     $self->{column}++;
1448     $self->{nc}
1449     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1450     } else {
1451     $self->{set_nc}->($self);
1452     }
1453    
1454     redo A;
1455     } elsif ($self->{nc} == 0x003E) { # >
1456     $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty unquoted attribute value');
1457     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1458    
1459     $self->{last_stag_name} = $self->{ct}->{tag_name};
1460     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1461     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1462     if ($self->{ct}->{attributes}) {
1463    
1464     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1465     } else {
1466     ## NOTE: This state should never be reached.
1467    
1468     }
1469     } else {
1470     die "$0: $self->{ct}->{type}: Unknown token type";
1471     }
1472     $self->{state} = DATA_STATE;
1473 wakaba 1.5 $self->{s_kwd} = '';
1474 wakaba 1.1
1475     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1476     $self->{line_prev} = $self->{line};
1477     $self->{column_prev} = $self->{column};
1478     $self->{column}++;
1479     $self->{nc}
1480     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1481     } else {
1482     $self->{set_nc}->($self);
1483     }
1484    
1485    
1486     return ($self->{ct}); # start tag or end tag
1487    
1488     redo A;
1489     } elsif ($self->{nc} == -1) {
1490     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1491     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1492    
1493     $self->{last_stag_name} = $self->{ct}->{tag_name};
1494     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1495     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1496     if ($self->{ct}->{attributes}) {
1497    
1498     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1499     } else {
1500     ## NOTE: This state should never be reached.
1501    
1502     }
1503     } else {
1504     die "$0: $self->{ct}->{type}: Unknown token type";
1505     }
1506     $self->{state} = DATA_STATE;
1507 wakaba 1.5 $self->{s_kwd} = '';
1508 wakaba 1.1 ## reconsume
1509    
1510     return ($self->{ct}); # start tag or end tag
1511    
1512     redo A;
1513     } else {
1514     if ($self->{nc} == 0x003D) { # =
1515    
1516     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute value');
1517     } else {
1518    
1519     }
1520     $self->{ca}->{value} .= chr ($self->{nc});
1521     $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
1522    
1523     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1524     $self->{line_prev} = $self->{line};
1525     $self->{column_prev} = $self->{column};
1526     $self->{column}++;
1527     $self->{nc}
1528     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1529     } else {
1530     $self->{set_nc}->($self);
1531     }
1532    
1533     redo A;
1534     }
1535     } elsif ($self->{state} == ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE) {
1536     if ($self->{nc} == 0x0022) { # "
1537    
1538     $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1539    
1540     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1541     $self->{line_prev} = $self->{line};
1542     $self->{column_prev} = $self->{column};
1543     $self->{column}++;
1544     $self->{nc}
1545     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1546     } else {
1547     $self->{set_nc}->($self);
1548     }
1549    
1550     redo A;
1551     } elsif ($self->{nc} == 0x0026) { # &
1552    
1553     ## NOTE: In the spec, the tokenizer is switched to the
1554     ## "entity in attribute value state". In this implementation, the
1555     ## tokenizer is switched to the |ENTITY_STATE|, which is an
1556     ## implementation of the "consume a character reference" algorithm.
1557     $self->{prev_state} = $self->{state};
1558     $self->{entity_add} = 0x0022; # "
1559     $self->{state} = ENTITY_STATE;
1560    
1561     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1562     $self->{line_prev} = $self->{line};
1563     $self->{column_prev} = $self->{column};
1564     $self->{column}++;
1565     $self->{nc}
1566     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1567     } else {
1568     $self->{set_nc}->($self);
1569     }
1570    
1571     redo A;
1572     } elsif ($self->{nc} == -1) {
1573     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed attribute value');
1574     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1575    
1576     $self->{last_stag_name} = $self->{ct}->{tag_name};
1577     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1578     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1579     if ($self->{ct}->{attributes}) {
1580    
1581     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1582     } else {
1583     ## NOTE: This state should never be reached.
1584    
1585     }
1586     } else {
1587     die "$0: $self->{ct}->{type}: Unknown token type";
1588     }
1589     $self->{state} = DATA_STATE;
1590 wakaba 1.5 $self->{s_kwd} = '';
1591 wakaba 1.1 ## reconsume
1592    
1593     return ($self->{ct}); # start tag or end tag
1594    
1595     redo A;
1596     } else {
1597    
1598     $self->{ca}->{value} .= chr ($self->{nc});
1599     $self->{read_until}->($self->{ca}->{value},
1600     q["&],
1601     length $self->{ca}->{value});
1602    
1603     ## Stay in the state
1604    
1605     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1606     $self->{line_prev} = $self->{line};
1607     $self->{column_prev} = $self->{column};
1608     $self->{column}++;
1609     $self->{nc}
1610     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1611     } else {
1612     $self->{set_nc}->($self);
1613     }
1614    
1615     redo A;
1616     }
1617     } elsif ($self->{state} == ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE) {
1618     if ($self->{nc} == 0x0027) { # '
1619    
1620     $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1621    
1622     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1623     $self->{line_prev} = $self->{line};
1624     $self->{column_prev} = $self->{column};
1625     $self->{column}++;
1626     $self->{nc}
1627     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1628     } else {
1629     $self->{set_nc}->($self);
1630     }
1631    
1632     redo A;
1633     } elsif ($self->{nc} == 0x0026) { # &
1634    
1635     ## NOTE: In the spec, the tokenizer is switched to the
1636     ## "entity in attribute value state". In this implementation, the
1637     ## tokenizer is switched to the |ENTITY_STATE|, which is an
1638     ## implementation of the "consume a character reference" algorithm.
1639     $self->{entity_add} = 0x0027; # '
1640     $self->{prev_state} = $self->{state};
1641     $self->{state} = ENTITY_STATE;
1642    
1643     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1644     $self->{line_prev} = $self->{line};
1645     $self->{column_prev} = $self->{column};
1646     $self->{column}++;
1647     $self->{nc}
1648     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1649     } else {
1650     $self->{set_nc}->($self);
1651     }
1652    
1653     redo A;
1654     } elsif ($self->{nc} == -1) {
1655     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed attribute value');
1656     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1657    
1658     $self->{last_stag_name} = $self->{ct}->{tag_name};
1659     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1660     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1661     if ($self->{ct}->{attributes}) {
1662    
1663     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1664     } else {
1665     ## NOTE: This state should never be reached.
1666    
1667     }
1668     } else {
1669     die "$0: $self->{ct}->{type}: Unknown token type";
1670     }
1671     $self->{state} = DATA_STATE;
1672 wakaba 1.5 $self->{s_kwd} = '';
1673 wakaba 1.1 ## reconsume
1674    
1675     return ($self->{ct}); # start tag or end tag
1676    
1677     redo A;
1678     } else {
1679    
1680     $self->{ca}->{value} .= chr ($self->{nc});
1681     $self->{read_until}->($self->{ca}->{value},
1682     q['&],
1683     length $self->{ca}->{value});
1684    
1685     ## Stay in the state
1686    
1687     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1688     $self->{line_prev} = $self->{line};
1689     $self->{column_prev} = $self->{column};
1690     $self->{column}++;
1691     $self->{nc}
1692     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1693     } else {
1694     $self->{set_nc}->($self);
1695     }
1696    
1697     redo A;
1698     }
1699     } elsif ($self->{state} == ATTRIBUTE_VALUE_UNQUOTED_STATE) {
1700     if ($is_space->{$self->{nc}}) {
1701    
1702     $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1703    
1704     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1705     $self->{line_prev} = $self->{line};
1706     $self->{column_prev} = $self->{column};
1707     $self->{column}++;
1708     $self->{nc}
1709     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1710     } else {
1711     $self->{set_nc}->($self);
1712     }
1713    
1714     redo A;
1715     } elsif ($self->{nc} == 0x0026) { # &
1716    
1717     ## NOTE: In the spec, the tokenizer is switched to the
1718     ## "entity in attribute value state". In this implementation, the
1719     ## tokenizer is switched to the |ENTITY_STATE|, which is an
1720     ## implementation of the "consume a character reference" algorithm.
1721     $self->{entity_add} = -1;
1722     $self->{prev_state} = $self->{state};
1723     $self->{state} = ENTITY_STATE;
1724    
1725     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1726     $self->{line_prev} = $self->{line};
1727     $self->{column_prev} = $self->{column};
1728     $self->{column}++;
1729     $self->{nc}
1730     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1731     } else {
1732     $self->{set_nc}->($self);
1733     }
1734    
1735     redo A;
1736     } elsif ($self->{nc} == 0x003E) { # >
1737     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1738    
1739     $self->{last_stag_name} = $self->{ct}->{tag_name};
1740     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1741     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1742     if ($self->{ct}->{attributes}) {
1743    
1744     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1745     } else {
1746     ## NOTE: This state should never be reached.
1747    
1748     }
1749     } else {
1750     die "$0: $self->{ct}->{type}: Unknown token type";
1751     }
1752     $self->{state} = DATA_STATE;
1753 wakaba 1.5 $self->{s_kwd} = '';
1754 wakaba 1.1
1755     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1756     $self->{line_prev} = $self->{line};
1757     $self->{column_prev} = $self->{column};
1758     $self->{column}++;
1759     $self->{nc}
1760     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1761     } else {
1762     $self->{set_nc}->($self);
1763     }
1764    
1765    
1766     return ($self->{ct}); # start tag or end tag
1767    
1768     redo A;
1769     } elsif ($self->{nc} == -1) {
1770     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1771     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1772    
1773     $self->{last_stag_name} = $self->{ct}->{tag_name};
1774     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1775     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1776     if ($self->{ct}->{attributes}) {
1777    
1778     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1779     } else {
1780     ## NOTE: This state should never be reached.
1781    
1782     }
1783     } else {
1784     die "$0: $self->{ct}->{type}: Unknown token type";
1785     }
1786     $self->{state} = DATA_STATE;
1787 wakaba 1.5 $self->{s_kwd} = '';
1788 wakaba 1.1 ## reconsume
1789    
1790     return ($self->{ct}); # start tag or end tag
1791    
1792     redo A;
1793     } else {
1794     if ({
1795     0x0022 => 1, # "
1796     0x0027 => 1, # '
1797     0x003D => 1, # =
1798     }->{$self->{nc}}) {
1799    
1800     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute value');
1801     } else {
1802    
1803     }
1804     $self->{ca}->{value} .= chr ($self->{nc});
1805     $self->{read_until}->($self->{ca}->{value},
1806     q["'=& >],
1807     length $self->{ca}->{value});
1808    
1809     ## Stay in the state
1810    
1811     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1812     $self->{line_prev} = $self->{line};
1813     $self->{column_prev} = $self->{column};
1814     $self->{column}++;
1815     $self->{nc}
1816     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1817     } else {
1818     $self->{set_nc}->($self);
1819     }
1820    
1821     redo A;
1822     }
1823     } elsif ($self->{state} == AFTER_ATTRIBUTE_VALUE_QUOTED_STATE) {
1824     if ($is_space->{$self->{nc}}) {
1825    
1826     $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1827    
1828     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1829     $self->{line_prev} = $self->{line};
1830     $self->{column_prev} = $self->{column};
1831     $self->{column}++;
1832     $self->{nc}
1833     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1834     } else {
1835     $self->{set_nc}->($self);
1836     }
1837    
1838     redo A;
1839     } elsif ($self->{nc} == 0x003E) { # >
1840     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1841    
1842     $self->{last_stag_name} = $self->{ct}->{tag_name};
1843     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1844     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1845     if ($self->{ct}->{attributes}) {
1846    
1847     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1848     } else {
1849     ## NOTE: This state should never be reached.
1850    
1851     }
1852     } else {
1853     die "$0: $self->{ct}->{type}: Unknown token type";
1854     }
1855     $self->{state} = DATA_STATE;
1856 wakaba 1.5 $self->{s_kwd} = '';
1857 wakaba 1.1
1858     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1859     $self->{line_prev} = $self->{line};
1860     $self->{column_prev} = $self->{column};
1861     $self->{column}++;
1862     $self->{nc}
1863     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1864     } else {
1865     $self->{set_nc}->($self);
1866     }
1867    
1868    
1869     return ($self->{ct}); # start tag or end tag
1870    
1871     redo A;
1872     } elsif ($self->{nc} == 0x002F) { # /
1873    
1874     $self->{state} = SELF_CLOSING_START_TAG_STATE;
1875    
1876     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1877     $self->{line_prev} = $self->{line};
1878     $self->{column_prev} = $self->{column};
1879     $self->{column}++;
1880     $self->{nc}
1881     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1882     } else {
1883     $self->{set_nc}->($self);
1884     }
1885    
1886     redo A;
1887     } elsif ($self->{nc} == -1) {
1888     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1889     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1890    
1891     $self->{last_stag_name} = $self->{ct}->{tag_name};
1892     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1893     if ($self->{ct}->{attributes}) {
1894    
1895     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1896     } else {
1897     ## NOTE: This state should never be reached.
1898    
1899     }
1900     } else {
1901     die "$0: $self->{ct}->{type}: Unknown token type";
1902     }
1903     $self->{state} = DATA_STATE;
1904 wakaba 1.5 $self->{s_kwd} = '';
1905 wakaba 1.1 ## Reconsume.
1906     return ($self->{ct}); # start tag or end tag
1907     redo A;
1908     } else {
1909    
1910     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space between attributes');
1911     $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1912     ## reconsume
1913     redo A;
1914     }
1915     } elsif ($self->{state} == SELF_CLOSING_START_TAG_STATE) {
1916     if ($self->{nc} == 0x003E) { # >
1917     if ($self->{ct}->{type} == END_TAG_TOKEN) {
1918    
1919     $self->{parse_error}->(level => $self->{level}->{must}, type => 'nestc', token => $self->{ct});
1920     ## TODO: Different type than slash in start tag
1921     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1922     if ($self->{ct}->{attributes}) {
1923    
1924     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1925     } else {
1926    
1927     }
1928     ## TODO: Test |<title></title/>|
1929     } else {
1930    
1931     $self->{self_closing} = 1;
1932     }
1933    
1934     $self->{state} = DATA_STATE;
1935 wakaba 1.5 $self->{s_kwd} = '';
1936 wakaba 1.1
1937     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1938     $self->{line_prev} = $self->{line};
1939     $self->{column_prev} = $self->{column};
1940     $self->{column}++;
1941     $self->{nc}
1942     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1943     } else {
1944     $self->{set_nc}->($self);
1945     }
1946    
1947    
1948     return ($self->{ct}); # start tag or end tag
1949    
1950     redo A;
1951     } elsif ($self->{nc} == -1) {
1952     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1953     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1954    
1955     $self->{last_stag_name} = $self->{ct}->{tag_name};
1956     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1957     if ($self->{ct}->{attributes}) {
1958    
1959     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1960     } else {
1961     ## NOTE: This state should never be reached.
1962    
1963     }
1964     } else {
1965     die "$0: $self->{ct}->{type}: Unknown token type";
1966     }
1967     $self->{state} = DATA_STATE;
1968 wakaba 1.5 $self->{s_kwd} = '';
1969 wakaba 1.1 ## Reconsume.
1970     return ($self->{ct}); # start tag or end tag
1971     redo A;
1972     } else {
1973    
1974     $self->{parse_error}->(level => $self->{level}->{must}, type => 'nestc');
1975     ## TODO: This error type is wrong.
1976     $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1977     ## Reconsume.
1978     redo A;
1979     }
1980     } elsif ($self->{state} == BOGUS_COMMENT_STATE) {
1981     ## (only happen if PCDATA state)
1982    
1983     ## NOTE: Unlike spec's "bogus comment state", this implementation
1984     ## consumes characters one-by-one basis.
1985    
1986     if ($self->{nc} == 0x003E) { # >
1987    
1988     $self->{state} = DATA_STATE;
1989 wakaba 1.5 $self->{s_kwd} = '';
1990 wakaba 1.1
1991     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1992     $self->{line_prev} = $self->{line};
1993     $self->{column_prev} = $self->{column};
1994     $self->{column}++;
1995     $self->{nc}
1996     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1997     } else {
1998     $self->{set_nc}->($self);
1999     }
2000    
2001    
2002     return ($self->{ct}); # comment
2003     redo A;
2004     } elsif ($self->{nc} == -1) {
2005    
2006     $self->{state} = DATA_STATE;
2007 wakaba 1.5 $self->{s_kwd} = '';
2008 wakaba 1.1 ## reconsume
2009    
2010     return ($self->{ct}); # comment
2011     redo A;
2012     } else {
2013    
2014     $self->{ct}->{data} .= chr ($self->{nc}); # comment
2015     $self->{read_until}->($self->{ct}->{data},
2016     q[>],
2017     length $self->{ct}->{data});
2018    
2019     ## Stay in the state.
2020    
2021     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2022     $self->{line_prev} = $self->{line};
2023     $self->{column_prev} = $self->{column};
2024     $self->{column}++;
2025     $self->{nc}
2026     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2027     } else {
2028     $self->{set_nc}->($self);
2029     }
2030    
2031     redo A;
2032     }
2033     } elsif ($self->{state} == MARKUP_DECLARATION_OPEN_STATE) {
2034     ## (only happen if PCDATA state)
2035    
2036     if ($self->{nc} == 0x002D) { # -
2037    
2038     $self->{state} = MD_HYPHEN_STATE;
2039    
2040     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2041     $self->{line_prev} = $self->{line};
2042     $self->{column_prev} = $self->{column};
2043     $self->{column}++;
2044     $self->{nc}
2045     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2046     } else {
2047     $self->{set_nc}->($self);
2048     }
2049    
2050     redo A;
2051     } elsif ($self->{nc} == 0x0044 or # D
2052     $self->{nc} == 0x0064) { # d
2053     ## ASCII case-insensitive.
2054    
2055     $self->{state} = MD_DOCTYPE_STATE;
2056     $self->{s_kwd} = chr $self->{nc};
2057    
2058     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2059     $self->{line_prev} = $self->{line};
2060     $self->{column_prev} = $self->{column};
2061     $self->{column}++;
2062     $self->{nc}
2063     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2064     } else {
2065     $self->{set_nc}->($self);
2066     }
2067    
2068     redo A;
2069 wakaba 1.3 } elsif ((($self->{insertion_mode} & IN_FOREIGN_CONTENT_IM and
2070     $self->{open_elements}->[-1]->[1] & FOREIGN_EL) or
2071     $self->{is_xml}) and
2072 wakaba 1.1 $self->{nc} == 0x005B) { # [
2073    
2074     $self->{state} = MD_CDATA_STATE;
2075     $self->{s_kwd} = '[';
2076    
2077     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2078     $self->{line_prev} = $self->{line};
2079     $self->{column_prev} = $self->{column};
2080     $self->{column}++;
2081     $self->{nc}
2082     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2083     } else {
2084     $self->{set_nc}->($self);
2085     }
2086    
2087     redo A;
2088     } else {
2089    
2090     }
2091    
2092     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
2093     line => $self->{line_prev},
2094     column => $self->{column_prev} - 1);
2095     ## Reconsume.
2096     $self->{state} = BOGUS_COMMENT_STATE;
2097     $self->{ct} = {type => COMMENT_TOKEN, data => '',
2098     line => $self->{line_prev},
2099     column => $self->{column_prev} - 1,
2100     };
2101     redo A;
2102     } elsif ($self->{state} == MD_HYPHEN_STATE) {
2103     if ($self->{nc} == 0x002D) { # -
2104    
2105     $self->{ct} = {type => COMMENT_TOKEN, data => '',
2106     line => $self->{line_prev},
2107     column => $self->{column_prev} - 2,
2108     };
2109     $self->{state} = COMMENT_START_STATE;
2110    
2111     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2112     $self->{line_prev} = $self->{line};
2113     $self->{column_prev} = $self->{column};
2114     $self->{column}++;
2115     $self->{nc}
2116     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2117     } else {
2118     $self->{set_nc}->($self);
2119     }
2120    
2121     redo A;
2122     } else {
2123    
2124     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
2125     line => $self->{line_prev},
2126     column => $self->{column_prev} - 2);
2127     $self->{state} = BOGUS_COMMENT_STATE;
2128     ## Reconsume.
2129     $self->{ct} = {type => COMMENT_TOKEN,
2130     data => '-',
2131     line => $self->{line_prev},
2132     column => $self->{column_prev} - 2,
2133     };
2134     redo A;
2135     }
2136     } elsif ($self->{state} == MD_DOCTYPE_STATE) {
2137     ## ASCII case-insensitive.
2138     if ($self->{nc} == [
2139     undef,
2140     0x004F, # O
2141     0x0043, # C
2142     0x0054, # T
2143     0x0059, # Y
2144     0x0050, # P
2145     ]->[length $self->{s_kwd}] or
2146     $self->{nc} == [
2147     undef,
2148     0x006F, # o
2149     0x0063, # c
2150     0x0074, # t
2151     0x0079, # y
2152     0x0070, # p
2153     ]->[length $self->{s_kwd}]) {
2154    
2155     ## Stay in the state.
2156     $self->{s_kwd} .= chr $self->{nc};
2157    
2158     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2159     $self->{line_prev} = $self->{line};
2160     $self->{column_prev} = $self->{column};
2161     $self->{column}++;
2162     $self->{nc}
2163     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2164     } else {
2165     $self->{set_nc}->($self);
2166     }
2167    
2168     redo A;
2169     } elsif ((length $self->{s_kwd}) == 6 and
2170     ($self->{nc} == 0x0045 or # E
2171     $self->{nc} == 0x0065)) { # e
2172    
2173     $self->{state} = DOCTYPE_STATE;
2174     $self->{ct} = {type => DOCTYPE_TOKEN,
2175     quirks => 1,
2176     line => $self->{line_prev},
2177     column => $self->{column_prev} - 7,
2178     };
2179    
2180     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2181     $self->{line_prev} = $self->{line};
2182     $self->{column_prev} = $self->{column};
2183     $self->{column}++;
2184     $self->{nc}
2185     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2186     } else {
2187     $self->{set_nc}->($self);
2188     }
2189    
2190     redo A;
2191     } else {
2192    
2193     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
2194     line => $self->{line_prev},
2195     column => $self->{column_prev} - 1 - length $self->{s_kwd});
2196     $self->{state} = BOGUS_COMMENT_STATE;
2197     ## Reconsume.
2198     $self->{ct} = {type => COMMENT_TOKEN,
2199     data => $self->{s_kwd},
2200     line => $self->{line_prev},
2201     column => $self->{column_prev} - 1 - length $self->{s_kwd},
2202     };
2203     redo A;
2204     }
2205     } elsif ($self->{state} == MD_CDATA_STATE) {
2206     if ($self->{nc} == {
2207     '[' => 0x0043, # C
2208     '[C' => 0x0044, # D
2209     '[CD' => 0x0041, # A
2210     '[CDA' => 0x0054, # T
2211     '[CDAT' => 0x0041, # A
2212     }->{$self->{s_kwd}}) {
2213    
2214     ## Stay in the state.
2215     $self->{s_kwd} .= chr $self->{nc};
2216    
2217     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2218     $self->{line_prev} = $self->{line};
2219     $self->{column_prev} = $self->{column};
2220     $self->{column}++;
2221     $self->{nc}
2222     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2223     } else {
2224     $self->{set_nc}->($self);
2225     }
2226    
2227     redo A;
2228     } elsif ($self->{s_kwd} eq '[CDATA' and
2229     $self->{nc} == 0x005B) { # [
2230    
2231     $self->{ct} = {type => CHARACTER_TOKEN,
2232     data => '',
2233     line => $self->{line_prev},
2234     column => $self->{column_prev} - 7};
2235     $self->{state} = CDATA_SECTION_STATE;
2236    
2237     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2238     $self->{line_prev} = $self->{line};
2239     $self->{column_prev} = $self->{column};
2240     $self->{column}++;
2241     $self->{nc}
2242     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2243     } else {
2244     $self->{set_nc}->($self);
2245     }
2246    
2247     redo A;
2248     } else {
2249    
2250     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
2251     line => $self->{line_prev},
2252     column => $self->{column_prev} - 1 - length $self->{s_kwd});
2253     $self->{state} = BOGUS_COMMENT_STATE;
2254     ## Reconsume.
2255     $self->{ct} = {type => COMMENT_TOKEN,
2256     data => $self->{s_kwd},
2257     line => $self->{line_prev},
2258     column => $self->{column_prev} - 1 - length $self->{s_kwd},
2259     };
2260     redo A;
2261     }
2262     } elsif ($self->{state} == COMMENT_START_STATE) {
2263     if ($self->{nc} == 0x002D) { # -
2264    
2265     $self->{state} = COMMENT_START_DASH_STATE;
2266    
2267     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2268     $self->{line_prev} = $self->{line};
2269     $self->{column_prev} = $self->{column};
2270     $self->{column}++;
2271     $self->{nc}
2272     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2273     } else {
2274     $self->{set_nc}->($self);
2275     }
2276    
2277     redo A;
2278     } elsif ($self->{nc} == 0x003E) { # >
2279    
2280     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment');
2281     $self->{state} = DATA_STATE;
2282 wakaba 1.5 $self->{s_kwd} = '';
2283 wakaba 1.1
2284     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2285     $self->{line_prev} = $self->{line};
2286     $self->{column_prev} = $self->{column};
2287     $self->{column}++;
2288     $self->{nc}
2289     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2290     } else {
2291     $self->{set_nc}->($self);
2292     }
2293    
2294    
2295     return ($self->{ct}); # comment
2296    
2297     redo A;
2298     } elsif ($self->{nc} == -1) {
2299    
2300     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2301     $self->{state} = DATA_STATE;
2302 wakaba 1.5 $self->{s_kwd} = '';
2303 wakaba 1.1 ## reconsume
2304    
2305     return ($self->{ct}); # comment
2306    
2307     redo A;
2308     } else {
2309    
2310     $self->{ct}->{data} # comment
2311     .= chr ($self->{nc});
2312     $self->{state} = COMMENT_STATE;
2313    
2314     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2315     $self->{line_prev} = $self->{line};
2316     $self->{column_prev} = $self->{column};
2317     $self->{column}++;
2318     $self->{nc}
2319     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2320     } else {
2321     $self->{set_nc}->($self);
2322     }
2323    
2324     redo A;
2325     }
2326     } elsif ($self->{state} == COMMENT_START_DASH_STATE) {
2327     if ($self->{nc} == 0x002D) { # -
2328    
2329     $self->{state} = COMMENT_END_STATE;
2330    
2331     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2332     $self->{line_prev} = $self->{line};
2333     $self->{column_prev} = $self->{column};
2334     $self->{column}++;
2335     $self->{nc}
2336     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2337     } else {
2338     $self->{set_nc}->($self);
2339     }
2340    
2341     redo A;
2342     } elsif ($self->{nc} == 0x003E) { # >
2343    
2344     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment');
2345     $self->{state} = DATA_STATE;
2346 wakaba 1.5 $self->{s_kwd} = '';
2347 wakaba 1.1
2348     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2349     $self->{line_prev} = $self->{line};
2350     $self->{column_prev} = $self->{column};
2351     $self->{column}++;
2352     $self->{nc}
2353     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2354     } else {
2355     $self->{set_nc}->($self);
2356     }
2357    
2358    
2359     return ($self->{ct}); # comment
2360    
2361     redo A;
2362     } elsif ($self->{nc} == -1) {
2363    
2364     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2365     $self->{state} = DATA_STATE;
2366 wakaba 1.5 $self->{s_kwd} = '';
2367 wakaba 1.1 ## reconsume
2368    
2369     return ($self->{ct}); # comment
2370    
2371     redo A;
2372     } else {
2373    
2374     $self->{ct}->{data} # comment
2375     .= '-' . chr ($self->{nc});
2376     $self->{state} = COMMENT_STATE;
2377    
2378     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2379     $self->{line_prev} = $self->{line};
2380     $self->{column_prev} = $self->{column};
2381     $self->{column}++;
2382     $self->{nc}
2383     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2384     } else {
2385     $self->{set_nc}->($self);
2386     }
2387    
2388     redo A;
2389     }
2390     } elsif ($self->{state} == COMMENT_STATE) {
2391     if ($self->{nc} == 0x002D) { # -
2392    
2393     $self->{state} = COMMENT_END_DASH_STATE;
2394    
2395     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2396     $self->{line_prev} = $self->{line};
2397     $self->{column_prev} = $self->{column};
2398     $self->{column}++;
2399     $self->{nc}
2400     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2401     } else {
2402     $self->{set_nc}->($self);
2403     }
2404    
2405     redo A;
2406     } elsif ($self->{nc} == -1) {
2407    
2408     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2409     $self->{state} = DATA_STATE;
2410 wakaba 1.5 $self->{s_kwd} = '';
2411 wakaba 1.1 ## reconsume
2412    
2413     return ($self->{ct}); # comment
2414    
2415     redo A;
2416     } else {
2417    
2418     $self->{ct}->{data} .= chr ($self->{nc}); # comment
2419     $self->{read_until}->($self->{ct}->{data},
2420     q[-],
2421     length $self->{ct}->{data});
2422    
2423     ## Stay in the state
2424    
2425     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2426     $self->{line_prev} = $self->{line};
2427     $self->{column_prev} = $self->{column};
2428     $self->{column}++;
2429     $self->{nc}
2430     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2431     } else {
2432     $self->{set_nc}->($self);
2433     }
2434    
2435     redo A;
2436     }
2437     } elsif ($self->{state} == COMMENT_END_DASH_STATE) {
2438     if ($self->{nc} == 0x002D) { # -
2439    
2440     $self->{state} = COMMENT_END_STATE;
2441    
2442     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2443     $self->{line_prev} = $self->{line};
2444     $self->{column_prev} = $self->{column};
2445     $self->{column}++;
2446     $self->{nc}
2447     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2448     } else {
2449     $self->{set_nc}->($self);
2450     }
2451    
2452     redo A;
2453     } elsif ($self->{nc} == -1) {
2454    
2455     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2456 wakaba 1.5 $self->{s_kwd} = '';
2457 wakaba 1.1 $self->{state} = DATA_STATE;
2458 wakaba 1.5 $self->{s_kwd} = '';
2459 wakaba 1.1 ## reconsume
2460    
2461     return ($self->{ct}); # comment
2462    
2463     redo A;
2464     } else {
2465    
2466     $self->{ct}->{data} .= '-' . chr ($self->{nc}); # comment
2467     $self->{state} = COMMENT_STATE;
2468    
2469     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2470     $self->{line_prev} = $self->{line};
2471     $self->{column_prev} = $self->{column};
2472     $self->{column}++;
2473     $self->{nc}
2474     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2475     } else {
2476     $self->{set_nc}->($self);
2477     }
2478    
2479     redo A;
2480     }
2481     } elsif ($self->{state} == COMMENT_END_STATE) {
2482     if ($self->{nc} == 0x003E) { # >
2483    
2484     $self->{state} = DATA_STATE;
2485 wakaba 1.5 $self->{s_kwd} = '';
2486 wakaba 1.1
2487     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2488     $self->{line_prev} = $self->{line};
2489     $self->{column_prev} = $self->{column};
2490     $self->{column}++;
2491     $self->{nc}
2492     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2493     } else {
2494     $self->{set_nc}->($self);
2495     }
2496    
2497    
2498     return ($self->{ct}); # comment
2499    
2500     redo A;
2501     } elsif ($self->{nc} == 0x002D) { # -
2502    
2503     $self->{parse_error}->(level => $self->{level}->{must}, type => 'dash in comment',
2504     line => $self->{line_prev},
2505     column => $self->{column_prev});
2506     $self->{ct}->{data} .= '-'; # comment
2507     ## Stay in the state
2508    
2509     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2510     $self->{line_prev} = $self->{line};
2511     $self->{column_prev} = $self->{column};
2512     $self->{column}++;
2513     $self->{nc}
2514     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2515     } else {
2516     $self->{set_nc}->($self);
2517     }
2518    
2519     redo A;
2520     } elsif ($self->{nc} == -1) {
2521    
2522     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2523     $self->{state} = DATA_STATE;
2524 wakaba 1.5 $self->{s_kwd} = '';
2525 wakaba 1.1 ## reconsume
2526    
2527     return ($self->{ct}); # comment
2528    
2529     redo A;
2530     } else {
2531    
2532     $self->{parse_error}->(level => $self->{level}->{must}, type => 'dash in comment',
2533     line => $self->{line_prev},
2534     column => $self->{column_prev});
2535     $self->{ct}->{data} .= '--' . chr ($self->{nc}); # comment
2536     $self->{state} = COMMENT_STATE;
2537    
2538     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2539     $self->{line_prev} = $self->{line};
2540     $self->{column_prev} = $self->{column};
2541     $self->{column}++;
2542     $self->{nc}
2543     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2544     } else {
2545     $self->{set_nc}->($self);
2546     }
2547    
2548     redo A;
2549     }
2550     } elsif ($self->{state} == DOCTYPE_STATE) {
2551     if ($is_space->{$self->{nc}}) {
2552    
2553     $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
2554    
2555     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2556     $self->{line_prev} = $self->{line};
2557     $self->{column_prev} = $self->{column};
2558     $self->{column}++;
2559     $self->{nc}
2560     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2561     } else {
2562     $self->{set_nc}->($self);
2563     }
2564    
2565     redo A;
2566     } else {
2567    
2568     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before DOCTYPE name');
2569     $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
2570     ## reconsume
2571     redo A;
2572     }
2573     } elsif ($self->{state} == BEFORE_DOCTYPE_NAME_STATE) {
2574     if ($is_space->{$self->{nc}}) {
2575    
2576     ## Stay in the state
2577    
2578     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2579     $self->{line_prev} = $self->{line};
2580     $self->{column_prev} = $self->{column};
2581     $self->{column}++;
2582     $self->{nc}
2583     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2584     } else {
2585     $self->{set_nc}->($self);
2586     }
2587    
2588     redo A;
2589     } elsif ($self->{nc} == 0x003E) { # >
2590    
2591     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no DOCTYPE name');
2592     $self->{state} = DATA_STATE;
2593 wakaba 1.5 $self->{s_kwd} = '';
2594 wakaba 1.1
2595     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2596     $self->{line_prev} = $self->{line};
2597     $self->{column_prev} = $self->{column};
2598     $self->{column}++;
2599     $self->{nc}
2600     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2601     } else {
2602     $self->{set_nc}->($self);
2603     }
2604    
2605    
2606     return ($self->{ct}); # DOCTYPE (quirks)
2607    
2608     redo A;
2609     } elsif ($self->{nc} == -1) {
2610    
2611     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no DOCTYPE name');
2612     $self->{state} = DATA_STATE;
2613 wakaba 1.5 $self->{s_kwd} = '';
2614 wakaba 1.1 ## reconsume
2615    
2616     return ($self->{ct}); # DOCTYPE (quirks)
2617    
2618     redo A;
2619     } else {
2620    
2621     $self->{ct}->{name} = chr $self->{nc};
2622     delete $self->{ct}->{quirks};
2623     $self->{state} = DOCTYPE_NAME_STATE;
2624    
2625     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2626     $self->{line_prev} = $self->{line};
2627     $self->{column_prev} = $self->{column};
2628     $self->{column}++;
2629     $self->{nc}
2630     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2631     } else {
2632     $self->{set_nc}->($self);
2633     }
2634    
2635     redo A;
2636     }
2637     } elsif ($self->{state} == DOCTYPE_NAME_STATE) {
2638     ## ISSUE: Redundant "First," in the spec.
2639     if ($is_space->{$self->{nc}}) {
2640    
2641     $self->{state} = AFTER_DOCTYPE_NAME_STATE;
2642    
2643     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2644     $self->{line_prev} = $self->{line};
2645     $self->{column_prev} = $self->{column};
2646     $self->{column}++;
2647     $self->{nc}
2648     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2649     } else {
2650     $self->{set_nc}->($self);
2651     }
2652    
2653     redo A;
2654     } elsif ($self->{nc} == 0x003E) { # >
2655    
2656     $self->{state} = DATA_STATE;
2657 wakaba 1.5 $self->{s_kwd} = '';
2658 wakaba 1.1
2659     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2660     $self->{line_prev} = $self->{line};
2661     $self->{column_prev} = $self->{column};
2662     $self->{column}++;
2663     $self->{nc}
2664     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2665     } else {
2666     $self->{set_nc}->($self);
2667     }
2668    
2669    
2670     return ($self->{ct}); # DOCTYPE
2671    
2672     redo A;
2673     } elsif ($self->{nc} == -1) {
2674    
2675     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
2676     $self->{state} = DATA_STATE;
2677 wakaba 1.5 $self->{s_kwd} = '';
2678 wakaba 1.1 ## reconsume
2679    
2680     $self->{ct}->{quirks} = 1;
2681     return ($self->{ct}); # DOCTYPE
2682    
2683     redo A;
2684     } else {
2685    
2686     $self->{ct}->{name}
2687     .= chr ($self->{nc}); # DOCTYPE
2688     ## Stay in the state
2689    
2690     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2691     $self->{line_prev} = $self->{line};
2692     $self->{column_prev} = $self->{column};
2693     $self->{column}++;
2694     $self->{nc}
2695     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2696     } else {
2697     $self->{set_nc}->($self);
2698     }
2699    
2700     redo A;
2701     }
2702     } elsif ($self->{state} == AFTER_DOCTYPE_NAME_STATE) {
2703     if ($is_space->{$self->{nc}}) {
2704    
2705     ## Stay in the state
2706    
2707     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2708     $self->{line_prev} = $self->{line};
2709     $self->{column_prev} = $self->{column};
2710     $self->{column}++;
2711     $self->{nc}
2712     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2713     } else {
2714     $self->{set_nc}->($self);
2715     }
2716    
2717     redo A;
2718     } elsif ($self->{nc} == 0x003E) { # >
2719    
2720     $self->{state} = DATA_STATE;
2721 wakaba 1.5 $self->{s_kwd} = '';
2722 wakaba 1.1
2723     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2724     $self->{line_prev} = $self->{line};
2725     $self->{column_prev} = $self->{column};
2726     $self->{column}++;
2727     $self->{nc}
2728     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2729     } else {
2730     $self->{set_nc}->($self);
2731     }
2732    
2733    
2734     return ($self->{ct}); # DOCTYPE
2735    
2736     redo A;
2737     } elsif ($self->{nc} == -1) {
2738    
2739     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
2740     $self->{state} = DATA_STATE;
2741 wakaba 1.5 $self->{s_kwd} = '';
2742 wakaba 1.1 ## reconsume
2743    
2744     $self->{ct}->{quirks} = 1;
2745     return ($self->{ct}); # DOCTYPE
2746    
2747     redo A;
2748     } elsif ($self->{nc} == 0x0050 or # P
2749     $self->{nc} == 0x0070) { # p
2750     $self->{state} = PUBLIC_STATE;
2751     $self->{s_kwd} = chr $self->{nc};
2752    
2753     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2754     $self->{line_prev} = $self->{line};
2755     $self->{column_prev} = $self->{column};
2756     $self->{column}++;
2757     $self->{nc}
2758     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2759     } else {
2760     $self->{set_nc}->($self);
2761     }
2762    
2763     redo A;
2764     } elsif ($self->{nc} == 0x0053 or # S
2765     $self->{nc} == 0x0073) { # s
2766     $self->{state} = SYSTEM_STATE;
2767     $self->{s_kwd} = chr $self->{nc};
2768    
2769     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2770     $self->{line_prev} = $self->{line};
2771     $self->{column_prev} = $self->{column};
2772     $self->{column}++;
2773     $self->{nc}
2774     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2775     } else {
2776     $self->{set_nc}->($self);
2777     }
2778    
2779     redo A;
2780     } else {
2781    
2782     $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after DOCTYPE name');
2783     $self->{ct}->{quirks} = 1;
2784    
2785     $self->{state} = BOGUS_DOCTYPE_STATE;
2786    
2787     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2788     $self->{line_prev} = $self->{line};
2789     $self->{column_prev} = $self->{column};
2790     $self->{column}++;
2791     $self->{nc}
2792     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2793     } else {
2794     $self->{set_nc}->($self);
2795     }
2796    
2797     redo A;
2798     }
2799     } elsif ($self->{state} == PUBLIC_STATE) {
2800     ## ASCII case-insensitive
2801     if ($self->{nc} == [
2802     undef,
2803     0x0055, # U
2804     0x0042, # B
2805     0x004C, # L
2806     0x0049, # I
2807     ]->[length $self->{s_kwd}] or
2808     $self->{nc} == [
2809     undef,
2810     0x0075, # u
2811     0x0062, # b
2812     0x006C, # l
2813     0x0069, # i
2814     ]->[length $self->{s_kwd}]) {
2815    
2816     ## Stay in the state.
2817     $self->{s_kwd} .= chr $self->{nc};
2818    
2819     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2820     $self->{line_prev} = $self->{line};
2821     $self->{column_prev} = $self->{column};
2822     $self->{column}++;
2823     $self->{nc}
2824     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2825     } else {
2826     $self->{set_nc}->($self);
2827     }
2828    
2829     redo A;
2830     } elsif ((length $self->{s_kwd}) == 5 and
2831     ($self->{nc} == 0x0043 or # C
2832     $self->{nc} == 0x0063)) { # c
2833    
2834     $self->{state} = BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
2835    
2836     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2837     $self->{line_prev} = $self->{line};
2838     $self->{column_prev} = $self->{column};
2839     $self->{column}++;
2840     $self->{nc}
2841     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2842     } else {
2843     $self->{set_nc}->($self);
2844     }
2845    
2846     redo A;
2847     } else {
2848    
2849     $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after DOCTYPE name',
2850     line => $self->{line_prev},
2851     column => $self->{column_prev} + 1 - length $self->{s_kwd});
2852     $self->{ct}->{quirks} = 1;
2853    
2854     $self->{state} = BOGUS_DOCTYPE_STATE;
2855     ## Reconsume.
2856     redo A;
2857     }
2858     } elsif ($self->{state} == SYSTEM_STATE) {
2859     ## ASCII case-insensitive
2860     if ($self->{nc} == [
2861     undef,
2862     0x0059, # Y
2863     0x0053, # S
2864     0x0054, # T
2865     0x0045, # E
2866     ]->[length $self->{s_kwd}] or
2867     $self->{nc} == [
2868     undef,
2869     0x0079, # y
2870     0x0073, # s
2871     0x0074, # t
2872     0x0065, # e
2873     ]->[length $self->{s_kwd}]) {
2874    
2875     ## Stay in the state.
2876     $self->{s_kwd} .= chr $self->{nc};
2877    
2878     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2879     $self->{line_prev} = $self->{line};
2880     $self->{column_prev} = $self->{column};
2881     $self->{column}++;
2882     $self->{nc}
2883     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2884     } else {
2885     $self->{set_nc}->($self);
2886     }
2887    
2888     redo A;
2889     } elsif ((length $self->{s_kwd}) == 5 and
2890     ($self->{nc} == 0x004D or # M
2891     $self->{nc} == 0x006D)) { # m
2892    
2893     $self->{state} = BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
2894    
2895     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2896     $self->{line_prev} = $self->{line};
2897     $self->{column_prev} = $self->{column};
2898     $self->{column}++;
2899     $self->{nc}
2900     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2901     } else {
2902     $self->{set_nc}->($self);
2903     }
2904    
2905     redo A;
2906     } else {
2907    
2908     $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after DOCTYPE name',
2909     line => $self->{line_prev},
2910     column => $self->{column_prev} + 1 - length $self->{s_kwd});
2911     $self->{ct}->{quirks} = 1;
2912    
2913     $self->{state} = BOGUS_DOCTYPE_STATE;
2914     ## Reconsume.
2915     redo A;
2916     }
2917     } elsif ($self->{state} == BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE) {
2918     if ($is_space->{$self->{nc}}) {
2919    
2920     ## Stay in the state
2921    
2922     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2923     $self->{line_prev} = $self->{line};
2924     $self->{column_prev} = $self->{column};
2925     $self->{column}++;
2926     $self->{nc}
2927     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2928     } else {
2929     $self->{set_nc}->($self);
2930     }
2931    
2932     redo A;
2933     } elsif ($self->{nc} eq 0x0022) { # "
2934    
2935     $self->{ct}->{pubid} = ''; # DOCTYPE
2936     $self->{state} = DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE;
2937    
2938     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2939     $self->{line_prev} = $self->{line};
2940     $self->{column_prev} = $self->{column};
2941     $self->{column}++;
2942     $self->{nc}
2943     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2944     } else {
2945     $self->{set_nc}->($self);
2946     }
2947    
2948     redo A;
2949     } elsif ($self->{nc} eq 0x0027) { # '
2950    
2951     $self->{ct}->{pubid} = ''; # DOCTYPE
2952     $self->{state} = DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE;
2953    
2954     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2955     $self->{line_prev} = $self->{line};
2956     $self->{column_prev} = $self->{column};
2957     $self->{column}++;
2958     $self->{nc}
2959     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2960     } else {
2961     $self->{set_nc}->($self);
2962     }
2963    
2964     redo A;
2965     } elsif ($self->{nc} eq 0x003E) { # >
2966    
2967     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no PUBLIC literal');
2968    
2969     $self->{state} = DATA_STATE;
2970 wakaba 1.5 $self->{s_kwd} = '';
2971 wakaba 1.1
2972     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2973     $self->{line_prev} = $self->{line};
2974     $self->{column_prev} = $self->{column};
2975     $self->{column}++;
2976     $self->{nc}
2977     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2978     } else {
2979     $self->{set_nc}->($self);
2980     }
2981    
2982    
2983     $self->{ct}->{quirks} = 1;
2984     return ($self->{ct}); # DOCTYPE
2985    
2986     redo A;
2987     } elsif ($self->{nc} == -1) {
2988    
2989     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
2990    
2991     $self->{state} = DATA_STATE;
2992 wakaba 1.5 $self->{s_kwd} = '';
2993 wakaba 1.1 ## reconsume
2994    
2995     $self->{ct}->{quirks} = 1;
2996     return ($self->{ct}); # DOCTYPE
2997    
2998     redo A;
2999     } else {
3000    
3001     $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after PUBLIC');
3002     $self->{ct}->{quirks} = 1;
3003    
3004     $self->{state} = BOGUS_DOCTYPE_STATE;
3005    
3006     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3007     $self->{line_prev} = $self->{line};
3008     $self->{column_prev} = $self->{column};
3009     $self->{column}++;
3010     $self->{nc}
3011     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3012     } else {
3013     $self->{set_nc}->($self);
3014     }
3015    
3016     redo A;
3017     }
3018     } elsif ($self->{state} == DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE) {
3019     if ($self->{nc} == 0x0022) { # "
3020    
3021     $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
3022    
3023     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3024     $self->{line_prev} = $self->{line};
3025     $self->{column_prev} = $self->{column};
3026     $self->{column}++;
3027     $self->{nc}
3028     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3029     } else {
3030     $self->{set_nc}->($self);
3031     }
3032    
3033     redo A;
3034     } elsif ($self->{nc} == 0x003E) { # >
3035    
3036     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');
3037    
3038     $self->{state} = DATA_STATE;
3039 wakaba 1.5 $self->{s_kwd} = '';
3040 wakaba 1.1
3041     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3042     $self->{line_prev} = $self->{line};
3043     $self->{column_prev} = $self->{column};
3044     $self->{column}++;
3045     $self->{nc}
3046     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3047     } else {
3048     $self->{set_nc}->($self);
3049     }
3050    
3051    
3052     $self->{ct}->{quirks} = 1;
3053     return ($self->{ct}); # DOCTYPE
3054    
3055     redo A;
3056     } elsif ($self->{nc} == -1) {
3057    
3058     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');
3059    
3060     $self->{state} = DATA_STATE;
3061 wakaba 1.5 $self->{s_kwd} = '';
3062 wakaba 1.1 ## reconsume
3063    
3064     $self->{ct}->{quirks} = 1;
3065     return ($self->{ct}); # DOCTYPE
3066    
3067     redo A;
3068     } else {
3069    
3070     $self->{ct}->{pubid} # DOCTYPE
3071     .= chr $self->{nc};
3072     $self->{read_until}->($self->{ct}->{pubid}, q[">],
3073     length $self->{ct}->{pubid});
3074    
3075     ## Stay in the state
3076    
3077     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3078     $self->{line_prev} = $self->{line};
3079     $self->{column_prev} = $self->{column};
3080     $self->{column}++;
3081     $self->{nc}
3082     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3083     } else {
3084     $self->{set_nc}->($self);
3085     }
3086    
3087     redo A;
3088     }
3089     } elsif ($self->{state} == DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE) {
3090     if ($self->{nc} == 0x0027) { # '
3091    
3092     $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
3093    
3094     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3095     $self->{line_prev} = $self->{line};
3096     $self->{column_prev} = $self->{column};
3097     $self->{column}++;
3098     $self->{nc}
3099     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3100     } else {
3101     $self->{set_nc}->($self);
3102     }
3103    
3104     redo A;
3105     } elsif ($self->{nc} == 0x003E) { # >
3106    
3107     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');
3108    
3109     $self->{state} = DATA_STATE;
3110 wakaba 1.5 $self->{s_kwd} = '';
3111 wakaba 1.1
3112     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3113     $self->{line_prev} = $self->{line};
3114     $self->{column_prev} = $self->{column};
3115     $self->{column}++;
3116     $self->{nc}
3117     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3118     } else {
3119     $self->{set_nc}->($self);
3120     }
3121    
3122    
3123     $self->{ct}->{quirks} = 1;
3124     return ($self->{ct}); # DOCTYPE
3125    
3126     redo A;
3127     } elsif ($self->{nc} == -1) {
3128    
3129     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');
3130    
3131     $self->{state} = DATA_STATE;
3132 wakaba 1.5 $self->{s_kwd} = '';
3133 wakaba 1.1 ## reconsume
3134    
3135     $self->{ct}->{quirks} = 1;
3136     return ($self->{ct}); # DOCTYPE
3137    
3138     redo A;
3139     } else {
3140    
3141     $self->{ct}->{pubid} # DOCTYPE
3142     .= chr $self->{nc};
3143     $self->{read_until}->($self->{ct}->{pubid}, q['>],
3144     length $self->{ct}->{pubid});
3145    
3146     ## Stay in the state
3147    
3148     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3149     $self->{line_prev} = $self->{line};
3150     $self->{column_prev} = $self->{column};
3151     $self->{column}++;
3152     $self->{nc}
3153     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3154     } else {
3155     $self->{set_nc}->($self);
3156     }
3157    
3158     redo A;
3159     }
3160     } elsif ($self->{state} == AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE) {
3161     if ($is_space->{$self->{nc}}) {
3162    
3163     ## Stay in the state
3164    
3165     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3166     $self->{line_prev} = $self->{line};
3167     $self->{column_prev} = $self->{column};
3168     $self->{column}++;
3169     $self->{nc}
3170     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3171     } else {
3172     $self->{set_nc}->($self);
3173     }
3174    
3175     redo A;
3176     } elsif ($self->{nc} == 0x0022) { # "
3177    
3178     $self->{ct}->{sysid} = ''; # DOCTYPE
3179     $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
3180    
3181     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3182     $self->{line_prev} = $self->{line};
3183     $self->{column_prev} = $self->{column};
3184     $self->{column}++;
3185     $self->{nc}
3186     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3187     } else {
3188     $self->{set_nc}->($self);
3189     }
3190    
3191     redo A;
3192     } elsif ($self->{nc} == 0x0027) { # '
3193    
3194     $self->{ct}->{sysid} = ''; # DOCTYPE
3195     $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
3196    
3197     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3198     $self->{line_prev} = $self->{line};
3199     $self->{column_prev} = $self->{column};
3200     $self->{column}++;
3201     $self->{nc}
3202     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3203     } else {
3204     $self->{set_nc}->($self);
3205     }
3206    
3207     redo A;
3208     } elsif ($self->{nc} == 0x003E) { # >
3209    
3210     $self->{state} = DATA_STATE;
3211 wakaba 1.5 $self->{s_kwd} = '';
3212 wakaba 1.1
3213     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3214     $self->{line_prev} = $self->{line};
3215     $self->{column_prev} = $self->{column};
3216     $self->{column}++;
3217     $self->{nc}
3218     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3219     } else {
3220     $self->{set_nc}->($self);
3221     }
3222    
3223    
3224     return ($self->{ct}); # DOCTYPE
3225    
3226     redo A;
3227     } elsif ($self->{nc} == -1) {
3228    
3229     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
3230    
3231     $self->{state} = DATA_STATE;
3232 wakaba 1.5 $self->{s_kwd} = '';
3233 wakaba 1.1 ## reconsume
3234    
3235     $self->{ct}->{quirks} = 1;
3236     return ($self->{ct}); # DOCTYPE
3237    
3238     redo A;
3239     } else {
3240    
3241     $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after PUBLIC literal');
3242     $self->{ct}->{quirks} = 1;
3243    
3244     $self->{state} = BOGUS_DOCTYPE_STATE;
3245    
3246     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3247     $self->{line_prev} = $self->{line};
3248     $self->{column_prev} = $self->{column};
3249     $self->{column}++;
3250     $self->{nc}
3251     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3252     } else {
3253     $self->{set_nc}->($self);
3254     }
3255    
3256     redo A;
3257     }
3258     } elsif ($self->{state} == BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
3259     if ($is_space->{$self->{nc}}) {
3260    
3261     ## Stay in the state
3262    
3263     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3264     $self->{line_prev} = $self->{line};
3265     $self->{column_prev} = $self->{column};
3266     $self->{column}++;
3267     $self->{nc}
3268     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3269     } else {
3270     $self->{set_nc}->($self);
3271     }
3272    
3273     redo A;
3274     } elsif ($self->{nc} == 0x0022) { # "
3275    
3276     $self->{ct}->{sysid} = ''; # DOCTYPE
3277     $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
3278    
3279     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3280     $self->{line_prev} = $self->{line};
3281     $self->{column_prev} = $self->{column};
3282     $self->{column}++;
3283     $self->{nc}
3284     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3285     } else {
3286     $self->{set_nc}->($self);
3287     }
3288    
3289     redo A;
3290     } elsif ($self->{nc} == 0x0027) { # '
3291    
3292     $self->{ct}->{sysid} = ''; # DOCTYPE
3293     $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
3294    
3295     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3296     $self->{line_prev} = $self->{line};
3297     $self->{column_prev} = $self->{column};
3298     $self->{column}++;
3299     $self->{nc}
3300     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3301     } else {
3302     $self->{set_nc}->($self);
3303     }
3304    
3305     redo A;
3306     } elsif ($self->{nc} == 0x003E) { # >
3307    
3308     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal');
3309     $self->{state} = DATA_STATE;
3310 wakaba 1.5 $self->{s_kwd} = '';
3311 wakaba 1.1
3312     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3313     $self->{line_prev} = $self->{line};
3314     $self->{column_prev} = $self->{column};
3315     $self->{column}++;
3316     $self->{nc}
3317     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3318     } else {
3319     $self->{set_nc}->($self);
3320     }
3321    
3322    
3323     $self->{ct}->{quirks} = 1;
3324     return ($self->{ct}); # DOCTYPE
3325    
3326     redo A;
3327     } elsif ($self->{nc} == -1) {
3328    
3329     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
3330    
3331     $self->{state} = DATA_STATE;
3332 wakaba 1.5 $self->{s_kwd} = '';
3333 wakaba 1.1 ## reconsume
3334    
3335     $self->{ct}->{quirks} = 1;
3336     return ($self->{ct}); # DOCTYPE
3337    
3338     redo A;
3339     } else {
3340    
3341     $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after SYSTEM');
3342     $self->{ct}->{quirks} = 1;
3343    
3344     $self->{state} = BOGUS_DOCTYPE_STATE;
3345    
3346     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3347     $self->{line_prev} = $self->{line};
3348     $self->{column_prev} = $self->{column};
3349     $self->{column}++;
3350     $self->{nc}
3351     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3352     } else {
3353     $self->{set_nc}->($self);
3354     }
3355    
3356     redo A;
3357     }
3358     } elsif ($self->{state} == DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE) {
3359     if ($self->{nc} == 0x0022) { # "
3360    
3361     $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
3362    
3363     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3364     $self->{line_prev} = $self->{line};
3365     $self->{column_prev} = $self->{column};
3366     $self->{column}++;
3367     $self->{nc}
3368     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3369     } else {
3370     $self->{set_nc}->($self);
3371     }
3372    
3373     redo A;
3374     } elsif ($self->{nc} == 0x003E) { # >
3375    
3376     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');
3377    
3378     $self->{state} = DATA_STATE;
3379 wakaba 1.5 $self->{s_kwd} = '';
3380 wakaba 1.1
3381     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3382     $self->{line_prev} = $self->{line};
3383     $self->{column_prev} = $self->{column};
3384     $self->{column}++;
3385     $self->{nc}
3386     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3387     } else {
3388     $self->{set_nc}->($self);
3389     }
3390    
3391    
3392     $self->{ct}->{quirks} = 1;
3393     return ($self->{ct}); # DOCTYPE
3394    
3395     redo A;
3396     } elsif ($self->{nc} == -1) {
3397    
3398     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');
3399    
3400     $self->{state} = DATA_STATE;
3401 wakaba 1.5 $self->{s_kwd} = '';
3402 wakaba 1.1 ## reconsume
3403    
3404     $self->{ct}->{quirks} = 1;
3405     return ($self->{ct}); # DOCTYPE
3406    
3407     redo A;
3408     } else {
3409    
3410     $self->{ct}->{sysid} # DOCTYPE
3411     .= chr $self->{nc};
3412     $self->{read_until}->($self->{ct}->{sysid}, q[">],
3413     length $self->{ct}->{sysid});
3414    
3415     ## Stay in the state
3416    
3417     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3418     $self->{line_prev} = $self->{line};
3419     $self->{column_prev} = $self->{column};
3420     $self->{column}++;
3421     $self->{nc}
3422     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3423     } else {
3424     $self->{set_nc}->($self);
3425     }
3426    
3427     redo A;
3428     }
3429     } elsif ($self->{state} == DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE) {
3430     if ($self->{nc} == 0x0027) { # '
3431    
3432     $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
3433    
3434     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3435     $self->{line_prev} = $self->{line};
3436     $self->{column_prev} = $self->{column};
3437     $self->{column}++;
3438     $self->{nc}
3439     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3440     } else {
3441     $self->{set_nc}->($self);
3442     }
3443    
3444     redo A;
3445     } elsif ($self->{nc} == 0x003E) { # >
3446    
3447     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');
3448    
3449     $self->{state} = DATA_STATE;
3450 wakaba 1.5 $self->{s_kwd} = '';
3451 wakaba 1.1
3452     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3453     $self->{line_prev} = $self->{line};
3454     $self->{column_prev} = $self->{column};
3455     $self->{column}++;
3456     $self->{nc}
3457     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3458     } else {
3459     $self->{set_nc}->($self);
3460     }
3461    
3462    
3463     $self->{ct}->{quirks} = 1;
3464     return ($self->{ct}); # DOCTYPE
3465    
3466     redo A;
3467     } elsif ($self->{nc} == -1) {
3468    
3469     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');
3470    
3471     $self->{state} = DATA_STATE;
3472 wakaba 1.5 $self->{s_kwd} = '';
3473 wakaba 1.1 ## reconsume
3474    
3475     $self->{ct}->{quirks} = 1;
3476     return ($self->{ct}); # DOCTYPE
3477    
3478     redo A;
3479     } else {
3480    
3481     $self->{ct}->{sysid} # DOCTYPE
3482     .= chr $self->{nc};
3483     $self->{read_until}->($self->{ct}->{sysid}, q['>],
3484     length $self->{ct}->{sysid});
3485    
3486     ## Stay in the state
3487    
3488     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3489     $self->{line_prev} = $self->{line};
3490     $self->{column_prev} = $self->{column};
3491     $self->{column}++;
3492     $self->{nc}
3493     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3494     } else {
3495     $self->{set_nc}->($self);
3496     }
3497    
3498     redo A;
3499     }
3500     } elsif ($self->{state} == AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
3501     if ($is_space->{$self->{nc}}) {
3502    
3503     ## Stay in the state
3504    
3505     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3506     $self->{line_prev} = $self->{line};
3507     $self->{column_prev} = $self->{column};
3508     $self->{column}++;
3509     $self->{nc}
3510     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3511     } else {
3512     $self->{set_nc}->($self);
3513     }
3514    
3515     redo A;
3516     } elsif ($self->{nc} == 0x003E) { # >
3517    
3518     $self->{state} = DATA_STATE;
3519 wakaba 1.5 $self->{s_kwd} = '';
3520 wakaba 1.1
3521     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3522     $self->{line_prev} = $self->{line};
3523     $self->{column_prev} = $self->{column};
3524     $self->{column}++;
3525     $self->{nc}
3526     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3527     } else {
3528     $self->{set_nc}->($self);
3529     }
3530    
3531    
3532     return ($self->{ct}); # DOCTYPE
3533    
3534     redo A;
3535     } elsif ($self->{nc} == -1) {
3536    
3537     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
3538     $self->{state} = DATA_STATE;
3539 wakaba 1.5 $self->{s_kwd} = '';
3540 wakaba 1.1 ## reconsume
3541    
3542     $self->{ct}->{quirks} = 1;
3543     return ($self->{ct}); # DOCTYPE
3544    
3545     redo A;
3546     } else {
3547    
3548     $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after SYSTEM literal');
3549     #$self->{ct}->{quirks} = 1;
3550    
3551     $self->{state} = BOGUS_DOCTYPE_STATE;
3552    
3553     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3554     $self->{line_prev} = $self->{line};
3555     $self->{column_prev} = $self->{column};
3556     $self->{column}++;
3557     $self->{nc}
3558     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3559     } else {
3560     $self->{set_nc}->($self);
3561     }
3562    
3563     redo A;
3564     }
3565     } elsif ($self->{state} == BOGUS_DOCTYPE_STATE) {
3566     if ($self->{nc} == 0x003E) { # >
3567    
3568     $self->{state} = DATA_STATE;
3569 wakaba 1.5 $self->{s_kwd} = '';
3570 wakaba 1.1
3571     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3572     $self->{line_prev} = $self->{line};
3573     $self->{column_prev} = $self->{column};
3574     $self->{column}++;
3575     $self->{nc}
3576     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3577     } else {
3578     $self->{set_nc}->($self);
3579     }
3580    
3581    
3582     return ($self->{ct}); # DOCTYPE
3583    
3584     redo A;
3585     } elsif ($self->{nc} == -1) {
3586    
3587     $self->{state} = DATA_STATE;
3588 wakaba 1.5 $self->{s_kwd} = '';
3589 wakaba 1.1 ## reconsume
3590    
3591     return ($self->{ct}); # DOCTYPE
3592    
3593     redo A;
3594     } else {
3595    
3596     my $s = '';
3597     $self->{read_until}->($s, q[>], 0);
3598    
3599     ## Stay in the state
3600    
3601     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3602     $self->{line_prev} = $self->{line};
3603     $self->{column_prev} = $self->{column};
3604     $self->{column}++;
3605     $self->{nc}
3606     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3607     } else {
3608     $self->{set_nc}->($self);
3609     }
3610    
3611     redo A;
3612     }
3613     } elsif ($self->{state} == CDATA_SECTION_STATE) {
3614     ## NOTE: "CDATA section state" in the state is jointly implemented
3615     ## by three states, |CDATA_SECTION_STATE|, |CDATA_SECTION_MSE1_STATE|,
3616     ## and |CDATA_SECTION_MSE2_STATE|.
3617    
3618     if ($self->{nc} == 0x005D) { # ]
3619    
3620     $self->{state} = CDATA_SECTION_MSE1_STATE;
3621    
3622     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3623     $self->{line_prev} = $self->{line};
3624     $self->{column_prev} = $self->{column};
3625     $self->{column}++;
3626     $self->{nc}
3627     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3628     } else {
3629     $self->{set_nc}->($self);
3630     }
3631    
3632     redo A;
3633     } elsif ($self->{nc} == -1) {
3634     $self->{state} = DATA_STATE;
3635 wakaba 1.5 $self->{s_kwd} = '';
3636 wakaba 1.1
3637     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3638     $self->{line_prev} = $self->{line};
3639     $self->{column_prev} = $self->{column};
3640     $self->{column}++;
3641     $self->{nc}
3642     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3643     } else {
3644     $self->{set_nc}->($self);
3645     }
3646    
3647     if (length $self->{ct}->{data}) { # character
3648    
3649     return ($self->{ct}); # character
3650     } else {
3651    
3652     ## No token to emit. $self->{ct} is discarded.
3653     }
3654     redo A;
3655     } else {
3656    
3657     $self->{ct}->{data} .= chr $self->{nc};
3658     $self->{read_until}->($self->{ct}->{data},
3659     q<]>,
3660     length $self->{ct}->{data});
3661    
3662     ## Stay in the state.
3663    
3664     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3665     $self->{line_prev} = $self->{line};
3666     $self->{column_prev} = $self->{column};
3667     $self->{column}++;
3668     $self->{nc}
3669     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3670     } else {
3671     $self->{set_nc}->($self);
3672     }
3673    
3674     redo A;
3675     }
3676    
3677     ## ISSUE: "text tokens" in spec.
3678     } elsif ($self->{state} == CDATA_SECTION_MSE1_STATE) {
3679     if ($self->{nc} == 0x005D) { # ]
3680    
3681     $self->{state} = CDATA_SECTION_MSE2_STATE;
3682    
3683     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3684     $self->{line_prev} = $self->{line};
3685     $self->{column_prev} = $self->{column};
3686     $self->{column}++;
3687     $self->{nc}
3688     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3689     } else {
3690     $self->{set_nc}->($self);
3691     }
3692    
3693     redo A;
3694     } else {
3695    
3696     $self->{ct}->{data} .= ']';
3697     $self->{state} = CDATA_SECTION_STATE;
3698     ## Reconsume.
3699     redo A;
3700     }
3701     } elsif ($self->{state} == CDATA_SECTION_MSE2_STATE) {
3702     if ($self->{nc} == 0x003E) { # >
3703     $self->{state} = DATA_STATE;
3704 wakaba 1.5 $self->{s_kwd} = '';
3705 wakaba 1.1
3706     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3707     $self->{line_prev} = $self->{line};
3708     $self->{column_prev} = $self->{column};
3709     $self->{column}++;
3710     $self->{nc}
3711     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3712     } else {
3713     $self->{set_nc}->($self);
3714     }
3715    
3716     if (length $self->{ct}->{data}) { # character
3717    
3718     return ($self->{ct}); # character
3719     } else {
3720    
3721     ## No token to emit. $self->{ct} is discarded.
3722     }
3723     redo A;
3724     } elsif ($self->{nc} == 0x005D) { # ]
3725     # character
3726     $self->{ct}->{data} .= ']'; ## Add first "]" of "]]]".
3727     ## Stay in the state.
3728    
3729     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3730     $self->{line_prev} = $self->{line};
3731     $self->{column_prev} = $self->{column};
3732     $self->{column}++;
3733     $self->{nc}
3734     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3735     } else {
3736     $self->{set_nc}->($self);
3737     }
3738    
3739     redo A;
3740     } else {
3741    
3742     $self->{ct}->{data} .= ']]'; # character
3743     $self->{state} = CDATA_SECTION_STATE;
3744     ## Reconsume.
3745     redo A;
3746     }
3747     } elsif ($self->{state} == ENTITY_STATE) {
3748     if ($is_space->{$self->{nc}} or
3749     {
3750     0x003C => 1, 0x0026 => 1, -1 => 1, # <, &
3751     $self->{entity_add} => 1,
3752     }->{$self->{nc}}) {
3753    
3754     ## Don't consume
3755     ## No error
3756     ## Return nothing.
3757     #
3758     } elsif ($self->{nc} == 0x0023) { # #
3759    
3760     $self->{state} = ENTITY_HASH_STATE;
3761     $self->{s_kwd} = '#';
3762    
3763     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3764     $self->{line_prev} = $self->{line};
3765     $self->{column_prev} = $self->{column};
3766     $self->{column}++;
3767     $self->{nc}
3768     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3769     } else {
3770     $self->{set_nc}->($self);
3771     }
3772    
3773     redo A;
3774     } elsif ((0x0041 <= $self->{nc} and
3775     $self->{nc} <= 0x005A) or # A..Z
3776     (0x0061 <= $self->{nc} and
3777     $self->{nc} <= 0x007A)) { # a..z
3778    
3779     require Whatpm::_NamedEntityList;
3780     $self->{state} = ENTITY_NAME_STATE;
3781     $self->{s_kwd} = chr $self->{nc};
3782     $self->{entity__value} = $self->{s_kwd};
3783     $self->{entity__match} = 0;
3784    
3785     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3786     $self->{line_prev} = $self->{line};
3787     $self->{column_prev} = $self->{column};
3788     $self->{column}++;
3789     $self->{nc}
3790     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3791     } else {
3792     $self->{set_nc}->($self);
3793     }
3794    
3795     redo A;
3796     } else {
3797    
3798     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare ero');
3799     ## Return nothing.
3800     #
3801     }
3802    
3803     ## NOTE: No character is consumed by the "consume a character
3804     ## reference" algorithm. In other word, there is an "&" character
3805     ## that does not introduce a character reference, which would be
3806     ## appended to the parent element or the attribute value in later
3807     ## process of the tokenizer.
3808    
3809     if ($self->{prev_state} == DATA_STATE) {
3810    
3811     $self->{state} = $self->{prev_state};
3812 wakaba 1.5 $self->{s_kwd} = '';
3813 wakaba 1.1 ## Reconsume.
3814     return ({type => CHARACTER_TOKEN, data => '&',
3815     line => $self->{line_prev},
3816     column => $self->{column_prev},
3817     });
3818     redo A;
3819     } else {
3820    
3821     $self->{ca}->{value} .= '&';
3822     $self->{state} = $self->{prev_state};
3823 wakaba 1.5 $self->{s_kwd} = '';
3824 wakaba 1.1 ## Reconsume.
3825     redo A;
3826     }
3827     } elsif ($self->{state} == ENTITY_HASH_STATE) {
3828     if ($self->{nc} == 0x0078 or # x
3829     $self->{nc} == 0x0058) { # X
3830    
3831     $self->{state} = HEXREF_X_STATE;
3832     $self->{s_kwd} .= chr $self->{nc};
3833    
3834     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3835     $self->{line_prev} = $self->{line};
3836     $self->{column_prev} = $self->{column};
3837     $self->{column}++;
3838     $self->{nc}
3839     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3840     } else {
3841     $self->{set_nc}->($self);
3842     }
3843    
3844     redo A;
3845     } elsif (0x0030 <= $self->{nc} and
3846     $self->{nc} <= 0x0039) { # 0..9
3847    
3848     $self->{state} = NCR_NUM_STATE;
3849     $self->{s_kwd} = $self->{nc} - 0x0030;
3850    
3851     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3852     $self->{line_prev} = $self->{line};
3853     $self->{column_prev} = $self->{column};
3854     $self->{column}++;
3855     $self->{nc}
3856     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3857     } else {
3858     $self->{set_nc}->($self);
3859     }
3860    
3861     redo A;
3862     } else {
3863     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare nero',
3864     line => $self->{line_prev},
3865     column => $self->{column_prev} - 1);
3866    
3867     ## NOTE: According to the spec algorithm, nothing is returned,
3868     ## and then "&#" is appended to the parent element or the attribute
3869     ## value in the later processing.
3870    
3871     if ($self->{prev_state} == DATA_STATE) {
3872    
3873     $self->{state} = $self->{prev_state};
3874 wakaba 1.5 $self->{s_kwd} = '';
3875 wakaba 1.1 ## Reconsume.
3876     return ({type => CHARACTER_TOKEN,
3877     data => '&#',
3878     line => $self->{line_prev},
3879     column => $self->{column_prev} - 1,
3880     });
3881     redo A;
3882     } else {
3883    
3884     $self->{ca}->{value} .= '&#';
3885     $self->{state} = $self->{prev_state};
3886 wakaba 1.5 $self->{s_kwd} = '';
3887 wakaba 1.1 ## Reconsume.
3888     redo A;
3889     }
3890     }
3891     } elsif ($self->{state} == NCR_NUM_STATE) {
3892     if (0x0030 <= $self->{nc} and
3893     $self->{nc} <= 0x0039) { # 0..9
3894    
3895     $self->{s_kwd} *= 10;
3896     $self->{s_kwd} += $self->{nc} - 0x0030;
3897    
3898     ## Stay in the state.
3899    
3900     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3901     $self->{line_prev} = $self->{line};
3902     $self->{column_prev} = $self->{column};
3903     $self->{column}++;
3904     $self->{nc}
3905     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3906     } else {
3907     $self->{set_nc}->($self);
3908     }
3909    
3910     redo A;
3911     } elsif ($self->{nc} == 0x003B) { # ;
3912    
3913    
3914     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3915     $self->{line_prev} = $self->{line};
3916     $self->{column_prev} = $self->{column};
3917     $self->{column}++;
3918     $self->{nc}
3919     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3920     } else {
3921     $self->{set_nc}->($self);
3922     }
3923    
3924     #
3925     } else {
3926    
3927     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no refc');
3928     ## Reconsume.
3929     #
3930     }
3931    
3932     my $code = $self->{s_kwd};
3933     my $l = $self->{line_prev};
3934     my $c = $self->{column_prev};
3935     if ($charref_map->{$code}) {
3936    
3937     $self->{parse_error}->(level => $self->{level}->{must}, type => 'invalid character reference',
3938     text => (sprintf 'U+%04X', $code),
3939     line => $l, column => $c);
3940     $code = $charref_map->{$code};
3941     } elsif ($code > 0x10FFFF) {
3942    
3943     $self->{parse_error}->(level => $self->{level}->{must}, type => 'invalid character reference',
3944     text => (sprintf 'U-%08X', $code),
3945     line => $l, column => $c);
3946     $code = 0xFFFD;
3947     }
3948    
3949     if ($self->{prev_state} == DATA_STATE) {
3950    
3951     $self->{state} = $self->{prev_state};
3952 wakaba 1.5 $self->{s_kwd} = '';
3953 wakaba 1.1 ## Reconsume.
3954     return ({type => CHARACTER_TOKEN, data => chr $code,
3955     line => $l, column => $c,
3956     });
3957     redo A;
3958     } else {
3959    
3960     $self->{ca}->{value} .= chr $code;
3961     $self->{ca}->{has_reference} = 1;
3962     $self->{state} = $self->{prev_state};
3963 wakaba 1.5 $self->{s_kwd} = '';
3964 wakaba 1.1 ## Reconsume.
3965     redo A;
3966     }
3967     } elsif ($self->{state} == HEXREF_X_STATE) {
3968     if ((0x0030 <= $self->{nc} and $self->{nc} <= 0x0039) or
3969     (0x0041 <= $self->{nc} and $self->{nc} <= 0x0046) or
3970     (0x0061 <= $self->{nc} and $self->{nc} <= 0x0066)) {
3971     # 0..9, A..F, a..f
3972    
3973     $self->{state} = HEXREF_HEX_STATE;
3974     $self->{s_kwd} = 0;
3975     ## Reconsume.
3976     redo A;
3977     } else {
3978     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare hcro',
3979     line => $self->{line_prev},
3980     column => $self->{column_prev} - 2);
3981    
3982     ## NOTE: According to the spec algorithm, nothing is returned,
3983     ## and then "&#" followed by "X" or "x" is appended to the parent
3984     ## element or the attribute value in the later processing.
3985    
3986     if ($self->{prev_state} == DATA_STATE) {
3987    
3988     $self->{state} = $self->{prev_state};
3989 wakaba 1.5 $self->{s_kwd} = '';
3990 wakaba 1.1 ## Reconsume.
3991     return ({type => CHARACTER_TOKEN,
3992     data => '&' . $self->{s_kwd},
3993     line => $self->{line_prev},
3994     column => $self->{column_prev} - length $self->{s_kwd},
3995     });
3996     redo A;
3997     } else {
3998    
3999     $self->{ca}->{value} .= '&' . $self->{s_kwd};
4000     $self->{state} = $self->{prev_state};
4001 wakaba 1.5 $self->{s_kwd} = '';
4002 wakaba 1.1 ## Reconsume.
4003     redo A;
4004     }
4005     }
4006     } elsif ($self->{state} == HEXREF_HEX_STATE) {
4007     if (0x0030 <= $self->{nc} and $self->{nc} <= 0x0039) {
4008     # 0..9
4009    
4010     $self->{s_kwd} *= 0x10;
4011     $self->{s_kwd} += $self->{nc} - 0x0030;
4012     ## Stay in the state.
4013    
4014     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4015     $self->{line_prev} = $self->{line};
4016     $self->{column_prev} = $self->{column};
4017     $self->{column}++;
4018     $self->{nc}
4019     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4020     } else {
4021     $self->{set_nc}->($self);
4022     }
4023    
4024     redo A;
4025     } elsif (0x0061 <= $self->{nc} and
4026     $self->{nc} <= 0x0066) { # a..f
4027    
4028     $self->{s_kwd} *= 0x10;
4029     $self->{s_kwd} += $self->{nc} - 0x0060 + 9;
4030     ## Stay in the state.
4031    
4032     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4033     $self->{line_prev} = $self->{line};
4034     $self->{column_prev} = $self->{column};
4035     $self->{column}++;
4036     $self->{nc}
4037     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4038     } else {
4039     $self->{set_nc}->($self);
4040     }
4041    
4042     redo A;
4043     } elsif (0x0041 <= $self->{nc} and
4044     $self->{nc} <= 0x0046) { # A..F
4045    
4046     $self->{s_kwd} *= 0x10;
4047     $self->{s_kwd} += $self->{nc} - 0x0040 + 9;
4048     ## Stay in the state.
4049    
4050     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4051     $self->{line_prev} = $self->{line};
4052     $self->{column_prev} = $self->{column};
4053     $self->{column}++;
4054     $self->{nc}
4055     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4056     } else {
4057     $self->{set_nc}->($self);
4058     }
4059    
4060     redo A;
4061     } elsif ($self->{nc} == 0x003B) { # ;
4062    
4063    
4064     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4065     $self->{line_prev} = $self->{line};
4066     $self->{column_prev} = $self->{column};
4067     $self->{column}++;
4068     $self->{nc}
4069     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4070     } else {
4071     $self->{set_nc}->($self);
4072     }
4073    
4074     #
4075     } else {
4076    
4077     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no refc',
4078     line => $self->{line},
4079     column => $self->{column});
4080     ## Reconsume.
4081     #
4082     }
4083    
4084     my $code = $self->{s_kwd};
4085     my $l = $self->{line_prev};
4086     my $c = $self->{column_prev};
4087     if ($charref_map->{$code}) {
4088    
4089     $self->{parse_error}->(level => $self->{level}->{must}, type => 'invalid character reference',
4090     text => (sprintf 'U+%04X', $code),
4091     line => $l, column => $c);
4092     $code = $charref_map->{$code};
4093     } elsif ($code > 0x10FFFF) {
4094    
4095     $self->{parse_error}->(level => $self->{level}->{must}, type => 'invalid character reference',
4096     text => (sprintf 'U-%08X', $code),
4097     line => $l, column => $c);
4098     $code = 0xFFFD;
4099     }
4100    
4101     if ($self->{prev_state} == DATA_STATE) {
4102    
4103     $self->{state} = $self->{prev_state};
4104 wakaba 1.5 $self->{s_kwd} = '';
4105 wakaba 1.1 ## Reconsume.
4106     return ({type => CHARACTER_TOKEN, data => chr $code,
4107     line => $l, column => $c,
4108     });
4109     redo A;
4110     } else {
4111    
4112     $self->{ca}->{value} .= chr $code;
4113     $self->{ca}->{has_reference} = 1;
4114     $self->{state} = $self->{prev_state};
4115 wakaba 1.5 $self->{s_kwd} = '';
4116 wakaba 1.1 ## Reconsume.
4117     redo A;
4118     }
4119     } elsif ($self->{state} == ENTITY_NAME_STATE) {
4120     if (length $self->{s_kwd} < 30 and
4121     ## NOTE: Some number greater than the maximum length of entity name
4122     ((0x0041 <= $self->{nc} and # a
4123     $self->{nc} <= 0x005A) or # x
4124     (0x0061 <= $self->{nc} and # a
4125     $self->{nc} <= 0x007A) or # z
4126     (0x0030 <= $self->{nc} and # 0
4127     $self->{nc} <= 0x0039) or # 9
4128     $self->{nc} == 0x003B)) { # ;
4129     our $EntityChar;
4130     $self->{s_kwd} .= chr $self->{nc};
4131     if (defined $EntityChar->{$self->{s_kwd}}) {
4132     if ($self->{nc} == 0x003B) { # ;
4133    
4134     $self->{entity__value} = $EntityChar->{$self->{s_kwd}};
4135     $self->{entity__match} = 1;
4136    
4137     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4138     $self->{line_prev} = $self->{line};
4139     $self->{column_prev} = $self->{column};
4140     $self->{column}++;
4141     $self->{nc}
4142     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4143     } else {
4144     $self->{set_nc}->($self);
4145     }
4146    
4147     #
4148     } else {
4149    
4150     $self->{entity__value} = $EntityChar->{$self->{s_kwd}};
4151     $self->{entity__match} = -1;
4152     ## Stay in the state.
4153    
4154     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4155     $self->{line_prev} = $self->{line};
4156     $self->{column_prev} = $self->{column};
4157     $self->{column}++;
4158     $self->{nc}
4159     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4160     } else {
4161     $self->{set_nc}->($self);
4162     }
4163    
4164     redo A;
4165     }
4166     } else {
4167    
4168     $self->{entity__value} .= chr $self->{nc};
4169     $self->{entity__match} *= 2;
4170     ## Stay in the state.
4171    
4172     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4173     $self->{line_prev} = $self->{line};
4174     $self->{column_prev} = $self->{column};
4175     $self->{column}++;
4176     $self->{nc}
4177     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4178     } else {
4179     $self->{set_nc}->($self);
4180     }
4181    
4182     redo A;
4183     }
4184     }
4185    
4186     my $data;
4187     my $has_ref;
4188     if ($self->{entity__match} > 0) {
4189    
4190     $data = $self->{entity__value};
4191     $has_ref = 1;
4192     #
4193     } elsif ($self->{entity__match} < 0) {
4194     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no refc');
4195     if ($self->{prev_state} != DATA_STATE and # in attribute
4196     $self->{entity__match} < -1) {
4197    
4198     $data = '&' . $self->{s_kwd};
4199     #
4200     } else {
4201    
4202     $data = $self->{entity__value};
4203     $has_ref = 1;
4204     #
4205     }
4206     } else {
4207    
4208     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare ero',
4209     line => $self->{line_prev},
4210     column => $self->{column_prev} - length $self->{s_kwd});
4211     $data = '&' . $self->{s_kwd};
4212     #
4213     }
4214    
4215     ## NOTE: In these cases, when a character reference is found,
4216     ## it is consumed and a character token is returned, or, otherwise,
4217     ## nothing is consumed and returned, according to the spec algorithm.
4218     ## In this implementation, anything that has been examined by the
4219     ## tokenizer is appended to the parent element or the attribute value
4220     ## as string, either literal string when no character reference or
4221     ## entity-replaced string otherwise, in this stage, since any characters
4222     ## that would not be consumed are appended in the data state or in an
4223     ## appropriate attribute value state anyway.
4224    
4225     if ($self->{prev_state} == DATA_STATE) {
4226    
4227     $self->{state} = $self->{prev_state};
4228 wakaba 1.5 $self->{s_kwd} = '';
4229 wakaba 1.1 ## Reconsume.
4230     return ({type => CHARACTER_TOKEN,
4231     data => $data,
4232     line => $self->{line_prev},
4233     column => $self->{column_prev} + 1 - length $self->{s_kwd},
4234     });
4235     redo A;
4236     } else {
4237    
4238     $self->{ca}->{value} .= $data;
4239     $self->{ca}->{has_reference} = 1 if $has_ref;
4240     $self->{state} = $self->{prev_state};
4241 wakaba 1.5 $self->{s_kwd} = '';
4242 wakaba 1.1 ## Reconsume.
4243     redo A;
4244     }
4245     } else {
4246     die "$0: $self->{state}: Unknown state";
4247     }
4248     } # A
4249    
4250     die "$0: _get_next_token: unexpected case";
4251     } # _get_next_token
4252    
4253     1;
4254 wakaba 1.5 ## $Date: 2008/10/14 11:46:57 $

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24