/[suikacvs]/markup/html/whatpm/Whatpm/HTML/Tokenizer.pm
Suika

Contents of /markup/html/whatpm/Whatpm/HTML/Tokenizer.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.7 - (hide annotations) (download)
Tue Oct 14 15:25:50 2008 UTC (16 years, 9 months ago) by wakaba
Branch: MAIN
Changes since 1.6: +6 -2 lines
++ whatpm/t/ChangeLog	14 Oct 2008 15:23:30 -0000
2008-10-15  Wakaba  <wakaba@suika.fam.cx>

	* XML-Parser.t: "xml/charref-1.dat" added.

++ whatpm/t/xml/ChangeLog	14 Oct 2008 15:23:49 -0000
2008-10-15  Wakaba  <wakaba@suika.fam.cx>

	* charref-1.dat: New test data file.

++ whatpm/Whatpm/HTML/ChangeLog	14 Oct 2008 15:24:42 -0000
2008-10-15  Wakaba  <wakaba@suika.fam.cx>

	* Tokenizer.pm.src: Mark CHARACTER_TOKEN with character reference
	as such, for the support of XML parse error.

++ whatpm/Whatpm/XML/ChangeLog	14 Oct 2008 15:25:35 -0000
2008-10-15  Wakaba  <wakaba@suika.fam.cx>

	* Parser.pm.src: Raise a parse error for white space character
	generated by a character reference outside of the root element.

1 wakaba 1.1 package Whatpm::HTML::Tokenizer;
2     use strict;
3 wakaba 1.7 our $VERSION=do{my @r=(q$Revision: 1.6 $=~/\d+/g);sprintf "%d."."%02d" x $#r,@r};
4 wakaba 1.2
5     BEGIN {
6     require Exporter;
7     push our @ISA, 'Exporter';
8    
9     our @EXPORT_OK = qw(
10     DOCTYPE_TOKEN
11     COMMENT_TOKEN
12     START_TAG_TOKEN
13     END_TAG_TOKEN
14     END_OF_FILE_TOKEN
15     CHARACTER_TOKEN
16     PI_TOKEN
17     ABORT_TOKEN
18     );
19    
20     our %EXPORT_TAGS = (
21     token => [qw(
22     DOCTYPE_TOKEN
23     COMMENT_TOKEN
24     START_TAG_TOKEN
25     END_TAG_TOKEN
26     END_OF_FILE_TOKEN
27     CHARACTER_TOKEN
28     PI_TOKEN
29     ABORT_TOKEN
30     )],
31     );
32     }
33    
34     ## Token types
35    
36     sub DOCTYPE_TOKEN () { 1 }
37     sub COMMENT_TOKEN () { 2 }
38     sub START_TAG_TOKEN () { 3 }
39     sub END_TAG_TOKEN () { 4 }
40     sub END_OF_FILE_TOKEN () { 5 }
41     sub CHARACTER_TOKEN () { 6 }
42     sub PI_TOKEN () { 7 } # XML5
43     sub ABORT_TOKEN () { 8 } # Not a token actually
44 wakaba 1.1
45     package Whatpm::HTML;
46    
47 wakaba 1.2 BEGIN { Whatpm::HTML::Tokenizer->import (':token') }
48    
49 wakaba 1.1 ## Content model flags
50    
51     sub CM_ENTITY () { 0b001 } # & markup in data
52     sub CM_LIMITED_MARKUP () { 0b010 } # < markup in data (limited)
53     sub CM_FULL_MARKUP () { 0b100 } # < markup in data (any)
54    
55     sub PLAINTEXT_CONTENT_MODEL () { 0 }
56     sub CDATA_CONTENT_MODEL () { CM_LIMITED_MARKUP }
57     sub RCDATA_CONTENT_MODEL () { CM_ENTITY | CM_LIMITED_MARKUP }
58     sub PCDATA_CONTENT_MODEL () { CM_ENTITY | CM_FULL_MARKUP }
59    
60     ## Tokenizer states
61    
62     sub DATA_STATE () { 0 }
63     #sub ENTITY_DATA_STATE () { 1 }
64     sub TAG_OPEN_STATE () { 2 }
65     sub CLOSE_TAG_OPEN_STATE () { 3 }
66     sub TAG_NAME_STATE () { 4 }
67     sub BEFORE_ATTRIBUTE_NAME_STATE () { 5 }
68     sub ATTRIBUTE_NAME_STATE () { 6 }
69     sub AFTER_ATTRIBUTE_NAME_STATE () { 7 }
70     sub BEFORE_ATTRIBUTE_VALUE_STATE () { 8 }
71     sub ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE () { 9 }
72     sub ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE () { 10 }
73     sub ATTRIBUTE_VALUE_UNQUOTED_STATE () { 11 }
74     #sub ENTITY_IN_ATTRIBUTE_VALUE_STATE () { 12 }
75     sub MARKUP_DECLARATION_OPEN_STATE () { 13 }
76     sub COMMENT_START_STATE () { 14 }
77     sub COMMENT_START_DASH_STATE () { 15 }
78     sub COMMENT_STATE () { 16 }
79     sub COMMENT_END_STATE () { 17 }
80     sub COMMENT_END_DASH_STATE () { 18 }
81     sub BOGUS_COMMENT_STATE () { 19 }
82     sub DOCTYPE_STATE () { 20 }
83     sub BEFORE_DOCTYPE_NAME_STATE () { 21 }
84     sub DOCTYPE_NAME_STATE () { 22 }
85     sub AFTER_DOCTYPE_NAME_STATE () { 23 }
86     sub BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE () { 24 }
87     sub DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE () { 25 }
88     sub DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE () { 26 }
89     sub AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE () { 27 }
90     sub BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE () { 28 }
91     sub DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE () { 29 }
92     sub DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE () { 30 }
93     sub AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE () { 31 }
94     sub BOGUS_DOCTYPE_STATE () { 32 }
95     sub AFTER_ATTRIBUTE_VALUE_QUOTED_STATE () { 33 }
96     sub SELF_CLOSING_START_TAG_STATE () { 34 }
97     sub CDATA_SECTION_STATE () { 35 }
98     sub MD_HYPHEN_STATE () { 36 } # "markup declaration open state" in the spec
99     sub MD_DOCTYPE_STATE () { 37 } # "markup declaration open state" in the spec
100     sub MD_CDATA_STATE () { 38 } # "markup declaration open state" in the spec
101     sub CDATA_RCDATA_CLOSE_TAG_STATE () { 39 } # "close tag open state" in the spec
102     sub CDATA_SECTION_MSE1_STATE () { 40 } # "CDATA section state" in the spec
103     sub CDATA_SECTION_MSE2_STATE () { 41 } # "CDATA section state" in the spec
104     sub PUBLIC_STATE () { 42 } # "after DOCTYPE name state" in the spec
105     sub SYSTEM_STATE () { 43 } # "after DOCTYPE name state" in the spec
106     ## NOTE: "Entity data state", "entity in attribute value state", and
107     ## "consume a character reference" algorithm are jointly implemented
108     ## using the following six states:
109     sub ENTITY_STATE () { 44 }
110     sub ENTITY_HASH_STATE () { 45 }
111     sub NCR_NUM_STATE () { 46 }
112     sub HEXREF_X_STATE () { 47 }
113     sub HEXREF_HEX_STATE () { 48 }
114     sub ENTITY_NAME_STATE () { 49 }
115     sub PCDATA_STATE () { 50 } # "data state" in the spec
116    
117     ## Tree constructor state constants (see Whatpm::HTML for the full
118     ## list and descriptions)
119    
120     sub IN_FOREIGN_CONTENT_IM () { 0b100000000000 }
121     sub FOREIGN_EL () { 0b1_00000000000 }
122    
123     ## Character reference mappings
124    
125     my $charref_map = {
126     0x0D => 0x000A,
127     0x80 => 0x20AC,
128     0x81 => 0xFFFD,
129     0x82 => 0x201A,
130     0x83 => 0x0192,
131     0x84 => 0x201E,
132     0x85 => 0x2026,
133     0x86 => 0x2020,
134     0x87 => 0x2021,
135     0x88 => 0x02C6,
136     0x89 => 0x2030,
137     0x8A => 0x0160,
138     0x8B => 0x2039,
139     0x8C => 0x0152,
140     0x8D => 0xFFFD,
141     0x8E => 0x017D,
142     0x8F => 0xFFFD,
143     0x90 => 0xFFFD,
144     0x91 => 0x2018,
145     0x92 => 0x2019,
146     0x93 => 0x201C,
147     0x94 => 0x201D,
148     0x95 => 0x2022,
149     0x96 => 0x2013,
150     0x97 => 0x2014,
151     0x98 => 0x02DC,
152     0x99 => 0x2122,
153     0x9A => 0x0161,
154     0x9B => 0x203A,
155     0x9C => 0x0153,
156     0x9D => 0xFFFD,
157     0x9E => 0x017E,
158     0x9F => 0x0178,
159     }; # $charref_map
160     $charref_map->{$_} = 0xFFFD
161     for 0x0000..0x0008, 0x000B, 0x000E..0x001F, 0x007F,
162     0xD800..0xDFFF, 0xFDD0..0xFDDF, ## ISSUE: 0xFDEF
163     0xFFFE, 0xFFFF, 0x1FFFE, 0x1FFFF, 0x2FFFE, 0x2FFFF, 0x3FFFE, 0x3FFFF,
164     0x4FFFE, 0x4FFFF, 0x5FFFE, 0x5FFFF, 0x6FFFE, 0x6FFFF, 0x7FFFE,
165     0x7FFFF, 0x8FFFE, 0x8FFFF, 0x9FFFE, 0x9FFFF, 0xAFFFE, 0xAFFFF,
166     0xBFFFE, 0xBFFFF, 0xCFFFE, 0xCFFFF, 0xDFFFE, 0xDFFFF, 0xEFFFE,
167     0xEFFFF, 0xFFFFE, 0xFFFFF, 0x10FFFE, 0x10FFFF;
168    
169     ## Implementations MUST act as if state machine in the spec
170    
171     sub _initialize_tokenizer ($) {
172     my $self = shift;
173    
174     ## NOTE: Fields set by |new| constructor:
175     #$self->{level}
176     #$self->{set_nc}
177     #$self->{parse_error}
178 wakaba 1.3 #$self->{is_xml} (if XML)
179 wakaba 1.1
180     $self->{state} = DATA_STATE; # MUST
181 wakaba 1.5 $self->{s_kwd} = ''; # state keyword
182 wakaba 1.1 #$self->{entity__value}; # initialized when used
183     #$self->{entity__match}; # initialized when used
184     $self->{content_model} = PCDATA_CONTENT_MODEL; # be
185     undef $self->{ct}; # current token
186     undef $self->{ca}; # current attribute
187     undef $self->{last_stag_name}; # last emitted start tag name
188     #$self->{prev_state}; # initialized when used
189     delete $self->{self_closing};
190     $self->{char_buffer} = '';
191     $self->{char_buffer_pos} = 0;
192     $self->{nc} = -1; # next input character
193     #$self->{next_nc}
194    
195     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
196     $self->{line_prev} = $self->{line};
197     $self->{column_prev} = $self->{column};
198     $self->{column}++;
199     $self->{nc}
200     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
201     } else {
202     $self->{set_nc}->($self);
203     }
204    
205     $self->{token} = [];
206     # $self->{escape}
207     } # _initialize_tokenizer
208    
209     ## A token has:
210     ## ->{type} == DOCTYPE_TOKEN, START_TAG_TOKEN, END_TAG_TOKEN, COMMENT_TOKEN,
211     ## CHARACTER_TOKEN, or END_OF_FILE_TOKEN
212     ## ->{name} (DOCTYPE_TOKEN)
213     ## ->{tag_name} (START_TAG_TOKEN, END_TAG_TOKEN)
214     ## ->{pubid} (DOCTYPE_TOKEN)
215     ## ->{sysid} (DOCTYPE_TOKEN)
216     ## ->{quirks} == 1 or 0 (DOCTYPE_TOKEN): "force-quirks" flag
217     ## ->{attributes} isa HASH (START_TAG_TOKEN, END_TAG_TOKEN)
218     ## ->{name}
219     ## ->{value}
220     ## ->{has_reference} == 1 or 0
221     ## ->{data} (COMMENT_TOKEN, CHARACTER_TOKEN)
222 wakaba 1.7 ## ->{has_reference} == 1 or 0 (CHARACTER_TOKEN)
223 wakaba 1.1 ## NOTE: The "self-closing flag" is hold as |$self->{self_closing}|.
224     ## |->{self_closing}| is used to save the value of |$self->{self_closing}|
225     ## while the token is pushed back to the stack.
226    
227     ## Emitted token MUST immediately be handled by the tree construction state.
228    
229     ## Before each step, UA MAY check to see if either one of the scripts in
230     ## "list of scripts that will execute as soon as possible" or the first
231     ## script in the "list of scripts that will execute asynchronously",
232     ## has completed loading. If one has, then it MUST be executed
233     ## and removed from the list.
234    
235     ## TODO: Polytheistic slash SHOULD NOT be used. (Applied only to atheists.)
236     ## (This requirement was dropped from HTML5 spec, unfortunately.)
237    
238     my $is_space = {
239     0x0009 => 1, # CHARACTER TABULATION (HT)
240     0x000A => 1, # LINE FEED (LF)
241     #0x000B => 0, # LINE TABULATION (VT)
242     0x000C => 1, # FORM FEED (FF)
243     #0x000D => 1, # CARRIAGE RETURN (CR)
244     0x0020 => 1, # SPACE (SP)
245     };
246    
247     sub _get_next_token ($) {
248     my $self = shift;
249    
250     if ($self->{self_closing}) {
251     $self->{parse_error}->(level => $self->{level}->{must}, type => 'nestc', token => $self->{ct});
252     ## NOTE: The |self_closing| flag is only set by start tag token.
253     ## In addition, when a start tag token is emitted, it is always set to
254     ## |ct|.
255     delete $self->{self_closing};
256     }
257    
258     if (@{$self->{token}}) {
259     $self->{self_closing} = $self->{token}->[0]->{self_closing};
260     return shift @{$self->{token}};
261     }
262    
263     A: {
264     if ($self->{state} == PCDATA_STATE) {
265     ## NOTE: Same as |DATA_STATE|, but only for |PCDATA| content model.
266    
267     if ($self->{nc} == 0x0026) { # &
268    
269     ## NOTE: In the spec, the tokenizer is switched to the
270     ## "entity data state". In this implementation, the tokenizer
271     ## is switched to the |ENTITY_STATE|, which is an implementation
272     ## of the "consume a character reference" algorithm.
273     $self->{entity_add} = -1;
274     $self->{prev_state} = DATA_STATE;
275     $self->{state} = ENTITY_STATE;
276    
277     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
278     $self->{line_prev} = $self->{line};
279     $self->{column_prev} = $self->{column};
280     $self->{column}++;
281     $self->{nc}
282     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
283     } else {
284     $self->{set_nc}->($self);
285     }
286    
287     redo A;
288     } elsif ($self->{nc} == 0x003C) { # <
289    
290     $self->{state} = TAG_OPEN_STATE;
291    
292     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
293     $self->{line_prev} = $self->{line};
294     $self->{column_prev} = $self->{column};
295     $self->{column}++;
296     $self->{nc}
297     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
298     } else {
299     $self->{set_nc}->($self);
300     }
301    
302     redo A;
303     } elsif ($self->{nc} == -1) {
304    
305     return ({type => END_OF_FILE_TOKEN,
306     line => $self->{line}, column => $self->{column}});
307     last A; ## TODO: ok?
308     } else {
309    
310     #
311     }
312    
313     # Anything else
314     my $token = {type => CHARACTER_TOKEN,
315     data => chr $self->{nc},
316     line => $self->{line}, column => $self->{column},
317     };
318     $self->{read_until}->($token->{data}, q[<&], length $token->{data});
319    
320     ## Stay in the state.
321    
322     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
323     $self->{line_prev} = $self->{line};
324     $self->{column_prev} = $self->{column};
325     $self->{column}++;
326     $self->{nc}
327     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
328     } else {
329     $self->{set_nc}->($self);
330     }
331    
332     return ($token);
333     redo A;
334     } elsif ($self->{state} == DATA_STATE) {
335     $self->{s_kwd} = '' unless defined $self->{s_kwd};
336     if ($self->{nc} == 0x0026) { # &
337     $self->{s_kwd} = '';
338     if ($self->{content_model} & CM_ENTITY and # PCDATA | RCDATA
339     not $self->{escape}) {
340    
341     ## NOTE: In the spec, the tokenizer is switched to the
342     ## "entity data state". In this implementation, the tokenizer
343     ## is switched to the |ENTITY_STATE|, which is an implementation
344     ## of the "consume a character reference" algorithm.
345     $self->{entity_add} = -1;
346     $self->{prev_state} = DATA_STATE;
347     $self->{state} = ENTITY_STATE;
348    
349     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
350     $self->{line_prev} = $self->{line};
351     $self->{column_prev} = $self->{column};
352     $self->{column}++;
353     $self->{nc}
354     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
355     } else {
356     $self->{set_nc}->($self);
357     }
358    
359     redo A;
360     } else {
361    
362     #
363     }
364     } elsif ($self->{nc} == 0x002D) { # -
365     if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
366 wakaba 1.5 if ($self->{s_kwd} eq '<!-') {
367 wakaba 1.1
368     $self->{escape} = 1; # unless $self->{escape};
369     $self->{s_kwd} = '--';
370     #
371 wakaba 1.5 } elsif ($self->{s_kwd} eq '-') {
372 wakaba 1.1
373     $self->{s_kwd} = '--';
374     #
375 wakaba 1.5 } elsif ($self->{s_kwd} eq '<!' or $self->{s_kwd} eq '-') {
376    
377     $self->{s_kwd} .= '-';
378     #
379 wakaba 1.1 } else {
380    
381 wakaba 1.5 $self->{s_kwd} = '-';
382 wakaba 1.1 #
383     }
384     }
385    
386     #
387     } elsif ($self->{nc} == 0x0021) { # !
388     if (length $self->{s_kwd}) {
389    
390     $self->{s_kwd} .= '!';
391     #
392     } else {
393    
394     #$self->{s_kwd} = '';
395     #
396     }
397     #
398     } elsif ($self->{nc} == 0x003C) { # <
399     if ($self->{content_model} & CM_FULL_MARKUP or # PCDATA
400     (($self->{content_model} & CM_LIMITED_MARKUP) and # CDATA | RCDATA
401     not $self->{escape})) {
402    
403     $self->{state} = TAG_OPEN_STATE;
404    
405     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
406     $self->{line_prev} = $self->{line};
407     $self->{column_prev} = $self->{column};
408     $self->{column}++;
409     $self->{nc}
410     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
411     } else {
412     $self->{set_nc}->($self);
413     }
414    
415     redo A;
416     } else {
417    
418     $self->{s_kwd} = '';
419     #
420     }
421     } elsif ($self->{nc} == 0x003E) { # >
422     if ($self->{escape} and
423     ($self->{content_model} & CM_LIMITED_MARKUP)) { # RCDATA | CDATA
424     if ($self->{s_kwd} eq '--') {
425    
426     delete $self->{escape};
427 wakaba 1.5 #
428 wakaba 1.1 } else {
429    
430 wakaba 1.5 #
431 wakaba 1.1 }
432 wakaba 1.5 } elsif ($self->{is_xml} and $self->{s_kwd} eq ']]') {
433    
434     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unmatched mse', ## TODO: type
435     line => $self->{line_prev},
436     column => $self->{column_prev} - 1);
437     #
438 wakaba 1.1 } else {
439    
440 wakaba 1.5 #
441 wakaba 1.1 }
442    
443     $self->{s_kwd} = '';
444     #
445 wakaba 1.5 } elsif ($self->{nc} == 0x005D) { # ]
446     if ($self->{s_kwd} eq ']' or $self->{s_kwd} eq '') {
447    
448     $self->{s_kwd} .= ']';
449     } elsif ($self->{s_kwd} eq ']]') {
450    
451     #
452     } else {
453    
454     $self->{s_kwd} = '';
455     }
456     #
457 wakaba 1.1 } elsif ($self->{nc} == -1) {
458    
459     $self->{s_kwd} = '';
460     return ({type => END_OF_FILE_TOKEN,
461     line => $self->{line}, column => $self->{column}});
462     last A; ## TODO: ok?
463     } else {
464    
465     $self->{s_kwd} = '';
466     #
467     }
468    
469     # Anything else
470     my $token = {type => CHARACTER_TOKEN,
471     data => chr $self->{nc},
472     line => $self->{line}, column => $self->{column},
473     };
474 wakaba 1.5 if ($self->{read_until}->($token->{data}, q{-!<>&\]},
475 wakaba 1.1 length $token->{data})) {
476     $self->{s_kwd} = '';
477     }
478    
479     ## Stay in the data state.
480 wakaba 1.5 if (not $self->{is_xml} and
481     $self->{content_model} == PCDATA_CONTENT_MODEL) {
482 wakaba 1.1
483     $self->{state} = PCDATA_STATE;
484     } else {
485    
486     ## Stay in the state.
487     }
488    
489     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
490     $self->{line_prev} = $self->{line};
491     $self->{column_prev} = $self->{column};
492     $self->{column}++;
493     $self->{nc}
494     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
495     } else {
496     $self->{set_nc}->($self);
497     }
498    
499     return ($token);
500     redo A;
501     } elsif ($self->{state} == TAG_OPEN_STATE) {
502     if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
503     if ($self->{nc} == 0x002F) { # /
504    
505    
506     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
507     $self->{line_prev} = $self->{line};
508     $self->{column_prev} = $self->{column};
509     $self->{column}++;
510     $self->{nc}
511     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
512     } else {
513     $self->{set_nc}->($self);
514     }
515    
516     $self->{state} = CLOSE_TAG_OPEN_STATE;
517     redo A;
518     } elsif ($self->{nc} == 0x0021) { # !
519    
520     $self->{s_kwd} = '<' unless $self->{escape};
521     #
522     } else {
523    
524     #
525     }
526    
527     ## reconsume
528     $self->{state} = DATA_STATE;
529 wakaba 1.5 $self->{s_kwd} = '';
530 wakaba 1.1 return ({type => CHARACTER_TOKEN, data => '<',
531     line => $self->{line_prev},
532     column => $self->{column_prev},
533     });
534     redo A;
535     } elsif ($self->{content_model} & CM_FULL_MARKUP) { # PCDATA
536     if ($self->{nc} == 0x0021) { # !
537    
538     $self->{state} = MARKUP_DECLARATION_OPEN_STATE;
539    
540     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
541     $self->{line_prev} = $self->{line};
542     $self->{column_prev} = $self->{column};
543     $self->{column}++;
544     $self->{nc}
545     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
546     } else {
547     $self->{set_nc}->($self);
548     }
549    
550     redo A;
551     } elsif ($self->{nc} == 0x002F) { # /
552    
553     $self->{state} = CLOSE_TAG_OPEN_STATE;
554    
555     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
556     $self->{line_prev} = $self->{line};
557     $self->{column_prev} = $self->{column};
558     $self->{column}++;
559     $self->{nc}
560     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
561     } else {
562     $self->{set_nc}->($self);
563     }
564    
565     redo A;
566     } elsif (0x0041 <= $self->{nc} and
567     $self->{nc} <= 0x005A) { # A..Z
568    
569     $self->{ct}
570     = {type => START_TAG_TOKEN,
571 wakaba 1.4 tag_name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
572 wakaba 1.1 line => $self->{line_prev},
573     column => $self->{column_prev}};
574     $self->{state} = TAG_NAME_STATE;
575    
576     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
577     $self->{line_prev} = $self->{line};
578     $self->{column_prev} = $self->{column};
579     $self->{column}++;
580     $self->{nc}
581     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
582     } else {
583     $self->{set_nc}->($self);
584     }
585    
586     redo A;
587     } elsif (0x0061 <= $self->{nc} and
588     $self->{nc} <= 0x007A) { # a..z
589    
590     $self->{ct} = {type => START_TAG_TOKEN,
591     tag_name => chr ($self->{nc}),
592     line => $self->{line_prev},
593     column => $self->{column_prev}};
594     $self->{state} = TAG_NAME_STATE;
595    
596     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
597     $self->{line_prev} = $self->{line};
598     $self->{column_prev} = $self->{column};
599     $self->{column}++;
600     $self->{nc}
601     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
602     } else {
603     $self->{set_nc}->($self);
604     }
605    
606     redo A;
607     } elsif ($self->{nc} == 0x003E) { # >
608    
609     $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty start tag',
610     line => $self->{line_prev},
611     column => $self->{column_prev});
612     $self->{state} = DATA_STATE;
613 wakaba 1.5 $self->{s_kwd} = '';
614 wakaba 1.1
615     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
616     $self->{line_prev} = $self->{line};
617     $self->{column_prev} = $self->{column};
618     $self->{column}++;
619     $self->{nc}
620     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
621     } else {
622     $self->{set_nc}->($self);
623     }
624    
625    
626     return ({type => CHARACTER_TOKEN, data => '<>',
627     line => $self->{line_prev},
628     column => $self->{column_prev},
629     });
630    
631     redo A;
632     } elsif ($self->{nc} == 0x003F) { # ?
633    
634     $self->{parse_error}->(level => $self->{level}->{must}, type => 'pio',
635     line => $self->{line_prev},
636     column => $self->{column_prev});
637     $self->{state} = BOGUS_COMMENT_STATE;
638     $self->{ct} = {type => COMMENT_TOKEN, data => '',
639     line => $self->{line_prev},
640     column => $self->{column_prev},
641     };
642     ## $self->{nc} is intentionally left as is
643     redo A;
644     } else {
645    
646     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare stago',
647     line => $self->{line_prev},
648     column => $self->{column_prev});
649     $self->{state} = DATA_STATE;
650 wakaba 1.5 $self->{s_kwd} = '';
651 wakaba 1.1 ## reconsume
652    
653     return ({type => CHARACTER_TOKEN, data => '<',
654     line => $self->{line_prev},
655     column => $self->{column_prev},
656     });
657    
658     redo A;
659     }
660     } else {
661     die "$0: $self->{content_model} in tag open";
662     }
663     } elsif ($self->{state} == CLOSE_TAG_OPEN_STATE) {
664     ## NOTE: The "close tag open state" in the spec is implemented as
665     ## |CLOSE_TAG_OPEN_STATE| and |CDATA_RCDATA_CLOSE_TAG_STATE|.
666    
667     my ($l, $c) = ($self->{line_prev}, $self->{column_prev} - 1); # "<"of"</"
668     if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
669     if (defined $self->{last_stag_name}) {
670     $self->{state} = CDATA_RCDATA_CLOSE_TAG_STATE;
671     $self->{s_kwd} = '';
672     ## Reconsume.
673     redo A;
674     } else {
675     ## No start tag token has ever been emitted
676     ## NOTE: See <http://krijnhoetmer.nl/irc-logs/whatwg/20070626#l-564>.
677    
678     $self->{state} = DATA_STATE;
679 wakaba 1.5 $self->{s_kwd} = '';
680 wakaba 1.1 ## Reconsume.
681     return ({type => CHARACTER_TOKEN, data => '</',
682     line => $l, column => $c,
683     });
684     redo A;
685     }
686     }
687    
688     if (0x0041 <= $self->{nc} and
689     $self->{nc} <= 0x005A) { # A..Z
690    
691     $self->{ct}
692     = {type => END_TAG_TOKEN,
693 wakaba 1.4 tag_name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
694 wakaba 1.1 line => $l, column => $c};
695     $self->{state} = TAG_NAME_STATE;
696    
697     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
698     $self->{line_prev} = $self->{line};
699     $self->{column_prev} = $self->{column};
700     $self->{column}++;
701     $self->{nc}
702     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
703     } else {
704     $self->{set_nc}->($self);
705     }
706    
707     redo A;
708     } elsif (0x0061 <= $self->{nc} and
709     $self->{nc} <= 0x007A) { # a..z
710    
711     $self->{ct} = {type => END_TAG_TOKEN,
712     tag_name => chr ($self->{nc}),
713     line => $l, column => $c};
714     $self->{state} = TAG_NAME_STATE;
715    
716     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
717     $self->{line_prev} = $self->{line};
718     $self->{column_prev} = $self->{column};
719     $self->{column}++;
720     $self->{nc}
721     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
722     } else {
723     $self->{set_nc}->($self);
724     }
725    
726     redo A;
727     } elsif ($self->{nc} == 0x003E) { # >
728    
729     $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty end tag',
730     line => $self->{line_prev}, ## "<" in "</>"
731     column => $self->{column_prev} - 1);
732     $self->{state} = DATA_STATE;
733 wakaba 1.5 $self->{s_kwd} = '';
734 wakaba 1.1
735     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
736     $self->{line_prev} = $self->{line};
737     $self->{column_prev} = $self->{column};
738     $self->{column}++;
739     $self->{nc}
740     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
741     } else {
742     $self->{set_nc}->($self);
743     }
744    
745     redo A;
746     } elsif ($self->{nc} == -1) {
747    
748     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare etago');
749 wakaba 1.5 $self->{s_kwd} = '';
750 wakaba 1.1 $self->{state} = DATA_STATE;
751     # reconsume
752    
753     return ({type => CHARACTER_TOKEN, data => '</',
754     line => $l, column => $c,
755     });
756    
757     redo A;
758     } else {
759    
760     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus end tag');
761     $self->{state} = BOGUS_COMMENT_STATE;
762     $self->{ct} = {type => COMMENT_TOKEN, data => '',
763     line => $self->{line_prev}, # "<" of "</"
764     column => $self->{column_prev} - 1,
765     };
766     ## NOTE: $self->{nc} is intentionally left as is.
767     ## Although the "anything else" case of the spec not explicitly
768     ## states that the next input character is to be reconsumed,
769     ## it will be included to the |data| of the comment token
770     ## generated from the bogus end tag, as defined in the
771     ## "bogus comment state" entry.
772     redo A;
773     }
774     } elsif ($self->{state} == CDATA_RCDATA_CLOSE_TAG_STATE) {
775     my $ch = substr $self->{last_stag_name}, length $self->{s_kwd}, 1;
776     if (length $ch) {
777     my $CH = $ch;
778     $ch =~ tr/a-z/A-Z/;
779     my $nch = chr $self->{nc};
780     if ($nch eq $ch or $nch eq $CH) {
781    
782     ## Stay in the state.
783     $self->{s_kwd} .= $nch;
784    
785     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
786     $self->{line_prev} = $self->{line};
787     $self->{column_prev} = $self->{column};
788     $self->{column}++;
789     $self->{nc}
790     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
791     } else {
792     $self->{set_nc}->($self);
793     }
794    
795     redo A;
796     } else {
797    
798     $self->{state} = DATA_STATE;
799 wakaba 1.5 $self->{s_kwd} = '';
800 wakaba 1.1 ## Reconsume.
801     return ({type => CHARACTER_TOKEN,
802     data => '</' . $self->{s_kwd},
803     line => $self->{line_prev},
804     column => $self->{column_prev} - 1 - length $self->{s_kwd},
805     });
806     redo A;
807     }
808     } else { # after "<{tag-name}"
809     unless ($is_space->{$self->{nc}} or
810     {
811     0x003E => 1, # >
812     0x002F => 1, # /
813     -1 => 1, # EOF
814     }->{$self->{nc}}) {
815    
816     ## Reconsume.
817     $self->{state} = DATA_STATE;
818 wakaba 1.5 $self->{s_kwd} = '';
819 wakaba 1.1 return ({type => CHARACTER_TOKEN,
820     data => '</' . $self->{s_kwd},
821     line => $self->{line_prev},
822     column => $self->{column_prev} - 1 - length $self->{s_kwd},
823     });
824     redo A;
825     } else {
826    
827     $self->{ct}
828     = {type => END_TAG_TOKEN,
829     tag_name => $self->{last_stag_name},
830     line => $self->{line_prev},
831     column => $self->{column_prev} - 1 - length $self->{s_kwd}};
832     $self->{state} = TAG_NAME_STATE;
833     ## Reconsume.
834     redo A;
835     }
836     }
837     } elsif ($self->{state} == TAG_NAME_STATE) {
838     if ($is_space->{$self->{nc}}) {
839    
840     $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
841    
842     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
843     $self->{line_prev} = $self->{line};
844     $self->{column_prev} = $self->{column};
845     $self->{column}++;
846     $self->{nc}
847     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
848     } else {
849     $self->{set_nc}->($self);
850     }
851    
852     redo A;
853     } elsif ($self->{nc} == 0x003E) { # >
854     if ($self->{ct}->{type} == START_TAG_TOKEN) {
855    
856     $self->{last_stag_name} = $self->{ct}->{tag_name};
857     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
858     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
859     #if ($self->{ct}->{attributes}) {
860     # ## NOTE: This should never be reached.
861     # !!! cp (36);
862     # !!! parse-error (type => 'end tag attribute');
863     #} else {
864    
865     #}
866     } else {
867     die "$0: $self->{ct}->{type}: Unknown token type";
868     }
869     $self->{state} = DATA_STATE;
870 wakaba 1.5 $self->{s_kwd} = '';
871 wakaba 1.1
872     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
873     $self->{line_prev} = $self->{line};
874     $self->{column_prev} = $self->{column};
875     $self->{column}++;
876     $self->{nc}
877     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
878     } else {
879     $self->{set_nc}->($self);
880     }
881    
882    
883     return ($self->{ct}); # start tag or end tag
884    
885     redo A;
886     } elsif (0x0041 <= $self->{nc} and
887     $self->{nc} <= 0x005A) { # A..Z
888    
889 wakaba 1.4 $self->{ct}->{tag_name}
890     .= chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020));
891 wakaba 1.1 # start tag or end tag
892     ## Stay in this state
893    
894     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
895     $self->{line_prev} = $self->{line};
896     $self->{column_prev} = $self->{column};
897     $self->{column}++;
898     $self->{nc}
899     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
900     } else {
901     $self->{set_nc}->($self);
902     }
903    
904     redo A;
905     } elsif ($self->{nc} == -1) {
906     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
907     if ($self->{ct}->{type} == START_TAG_TOKEN) {
908    
909     $self->{last_stag_name} = $self->{ct}->{tag_name};
910     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
911     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
912     #if ($self->{ct}->{attributes}) {
913     # ## NOTE: This state should never be reached.
914     # !!! cp (40);
915     # !!! parse-error (type => 'end tag attribute');
916     #} else {
917    
918     #}
919     } else {
920     die "$0: $self->{ct}->{type}: Unknown token type";
921     }
922     $self->{state} = DATA_STATE;
923 wakaba 1.5 $self->{s_kwd} = '';
924 wakaba 1.1 # reconsume
925    
926     return ($self->{ct}); # start tag or end tag
927    
928     redo A;
929     } elsif ($self->{nc} == 0x002F) { # /
930    
931     $self->{state} = SELF_CLOSING_START_TAG_STATE;
932    
933     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
934     $self->{line_prev} = $self->{line};
935     $self->{column_prev} = $self->{column};
936     $self->{column}++;
937     $self->{nc}
938     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
939     } else {
940     $self->{set_nc}->($self);
941     }
942    
943     redo A;
944     } else {
945    
946     $self->{ct}->{tag_name} .= chr $self->{nc};
947     # start tag or end tag
948     ## Stay in the state
949    
950     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
951     $self->{line_prev} = $self->{line};
952     $self->{column_prev} = $self->{column};
953     $self->{column}++;
954     $self->{nc}
955     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
956     } else {
957     $self->{set_nc}->($self);
958     }
959    
960     redo A;
961     }
962     } elsif ($self->{state} == BEFORE_ATTRIBUTE_NAME_STATE) {
963     if ($is_space->{$self->{nc}}) {
964    
965     ## Stay in the state
966    
967     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
968     $self->{line_prev} = $self->{line};
969     $self->{column_prev} = $self->{column};
970     $self->{column}++;
971     $self->{nc}
972     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
973     } else {
974     $self->{set_nc}->($self);
975     }
976    
977     redo A;
978     } elsif ($self->{nc} == 0x003E) { # >
979     if ($self->{ct}->{type} == START_TAG_TOKEN) {
980    
981     $self->{last_stag_name} = $self->{ct}->{tag_name};
982     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
983     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
984     if ($self->{ct}->{attributes}) {
985    
986     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
987     } else {
988    
989     }
990     } else {
991     die "$0: $self->{ct}->{type}: Unknown token type";
992     }
993     $self->{state} = DATA_STATE;
994 wakaba 1.5 $self->{s_kwd} = '';
995 wakaba 1.1
996     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
997     $self->{line_prev} = $self->{line};
998     $self->{column_prev} = $self->{column};
999     $self->{column}++;
1000     $self->{nc}
1001     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1002     } else {
1003     $self->{set_nc}->($self);
1004     }
1005    
1006    
1007     return ($self->{ct}); # start tag or end tag
1008    
1009     redo A;
1010     } elsif (0x0041 <= $self->{nc} and
1011     $self->{nc} <= 0x005A) { # A..Z
1012    
1013     $self->{ca}
1014 wakaba 1.4 = {name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
1015 wakaba 1.1 value => '',
1016     line => $self->{line}, column => $self->{column}};
1017     $self->{state} = ATTRIBUTE_NAME_STATE;
1018    
1019     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1020     $self->{line_prev} = $self->{line};
1021     $self->{column_prev} = $self->{column};
1022     $self->{column}++;
1023     $self->{nc}
1024     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1025     } else {
1026     $self->{set_nc}->($self);
1027     }
1028    
1029     redo A;
1030     } elsif ($self->{nc} == 0x002F) { # /
1031    
1032     $self->{state} = SELF_CLOSING_START_TAG_STATE;
1033    
1034     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1035     $self->{line_prev} = $self->{line};
1036     $self->{column_prev} = $self->{column};
1037     $self->{column}++;
1038     $self->{nc}
1039     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1040     } else {
1041     $self->{set_nc}->($self);
1042     }
1043    
1044     redo A;
1045     } elsif ($self->{nc} == -1) {
1046     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1047     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1048    
1049     $self->{last_stag_name} = $self->{ct}->{tag_name};
1050     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1051     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1052     if ($self->{ct}->{attributes}) {
1053    
1054     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1055     } else {
1056    
1057     }
1058     } else {
1059     die "$0: $self->{ct}->{type}: Unknown token type";
1060     }
1061     $self->{state} = DATA_STATE;
1062 wakaba 1.5 $self->{s_kwd} = '';
1063 wakaba 1.1 # reconsume
1064    
1065     return ($self->{ct}); # start tag or end tag
1066    
1067     redo A;
1068     } else {
1069     if ({
1070     0x0022 => 1, # "
1071     0x0027 => 1, # '
1072     0x003D => 1, # =
1073     }->{$self->{nc}}) {
1074    
1075     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name');
1076     } else {
1077    
1078     }
1079     $self->{ca}
1080     = {name => chr ($self->{nc}),
1081     value => '',
1082     line => $self->{line}, column => $self->{column}};
1083     $self->{state} = ATTRIBUTE_NAME_STATE;
1084    
1085     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1086     $self->{line_prev} = $self->{line};
1087     $self->{column_prev} = $self->{column};
1088     $self->{column}++;
1089     $self->{nc}
1090     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1091     } else {
1092     $self->{set_nc}->($self);
1093     }
1094    
1095     redo A;
1096     }
1097     } elsif ($self->{state} == ATTRIBUTE_NAME_STATE) {
1098     my $before_leave = sub {
1099     if (exists $self->{ct}->{attributes} # start tag or end tag
1100     ->{$self->{ca}->{name}}) { # MUST
1101    
1102     $self->{parse_error}->(level => $self->{level}->{must}, type => 'duplicate attribute', text => $self->{ca}->{name}, line => $self->{ca}->{line}, column => $self->{ca}->{column});
1103     ## Discard $self->{ca} # MUST
1104     } else {
1105    
1106     $self->{ct}->{attributes}->{$self->{ca}->{name}}
1107     = $self->{ca};
1108     }
1109     }; # $before_leave
1110    
1111     if ($is_space->{$self->{nc}}) {
1112    
1113     $before_leave->();
1114     $self->{state} = AFTER_ATTRIBUTE_NAME_STATE;
1115    
1116     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1117     $self->{line_prev} = $self->{line};
1118     $self->{column_prev} = $self->{column};
1119     $self->{column}++;
1120     $self->{nc}
1121     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1122     } else {
1123     $self->{set_nc}->($self);
1124     }
1125    
1126     redo A;
1127     } elsif ($self->{nc} == 0x003D) { # =
1128    
1129     $before_leave->();
1130     $self->{state} = BEFORE_ATTRIBUTE_VALUE_STATE;
1131    
1132     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1133     $self->{line_prev} = $self->{line};
1134     $self->{column_prev} = $self->{column};
1135     $self->{column}++;
1136     $self->{nc}
1137     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1138     } else {
1139     $self->{set_nc}->($self);
1140     }
1141    
1142     redo A;
1143     } elsif ($self->{nc} == 0x003E) { # >
1144     $before_leave->();
1145     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1146    
1147     $self->{last_stag_name} = $self->{ct}->{tag_name};
1148     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1149    
1150     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1151     if ($self->{ct}->{attributes}) {
1152     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1153     }
1154     } else {
1155     die "$0: $self->{ct}->{type}: Unknown token type";
1156     }
1157     $self->{state} = DATA_STATE;
1158 wakaba 1.5 $self->{s_kwd} = '';
1159 wakaba 1.1
1160     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1161     $self->{line_prev} = $self->{line};
1162     $self->{column_prev} = $self->{column};
1163     $self->{column}++;
1164     $self->{nc}
1165     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1166     } else {
1167     $self->{set_nc}->($self);
1168     }
1169    
1170    
1171     return ($self->{ct}); # start tag or end tag
1172    
1173     redo A;
1174     } elsif (0x0041 <= $self->{nc} and
1175     $self->{nc} <= 0x005A) { # A..Z
1176    
1177 wakaba 1.4 $self->{ca}->{name}
1178     .= chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020));
1179 wakaba 1.1 ## Stay in the state
1180    
1181     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1182     $self->{line_prev} = $self->{line};
1183     $self->{column_prev} = $self->{column};
1184     $self->{column}++;
1185     $self->{nc}
1186     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1187     } else {
1188     $self->{set_nc}->($self);
1189     }
1190    
1191     redo A;
1192     } elsif ($self->{nc} == 0x002F) { # /
1193    
1194     $before_leave->();
1195     $self->{state} = SELF_CLOSING_START_TAG_STATE;
1196    
1197     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1198     $self->{line_prev} = $self->{line};
1199     $self->{column_prev} = $self->{column};
1200     $self->{column}++;
1201     $self->{nc}
1202     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1203     } else {
1204     $self->{set_nc}->($self);
1205     }
1206    
1207     redo A;
1208     } elsif ($self->{nc} == -1) {
1209     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1210     $before_leave->();
1211     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1212    
1213     $self->{last_stag_name} = $self->{ct}->{tag_name};
1214     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1215     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1216     if ($self->{ct}->{attributes}) {
1217    
1218     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1219     } else {
1220     ## NOTE: This state should never be reached.
1221    
1222     }
1223     } else {
1224     die "$0: $self->{ct}->{type}: Unknown token type";
1225     }
1226     $self->{state} = DATA_STATE;
1227 wakaba 1.5 $self->{s_kwd} = '';
1228 wakaba 1.1 # reconsume
1229    
1230     return ($self->{ct}); # start tag or end tag
1231    
1232     redo A;
1233     } else {
1234     if ($self->{nc} == 0x0022 or # "
1235     $self->{nc} == 0x0027) { # '
1236    
1237     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name');
1238     } else {
1239    
1240     }
1241     $self->{ca}->{name} .= chr ($self->{nc});
1242     ## Stay in the state
1243    
1244     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1245     $self->{line_prev} = $self->{line};
1246     $self->{column_prev} = $self->{column};
1247     $self->{column}++;
1248     $self->{nc}
1249     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1250     } else {
1251     $self->{set_nc}->($self);
1252     }
1253    
1254     redo A;
1255     }
1256     } elsif ($self->{state} == AFTER_ATTRIBUTE_NAME_STATE) {
1257     if ($is_space->{$self->{nc}}) {
1258    
1259     ## Stay in the state
1260    
1261     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1262     $self->{line_prev} = $self->{line};
1263     $self->{column_prev} = $self->{column};
1264     $self->{column}++;
1265     $self->{nc}
1266     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1267     } else {
1268     $self->{set_nc}->($self);
1269     }
1270    
1271     redo A;
1272     } elsif ($self->{nc} == 0x003D) { # =
1273    
1274     $self->{state} = BEFORE_ATTRIBUTE_VALUE_STATE;
1275    
1276     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1277     $self->{line_prev} = $self->{line};
1278     $self->{column_prev} = $self->{column};
1279     $self->{column}++;
1280     $self->{nc}
1281     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1282     } else {
1283     $self->{set_nc}->($self);
1284     }
1285    
1286     redo A;
1287     } elsif ($self->{nc} == 0x003E) { # >
1288     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1289    
1290     $self->{last_stag_name} = $self->{ct}->{tag_name};
1291     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1292     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1293     if ($self->{ct}->{attributes}) {
1294    
1295     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1296     } else {
1297     ## NOTE: This state should never be reached.
1298    
1299     }
1300     } else {
1301     die "$0: $self->{ct}->{type}: Unknown token type";
1302     }
1303     $self->{state} = DATA_STATE;
1304 wakaba 1.5 $self->{s_kwd} = '';
1305 wakaba 1.1
1306     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1307     $self->{line_prev} = $self->{line};
1308     $self->{column_prev} = $self->{column};
1309     $self->{column}++;
1310     $self->{nc}
1311     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1312     } else {
1313     $self->{set_nc}->($self);
1314     }
1315    
1316    
1317     return ($self->{ct}); # start tag or end tag
1318    
1319     redo A;
1320     } elsif (0x0041 <= $self->{nc} and
1321     $self->{nc} <= 0x005A) { # A..Z
1322    
1323     $self->{ca}
1324 wakaba 1.4 = {name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
1325 wakaba 1.1 value => '',
1326     line => $self->{line}, column => $self->{column}};
1327     $self->{state} = ATTRIBUTE_NAME_STATE;
1328    
1329     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1330     $self->{line_prev} = $self->{line};
1331     $self->{column_prev} = $self->{column};
1332     $self->{column}++;
1333     $self->{nc}
1334     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1335     } else {
1336     $self->{set_nc}->($self);
1337     }
1338    
1339     redo A;
1340     } elsif ($self->{nc} == 0x002F) { # /
1341    
1342     $self->{state} = SELF_CLOSING_START_TAG_STATE;
1343    
1344     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1345     $self->{line_prev} = $self->{line};
1346     $self->{column_prev} = $self->{column};
1347     $self->{column}++;
1348     $self->{nc}
1349     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1350     } else {
1351     $self->{set_nc}->($self);
1352     }
1353    
1354     redo A;
1355     } elsif ($self->{nc} == -1) {
1356     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1357     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1358    
1359     $self->{last_stag_name} = $self->{ct}->{tag_name};
1360     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1361     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1362     if ($self->{ct}->{attributes}) {
1363    
1364     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1365     } else {
1366     ## NOTE: This state should never be reached.
1367    
1368     }
1369     } else {
1370     die "$0: $self->{ct}->{type}: Unknown token type";
1371     }
1372 wakaba 1.5 $self->{s_kwd} = '';
1373 wakaba 1.1 $self->{state} = DATA_STATE;
1374     # reconsume
1375    
1376     return ($self->{ct}); # start tag or end tag
1377    
1378     redo A;
1379     } else {
1380     if ($self->{nc} == 0x0022 or # "
1381     $self->{nc} == 0x0027) { # '
1382    
1383     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name');
1384     } else {
1385    
1386     }
1387     $self->{ca}
1388     = {name => chr ($self->{nc}),
1389     value => '',
1390     line => $self->{line}, column => $self->{column}};
1391     $self->{state} = ATTRIBUTE_NAME_STATE;
1392    
1393     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1394     $self->{line_prev} = $self->{line};
1395     $self->{column_prev} = $self->{column};
1396     $self->{column}++;
1397     $self->{nc}
1398     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1399     } else {
1400     $self->{set_nc}->($self);
1401     }
1402    
1403     redo A;
1404     }
1405     } elsif ($self->{state} == BEFORE_ATTRIBUTE_VALUE_STATE) {
1406     if ($is_space->{$self->{nc}}) {
1407    
1408     ## Stay in the state
1409    
1410     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1411     $self->{line_prev} = $self->{line};
1412     $self->{column_prev} = $self->{column};
1413     $self->{column}++;
1414     $self->{nc}
1415     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1416     } else {
1417     $self->{set_nc}->($self);
1418     }
1419    
1420     redo A;
1421     } elsif ($self->{nc} == 0x0022) { # "
1422    
1423     $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
1424    
1425     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1426     $self->{line_prev} = $self->{line};
1427     $self->{column_prev} = $self->{column};
1428     $self->{column}++;
1429     $self->{nc}
1430     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1431     } else {
1432     $self->{set_nc}->($self);
1433     }
1434    
1435     redo A;
1436     } elsif ($self->{nc} == 0x0026) { # &
1437    
1438     $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
1439     ## reconsume
1440     redo A;
1441     } elsif ($self->{nc} == 0x0027) { # '
1442    
1443     $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
1444    
1445     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1446     $self->{line_prev} = $self->{line};
1447     $self->{column_prev} = $self->{column};
1448     $self->{column}++;
1449     $self->{nc}
1450     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1451     } else {
1452     $self->{set_nc}->($self);
1453     }
1454    
1455     redo A;
1456     } elsif ($self->{nc} == 0x003E) { # >
1457     $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty unquoted attribute value');
1458     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1459    
1460     $self->{last_stag_name} = $self->{ct}->{tag_name};
1461     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1462     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1463     if ($self->{ct}->{attributes}) {
1464    
1465     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1466     } else {
1467     ## NOTE: This state should never be reached.
1468    
1469     }
1470     } else {
1471     die "$0: $self->{ct}->{type}: Unknown token type";
1472     }
1473     $self->{state} = DATA_STATE;
1474 wakaba 1.5 $self->{s_kwd} = '';
1475 wakaba 1.1
1476     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1477     $self->{line_prev} = $self->{line};
1478     $self->{column_prev} = $self->{column};
1479     $self->{column}++;
1480     $self->{nc}
1481     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1482     } else {
1483     $self->{set_nc}->($self);
1484     }
1485    
1486    
1487     return ($self->{ct}); # start tag or end tag
1488    
1489     redo A;
1490     } elsif ($self->{nc} == -1) {
1491     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1492     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1493    
1494     $self->{last_stag_name} = $self->{ct}->{tag_name};
1495     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1496     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1497     if ($self->{ct}->{attributes}) {
1498    
1499     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1500     } else {
1501     ## NOTE: This state should never be reached.
1502    
1503     }
1504     } else {
1505     die "$0: $self->{ct}->{type}: Unknown token type";
1506     }
1507     $self->{state} = DATA_STATE;
1508 wakaba 1.5 $self->{s_kwd} = '';
1509 wakaba 1.1 ## reconsume
1510    
1511     return ($self->{ct}); # start tag or end tag
1512    
1513     redo A;
1514     } else {
1515     if ($self->{nc} == 0x003D) { # =
1516    
1517     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute value');
1518     } else {
1519    
1520     }
1521     $self->{ca}->{value} .= chr ($self->{nc});
1522     $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
1523    
1524     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1525     $self->{line_prev} = $self->{line};
1526     $self->{column_prev} = $self->{column};
1527     $self->{column}++;
1528     $self->{nc}
1529     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1530     } else {
1531     $self->{set_nc}->($self);
1532     }
1533    
1534     redo A;
1535     }
1536     } elsif ($self->{state} == ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE) {
1537     if ($self->{nc} == 0x0022) { # "
1538    
1539     $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1540    
1541     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1542     $self->{line_prev} = $self->{line};
1543     $self->{column_prev} = $self->{column};
1544     $self->{column}++;
1545     $self->{nc}
1546     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1547     } else {
1548     $self->{set_nc}->($self);
1549     }
1550    
1551     redo A;
1552     } elsif ($self->{nc} == 0x0026) { # &
1553    
1554     ## NOTE: In the spec, the tokenizer is switched to the
1555     ## "entity in attribute value state". In this implementation, the
1556     ## tokenizer is switched to the |ENTITY_STATE|, which is an
1557     ## implementation of the "consume a character reference" algorithm.
1558     $self->{prev_state} = $self->{state};
1559     $self->{entity_add} = 0x0022; # "
1560     $self->{state} = ENTITY_STATE;
1561    
1562     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1563     $self->{line_prev} = $self->{line};
1564     $self->{column_prev} = $self->{column};
1565     $self->{column}++;
1566     $self->{nc}
1567     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1568     } else {
1569     $self->{set_nc}->($self);
1570     }
1571    
1572     redo A;
1573     } elsif ($self->{nc} == -1) {
1574     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed attribute value');
1575     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1576    
1577     $self->{last_stag_name} = $self->{ct}->{tag_name};
1578     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1579     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1580     if ($self->{ct}->{attributes}) {
1581    
1582     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1583     } else {
1584     ## NOTE: This state should never be reached.
1585    
1586     }
1587     } else {
1588     die "$0: $self->{ct}->{type}: Unknown token type";
1589     }
1590     $self->{state} = DATA_STATE;
1591 wakaba 1.5 $self->{s_kwd} = '';
1592 wakaba 1.1 ## reconsume
1593    
1594     return ($self->{ct}); # start tag or end tag
1595    
1596     redo A;
1597     } else {
1598    
1599     $self->{ca}->{value} .= chr ($self->{nc});
1600     $self->{read_until}->($self->{ca}->{value},
1601     q["&],
1602     length $self->{ca}->{value});
1603    
1604     ## Stay in the state
1605    
1606     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1607     $self->{line_prev} = $self->{line};
1608     $self->{column_prev} = $self->{column};
1609     $self->{column}++;
1610     $self->{nc}
1611     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1612     } else {
1613     $self->{set_nc}->($self);
1614     }
1615    
1616     redo A;
1617     }
1618     } elsif ($self->{state} == ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE) {
1619     if ($self->{nc} == 0x0027) { # '
1620    
1621     $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1622    
1623     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1624     $self->{line_prev} = $self->{line};
1625     $self->{column_prev} = $self->{column};
1626     $self->{column}++;
1627     $self->{nc}
1628     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1629     } else {
1630     $self->{set_nc}->($self);
1631     }
1632    
1633     redo A;
1634     } elsif ($self->{nc} == 0x0026) { # &
1635    
1636     ## NOTE: In the spec, the tokenizer is switched to the
1637     ## "entity in attribute value state". In this implementation, the
1638     ## tokenizer is switched to the |ENTITY_STATE|, which is an
1639     ## implementation of the "consume a character reference" algorithm.
1640     $self->{entity_add} = 0x0027; # '
1641     $self->{prev_state} = $self->{state};
1642     $self->{state} = ENTITY_STATE;
1643    
1644     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1645     $self->{line_prev} = $self->{line};
1646     $self->{column_prev} = $self->{column};
1647     $self->{column}++;
1648     $self->{nc}
1649     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1650     } else {
1651     $self->{set_nc}->($self);
1652     }
1653    
1654     redo A;
1655     } elsif ($self->{nc} == -1) {
1656     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed attribute value');
1657     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1658    
1659     $self->{last_stag_name} = $self->{ct}->{tag_name};
1660     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1661     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1662     if ($self->{ct}->{attributes}) {
1663    
1664     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1665     } else {
1666     ## NOTE: This state should never be reached.
1667    
1668     }
1669     } else {
1670     die "$0: $self->{ct}->{type}: Unknown token type";
1671     }
1672     $self->{state} = DATA_STATE;
1673 wakaba 1.5 $self->{s_kwd} = '';
1674 wakaba 1.1 ## reconsume
1675    
1676     return ($self->{ct}); # start tag or end tag
1677    
1678     redo A;
1679     } else {
1680    
1681     $self->{ca}->{value} .= chr ($self->{nc});
1682     $self->{read_until}->($self->{ca}->{value},
1683     q['&],
1684     length $self->{ca}->{value});
1685    
1686     ## Stay in the state
1687    
1688     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1689     $self->{line_prev} = $self->{line};
1690     $self->{column_prev} = $self->{column};
1691     $self->{column}++;
1692     $self->{nc}
1693     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1694     } else {
1695     $self->{set_nc}->($self);
1696     }
1697    
1698     redo A;
1699     }
1700     } elsif ($self->{state} == ATTRIBUTE_VALUE_UNQUOTED_STATE) {
1701     if ($is_space->{$self->{nc}}) {
1702    
1703     $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1704    
1705     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1706     $self->{line_prev} = $self->{line};
1707     $self->{column_prev} = $self->{column};
1708     $self->{column}++;
1709     $self->{nc}
1710     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1711     } else {
1712     $self->{set_nc}->($self);
1713     }
1714    
1715     redo A;
1716     } elsif ($self->{nc} == 0x0026) { # &
1717    
1718     ## NOTE: In the spec, the tokenizer is switched to the
1719     ## "entity in attribute value state". In this implementation, the
1720     ## tokenizer is switched to the |ENTITY_STATE|, which is an
1721     ## implementation of the "consume a character reference" algorithm.
1722     $self->{entity_add} = -1;
1723     $self->{prev_state} = $self->{state};
1724     $self->{state} = ENTITY_STATE;
1725    
1726     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1727     $self->{line_prev} = $self->{line};
1728     $self->{column_prev} = $self->{column};
1729     $self->{column}++;
1730     $self->{nc}
1731     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1732     } else {
1733     $self->{set_nc}->($self);
1734     }
1735    
1736     redo A;
1737     } elsif ($self->{nc} == 0x003E) { # >
1738     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1739    
1740     $self->{last_stag_name} = $self->{ct}->{tag_name};
1741     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1742     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1743     if ($self->{ct}->{attributes}) {
1744    
1745     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1746     } else {
1747     ## NOTE: This state should never be reached.
1748    
1749     }
1750     } else {
1751     die "$0: $self->{ct}->{type}: Unknown token type";
1752     }
1753     $self->{state} = DATA_STATE;
1754 wakaba 1.5 $self->{s_kwd} = '';
1755 wakaba 1.1
1756     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1757     $self->{line_prev} = $self->{line};
1758     $self->{column_prev} = $self->{column};
1759     $self->{column}++;
1760     $self->{nc}
1761     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1762     } else {
1763     $self->{set_nc}->($self);
1764     }
1765    
1766    
1767     return ($self->{ct}); # start tag or end tag
1768    
1769     redo A;
1770     } elsif ($self->{nc} == -1) {
1771     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1772     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1773    
1774     $self->{last_stag_name} = $self->{ct}->{tag_name};
1775     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1776     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1777     if ($self->{ct}->{attributes}) {
1778    
1779     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1780     } else {
1781     ## NOTE: This state should never be reached.
1782    
1783     }
1784     } else {
1785     die "$0: $self->{ct}->{type}: Unknown token type";
1786     }
1787     $self->{state} = DATA_STATE;
1788 wakaba 1.5 $self->{s_kwd} = '';
1789 wakaba 1.1 ## reconsume
1790    
1791     return ($self->{ct}); # start tag or end tag
1792    
1793     redo A;
1794     } else {
1795     if ({
1796     0x0022 => 1, # "
1797     0x0027 => 1, # '
1798     0x003D => 1, # =
1799     }->{$self->{nc}}) {
1800    
1801     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute value');
1802     } else {
1803    
1804     }
1805     $self->{ca}->{value} .= chr ($self->{nc});
1806     $self->{read_until}->($self->{ca}->{value},
1807     q["'=& >],
1808     length $self->{ca}->{value});
1809    
1810     ## Stay in the state
1811    
1812     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1813     $self->{line_prev} = $self->{line};
1814     $self->{column_prev} = $self->{column};
1815     $self->{column}++;
1816     $self->{nc}
1817     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1818     } else {
1819     $self->{set_nc}->($self);
1820     }
1821    
1822     redo A;
1823     }
1824     } elsif ($self->{state} == AFTER_ATTRIBUTE_VALUE_QUOTED_STATE) {
1825     if ($is_space->{$self->{nc}}) {
1826    
1827     $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1828    
1829     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1830     $self->{line_prev} = $self->{line};
1831     $self->{column_prev} = $self->{column};
1832     $self->{column}++;
1833     $self->{nc}
1834     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1835     } else {
1836     $self->{set_nc}->($self);
1837     }
1838    
1839     redo A;
1840     } elsif ($self->{nc} == 0x003E) { # >
1841     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1842    
1843     $self->{last_stag_name} = $self->{ct}->{tag_name};
1844     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1845     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1846     if ($self->{ct}->{attributes}) {
1847    
1848     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1849     } else {
1850     ## NOTE: This state should never be reached.
1851    
1852     }
1853     } else {
1854     die "$0: $self->{ct}->{type}: Unknown token type";
1855     }
1856     $self->{state} = DATA_STATE;
1857 wakaba 1.5 $self->{s_kwd} = '';
1858 wakaba 1.1
1859     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1860     $self->{line_prev} = $self->{line};
1861     $self->{column_prev} = $self->{column};
1862     $self->{column}++;
1863     $self->{nc}
1864     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1865     } else {
1866     $self->{set_nc}->($self);
1867     }
1868    
1869    
1870     return ($self->{ct}); # start tag or end tag
1871    
1872     redo A;
1873     } elsif ($self->{nc} == 0x002F) { # /
1874    
1875     $self->{state} = SELF_CLOSING_START_TAG_STATE;
1876    
1877     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1878     $self->{line_prev} = $self->{line};
1879     $self->{column_prev} = $self->{column};
1880     $self->{column}++;
1881     $self->{nc}
1882     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1883     } else {
1884     $self->{set_nc}->($self);
1885     }
1886    
1887     redo A;
1888     } elsif ($self->{nc} == -1) {
1889     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1890     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1891    
1892     $self->{last_stag_name} = $self->{ct}->{tag_name};
1893     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1894     if ($self->{ct}->{attributes}) {
1895    
1896     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1897     } else {
1898     ## NOTE: This state should never be reached.
1899    
1900     }
1901     } else {
1902     die "$0: $self->{ct}->{type}: Unknown token type";
1903     }
1904     $self->{state} = DATA_STATE;
1905 wakaba 1.5 $self->{s_kwd} = '';
1906 wakaba 1.1 ## Reconsume.
1907     return ($self->{ct}); # start tag or end tag
1908     redo A;
1909     } else {
1910    
1911     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space between attributes');
1912     $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1913     ## reconsume
1914     redo A;
1915     }
1916     } elsif ($self->{state} == SELF_CLOSING_START_TAG_STATE) {
1917     if ($self->{nc} == 0x003E) { # >
1918     if ($self->{ct}->{type} == END_TAG_TOKEN) {
1919    
1920     $self->{parse_error}->(level => $self->{level}->{must}, type => 'nestc', token => $self->{ct});
1921     ## TODO: Different type than slash in start tag
1922     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1923     if ($self->{ct}->{attributes}) {
1924    
1925     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1926     } else {
1927    
1928     }
1929     ## TODO: Test |<title></title/>|
1930     } else {
1931    
1932     $self->{self_closing} = 1;
1933     }
1934    
1935     $self->{state} = DATA_STATE;
1936 wakaba 1.5 $self->{s_kwd} = '';
1937 wakaba 1.1
1938     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1939     $self->{line_prev} = $self->{line};
1940     $self->{column_prev} = $self->{column};
1941     $self->{column}++;
1942     $self->{nc}
1943     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1944     } else {
1945     $self->{set_nc}->($self);
1946     }
1947    
1948    
1949     return ($self->{ct}); # start tag or end tag
1950    
1951     redo A;
1952     } elsif ($self->{nc} == -1) {
1953     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1954     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1955    
1956     $self->{last_stag_name} = $self->{ct}->{tag_name};
1957     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1958     if ($self->{ct}->{attributes}) {
1959    
1960     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1961     } else {
1962     ## NOTE: This state should never be reached.
1963    
1964     }
1965     } else {
1966     die "$0: $self->{ct}->{type}: Unknown token type";
1967     }
1968     $self->{state} = DATA_STATE;
1969 wakaba 1.5 $self->{s_kwd} = '';
1970 wakaba 1.1 ## Reconsume.
1971     return ($self->{ct}); # start tag or end tag
1972     redo A;
1973     } else {
1974    
1975     $self->{parse_error}->(level => $self->{level}->{must}, type => 'nestc');
1976     ## TODO: This error type is wrong.
1977     $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1978     ## Reconsume.
1979     redo A;
1980     }
1981     } elsif ($self->{state} == BOGUS_COMMENT_STATE) {
1982     ## (only happen if PCDATA state)
1983    
1984     ## NOTE: Unlike spec's "bogus comment state", this implementation
1985     ## consumes characters one-by-one basis.
1986    
1987     if ($self->{nc} == 0x003E) { # >
1988    
1989     $self->{state} = DATA_STATE;
1990 wakaba 1.5 $self->{s_kwd} = '';
1991 wakaba 1.1
1992     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1993     $self->{line_prev} = $self->{line};
1994     $self->{column_prev} = $self->{column};
1995     $self->{column}++;
1996     $self->{nc}
1997     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1998     } else {
1999     $self->{set_nc}->($self);
2000     }
2001    
2002    
2003     return ($self->{ct}); # comment
2004     redo A;
2005     } elsif ($self->{nc} == -1) {
2006    
2007     $self->{state} = DATA_STATE;
2008 wakaba 1.5 $self->{s_kwd} = '';
2009 wakaba 1.1 ## reconsume
2010    
2011     return ($self->{ct}); # comment
2012     redo A;
2013     } else {
2014    
2015     $self->{ct}->{data} .= chr ($self->{nc}); # comment
2016     $self->{read_until}->($self->{ct}->{data},
2017     q[>],
2018     length $self->{ct}->{data});
2019    
2020     ## Stay in the state.
2021    
2022     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2023     $self->{line_prev} = $self->{line};
2024     $self->{column_prev} = $self->{column};
2025     $self->{column}++;
2026     $self->{nc}
2027     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2028     } else {
2029     $self->{set_nc}->($self);
2030     }
2031    
2032     redo A;
2033     }
2034     } elsif ($self->{state} == MARKUP_DECLARATION_OPEN_STATE) {
2035     ## (only happen if PCDATA state)
2036    
2037     if ($self->{nc} == 0x002D) { # -
2038    
2039     $self->{state} = MD_HYPHEN_STATE;
2040    
2041     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2042     $self->{line_prev} = $self->{line};
2043     $self->{column_prev} = $self->{column};
2044     $self->{column}++;
2045     $self->{nc}
2046     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2047     } else {
2048     $self->{set_nc}->($self);
2049     }
2050    
2051     redo A;
2052     } elsif ($self->{nc} == 0x0044 or # D
2053     $self->{nc} == 0x0064) { # d
2054     ## ASCII case-insensitive.
2055    
2056     $self->{state} = MD_DOCTYPE_STATE;
2057     $self->{s_kwd} = chr $self->{nc};
2058    
2059     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2060     $self->{line_prev} = $self->{line};
2061     $self->{column_prev} = $self->{column};
2062     $self->{column}++;
2063     $self->{nc}
2064     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2065     } else {
2066     $self->{set_nc}->($self);
2067     }
2068    
2069     redo A;
2070 wakaba 1.3 } elsif ((($self->{insertion_mode} & IN_FOREIGN_CONTENT_IM and
2071     $self->{open_elements}->[-1]->[1] & FOREIGN_EL) or
2072     $self->{is_xml}) and
2073 wakaba 1.1 $self->{nc} == 0x005B) { # [
2074    
2075     $self->{state} = MD_CDATA_STATE;
2076     $self->{s_kwd} = '[';
2077    
2078     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2079     $self->{line_prev} = $self->{line};
2080     $self->{column_prev} = $self->{column};
2081     $self->{column}++;
2082     $self->{nc}
2083     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2084     } else {
2085     $self->{set_nc}->($self);
2086     }
2087    
2088     redo A;
2089     } else {
2090    
2091     }
2092    
2093     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
2094     line => $self->{line_prev},
2095     column => $self->{column_prev} - 1);
2096     ## Reconsume.
2097     $self->{state} = BOGUS_COMMENT_STATE;
2098     $self->{ct} = {type => COMMENT_TOKEN, data => '',
2099     line => $self->{line_prev},
2100     column => $self->{column_prev} - 1,
2101     };
2102     redo A;
2103     } elsif ($self->{state} == MD_HYPHEN_STATE) {
2104     if ($self->{nc} == 0x002D) { # -
2105    
2106     $self->{ct} = {type => COMMENT_TOKEN, data => '',
2107     line => $self->{line_prev},
2108     column => $self->{column_prev} - 2,
2109     };
2110     $self->{state} = COMMENT_START_STATE;
2111    
2112     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2113     $self->{line_prev} = $self->{line};
2114     $self->{column_prev} = $self->{column};
2115     $self->{column}++;
2116     $self->{nc}
2117     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2118     } else {
2119     $self->{set_nc}->($self);
2120     }
2121    
2122     redo A;
2123     } else {
2124    
2125     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
2126     line => $self->{line_prev},
2127     column => $self->{column_prev} - 2);
2128     $self->{state} = BOGUS_COMMENT_STATE;
2129     ## Reconsume.
2130     $self->{ct} = {type => COMMENT_TOKEN,
2131     data => '-',
2132     line => $self->{line_prev},
2133     column => $self->{column_prev} - 2,
2134     };
2135     redo A;
2136     }
2137     } elsif ($self->{state} == MD_DOCTYPE_STATE) {
2138     ## ASCII case-insensitive.
2139     if ($self->{nc} == [
2140     undef,
2141     0x004F, # O
2142     0x0043, # C
2143     0x0054, # T
2144     0x0059, # Y
2145     0x0050, # P
2146     ]->[length $self->{s_kwd}] or
2147     $self->{nc} == [
2148     undef,
2149     0x006F, # o
2150     0x0063, # c
2151     0x0074, # t
2152     0x0079, # y
2153     0x0070, # p
2154     ]->[length $self->{s_kwd}]) {
2155    
2156     ## Stay in the state.
2157     $self->{s_kwd} .= chr $self->{nc};
2158    
2159     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2160     $self->{line_prev} = $self->{line};
2161     $self->{column_prev} = $self->{column};
2162     $self->{column}++;
2163     $self->{nc}
2164     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2165     } else {
2166     $self->{set_nc}->($self);
2167     }
2168    
2169     redo A;
2170     } elsif ((length $self->{s_kwd}) == 6 and
2171     ($self->{nc} == 0x0045 or # E
2172     $self->{nc} == 0x0065)) { # e
2173    
2174     $self->{state} = DOCTYPE_STATE;
2175     $self->{ct} = {type => DOCTYPE_TOKEN,
2176     quirks => 1,
2177     line => $self->{line_prev},
2178     column => $self->{column_prev} - 7,
2179     };
2180    
2181     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2182     $self->{line_prev} = $self->{line};
2183     $self->{column_prev} = $self->{column};
2184     $self->{column}++;
2185     $self->{nc}
2186     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2187     } else {
2188     $self->{set_nc}->($self);
2189     }
2190    
2191     redo A;
2192     } else {
2193    
2194     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
2195     line => $self->{line_prev},
2196     column => $self->{column_prev} - 1 - length $self->{s_kwd});
2197     $self->{state} = BOGUS_COMMENT_STATE;
2198     ## Reconsume.
2199     $self->{ct} = {type => COMMENT_TOKEN,
2200     data => $self->{s_kwd},
2201     line => $self->{line_prev},
2202     column => $self->{column_prev} - 1 - length $self->{s_kwd},
2203     };
2204     redo A;
2205     }
2206     } elsif ($self->{state} == MD_CDATA_STATE) {
2207     if ($self->{nc} == {
2208     '[' => 0x0043, # C
2209     '[C' => 0x0044, # D
2210     '[CD' => 0x0041, # A
2211     '[CDA' => 0x0054, # T
2212     '[CDAT' => 0x0041, # A
2213     }->{$self->{s_kwd}}) {
2214    
2215     ## Stay in the state.
2216     $self->{s_kwd} .= chr $self->{nc};
2217    
2218     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2219     $self->{line_prev} = $self->{line};
2220     $self->{column_prev} = $self->{column};
2221     $self->{column}++;
2222     $self->{nc}
2223     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2224     } else {
2225     $self->{set_nc}->($self);
2226     }
2227    
2228     redo A;
2229     } elsif ($self->{s_kwd} eq '[CDATA' and
2230     $self->{nc} == 0x005B) { # [
2231    
2232 wakaba 1.6
2233     if ($self->{is_xml} and
2234     not $self->{tainted} and
2235     @{$self->{open_elements} or []} == 0) {
2236     $self->{parse_error}->(level => $self->{level}->{must}, type => 'cdata outside of root element',
2237     line => $self->{line_prev},
2238     column => $self->{column_prev} - 7);
2239     $self->{tainted} = 1;
2240     }
2241    
2242 wakaba 1.1 $self->{ct} = {type => CHARACTER_TOKEN,
2243     data => '',
2244     line => $self->{line_prev},
2245     column => $self->{column_prev} - 7};
2246     $self->{state} = CDATA_SECTION_STATE;
2247    
2248     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2249     $self->{line_prev} = $self->{line};
2250     $self->{column_prev} = $self->{column};
2251     $self->{column}++;
2252     $self->{nc}
2253     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2254     } else {
2255     $self->{set_nc}->($self);
2256     }
2257    
2258     redo A;
2259     } else {
2260    
2261     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
2262     line => $self->{line_prev},
2263     column => $self->{column_prev} - 1 - length $self->{s_kwd});
2264     $self->{state} = BOGUS_COMMENT_STATE;
2265     ## Reconsume.
2266     $self->{ct} = {type => COMMENT_TOKEN,
2267     data => $self->{s_kwd},
2268     line => $self->{line_prev},
2269     column => $self->{column_prev} - 1 - length $self->{s_kwd},
2270     };
2271     redo A;
2272     }
2273     } elsif ($self->{state} == COMMENT_START_STATE) {
2274     if ($self->{nc} == 0x002D) { # -
2275    
2276     $self->{state} = COMMENT_START_DASH_STATE;
2277    
2278     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2279     $self->{line_prev} = $self->{line};
2280     $self->{column_prev} = $self->{column};
2281     $self->{column}++;
2282     $self->{nc}
2283     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2284     } else {
2285     $self->{set_nc}->($self);
2286     }
2287    
2288     redo A;
2289     } elsif ($self->{nc} == 0x003E) { # >
2290    
2291     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment');
2292     $self->{state} = DATA_STATE;
2293 wakaba 1.5 $self->{s_kwd} = '';
2294 wakaba 1.1
2295     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2296     $self->{line_prev} = $self->{line};
2297     $self->{column_prev} = $self->{column};
2298     $self->{column}++;
2299     $self->{nc}
2300     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2301     } else {
2302     $self->{set_nc}->($self);
2303     }
2304    
2305    
2306     return ($self->{ct}); # comment
2307    
2308     redo A;
2309     } elsif ($self->{nc} == -1) {
2310    
2311     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2312     $self->{state} = DATA_STATE;
2313 wakaba 1.5 $self->{s_kwd} = '';
2314 wakaba 1.1 ## reconsume
2315    
2316     return ($self->{ct}); # comment
2317    
2318     redo A;
2319     } else {
2320    
2321     $self->{ct}->{data} # comment
2322     .= chr ($self->{nc});
2323     $self->{state} = COMMENT_STATE;
2324    
2325     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2326     $self->{line_prev} = $self->{line};
2327     $self->{column_prev} = $self->{column};
2328     $self->{column}++;
2329     $self->{nc}
2330     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2331     } else {
2332     $self->{set_nc}->($self);
2333     }
2334    
2335     redo A;
2336     }
2337     } elsif ($self->{state} == COMMENT_START_DASH_STATE) {
2338     if ($self->{nc} == 0x002D) { # -
2339    
2340     $self->{state} = COMMENT_END_STATE;
2341    
2342     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2343     $self->{line_prev} = $self->{line};
2344     $self->{column_prev} = $self->{column};
2345     $self->{column}++;
2346     $self->{nc}
2347     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2348     } else {
2349     $self->{set_nc}->($self);
2350     }
2351    
2352     redo A;
2353     } elsif ($self->{nc} == 0x003E) { # >
2354    
2355     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment');
2356     $self->{state} = DATA_STATE;
2357 wakaba 1.5 $self->{s_kwd} = '';
2358 wakaba 1.1
2359     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2360     $self->{line_prev} = $self->{line};
2361     $self->{column_prev} = $self->{column};
2362     $self->{column}++;
2363     $self->{nc}
2364     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2365     } else {
2366     $self->{set_nc}->($self);
2367     }
2368    
2369    
2370     return ($self->{ct}); # comment
2371    
2372     redo A;
2373     } elsif ($self->{nc} == -1) {
2374    
2375     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2376     $self->{state} = DATA_STATE;
2377 wakaba 1.5 $self->{s_kwd} = '';
2378 wakaba 1.1 ## reconsume
2379    
2380     return ($self->{ct}); # comment
2381    
2382     redo A;
2383     } else {
2384    
2385     $self->{ct}->{data} # comment
2386     .= '-' . chr ($self->{nc});
2387     $self->{state} = COMMENT_STATE;
2388    
2389     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2390     $self->{line_prev} = $self->{line};
2391     $self->{column_prev} = $self->{column};
2392     $self->{column}++;
2393     $self->{nc}
2394     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2395     } else {
2396     $self->{set_nc}->($self);
2397     }
2398    
2399     redo A;
2400     }
2401     } elsif ($self->{state} == COMMENT_STATE) {
2402     if ($self->{nc} == 0x002D) { # -
2403    
2404     $self->{state} = COMMENT_END_DASH_STATE;
2405    
2406     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2407     $self->{line_prev} = $self->{line};
2408     $self->{column_prev} = $self->{column};
2409     $self->{column}++;
2410     $self->{nc}
2411     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2412     } else {
2413     $self->{set_nc}->($self);
2414     }
2415    
2416     redo A;
2417     } elsif ($self->{nc} == -1) {
2418    
2419     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2420     $self->{state} = DATA_STATE;
2421 wakaba 1.5 $self->{s_kwd} = '';
2422 wakaba 1.1 ## reconsume
2423    
2424     return ($self->{ct}); # comment
2425    
2426     redo A;
2427     } else {
2428    
2429     $self->{ct}->{data} .= chr ($self->{nc}); # comment
2430     $self->{read_until}->($self->{ct}->{data},
2431     q[-],
2432     length $self->{ct}->{data});
2433    
2434     ## Stay in the state
2435    
2436     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2437     $self->{line_prev} = $self->{line};
2438     $self->{column_prev} = $self->{column};
2439     $self->{column}++;
2440     $self->{nc}
2441     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2442     } else {
2443     $self->{set_nc}->($self);
2444     }
2445    
2446     redo A;
2447     }
2448     } elsif ($self->{state} == COMMENT_END_DASH_STATE) {
2449     if ($self->{nc} == 0x002D) { # -
2450    
2451     $self->{state} = COMMENT_END_STATE;
2452    
2453     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2454     $self->{line_prev} = $self->{line};
2455     $self->{column_prev} = $self->{column};
2456     $self->{column}++;
2457     $self->{nc}
2458     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2459     } else {
2460     $self->{set_nc}->($self);
2461     }
2462    
2463     redo A;
2464     } elsif ($self->{nc} == -1) {
2465    
2466     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2467 wakaba 1.5 $self->{s_kwd} = '';
2468 wakaba 1.1 $self->{state} = DATA_STATE;
2469 wakaba 1.5 $self->{s_kwd} = '';
2470 wakaba 1.1 ## reconsume
2471    
2472     return ($self->{ct}); # comment
2473    
2474     redo A;
2475     } else {
2476    
2477     $self->{ct}->{data} .= '-' . chr ($self->{nc}); # comment
2478     $self->{state} = COMMENT_STATE;
2479    
2480     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2481     $self->{line_prev} = $self->{line};
2482     $self->{column_prev} = $self->{column};
2483     $self->{column}++;
2484     $self->{nc}
2485     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2486     } else {
2487     $self->{set_nc}->($self);
2488     }
2489    
2490     redo A;
2491     }
2492     } elsif ($self->{state} == COMMENT_END_STATE) {
2493     if ($self->{nc} == 0x003E) { # >
2494    
2495     $self->{state} = DATA_STATE;
2496 wakaba 1.5 $self->{s_kwd} = '';
2497 wakaba 1.1
2498     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2499     $self->{line_prev} = $self->{line};
2500     $self->{column_prev} = $self->{column};
2501     $self->{column}++;
2502     $self->{nc}
2503     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2504     } else {
2505     $self->{set_nc}->($self);
2506     }
2507    
2508    
2509     return ($self->{ct}); # comment
2510    
2511     redo A;
2512     } elsif ($self->{nc} == 0x002D) { # -
2513    
2514     $self->{parse_error}->(level => $self->{level}->{must}, type => 'dash in comment',
2515     line => $self->{line_prev},
2516     column => $self->{column_prev});
2517     $self->{ct}->{data} .= '-'; # comment
2518     ## Stay in the state
2519    
2520     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2521     $self->{line_prev} = $self->{line};
2522     $self->{column_prev} = $self->{column};
2523     $self->{column}++;
2524     $self->{nc}
2525     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2526     } else {
2527     $self->{set_nc}->($self);
2528     }
2529    
2530     redo A;
2531     } elsif ($self->{nc} == -1) {
2532    
2533     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2534     $self->{state} = DATA_STATE;
2535 wakaba 1.5 $self->{s_kwd} = '';
2536 wakaba 1.1 ## reconsume
2537    
2538     return ($self->{ct}); # comment
2539    
2540     redo A;
2541     } else {
2542    
2543     $self->{parse_error}->(level => $self->{level}->{must}, type => 'dash in comment',
2544     line => $self->{line_prev},
2545     column => $self->{column_prev});
2546     $self->{ct}->{data} .= '--' . chr ($self->{nc}); # comment
2547     $self->{state} = COMMENT_STATE;
2548    
2549     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2550     $self->{line_prev} = $self->{line};
2551     $self->{column_prev} = $self->{column};
2552     $self->{column}++;
2553     $self->{nc}
2554     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2555     } else {
2556     $self->{set_nc}->($self);
2557     }
2558    
2559     redo A;
2560     }
2561     } elsif ($self->{state} == DOCTYPE_STATE) {
2562     if ($is_space->{$self->{nc}}) {
2563    
2564     $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
2565    
2566     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2567     $self->{line_prev} = $self->{line};
2568     $self->{column_prev} = $self->{column};
2569     $self->{column}++;
2570     $self->{nc}
2571     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2572     } else {
2573     $self->{set_nc}->($self);
2574     }
2575    
2576     redo A;
2577     } else {
2578    
2579     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before DOCTYPE name');
2580     $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
2581     ## reconsume
2582     redo A;
2583     }
2584     } elsif ($self->{state} == BEFORE_DOCTYPE_NAME_STATE) {
2585     if ($is_space->{$self->{nc}}) {
2586    
2587     ## Stay in the state
2588    
2589     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2590     $self->{line_prev} = $self->{line};
2591     $self->{column_prev} = $self->{column};
2592     $self->{column}++;
2593     $self->{nc}
2594     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2595     } else {
2596     $self->{set_nc}->($self);
2597     }
2598    
2599     redo A;
2600     } elsif ($self->{nc} == 0x003E) { # >
2601    
2602     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no DOCTYPE name');
2603     $self->{state} = DATA_STATE;
2604 wakaba 1.5 $self->{s_kwd} = '';
2605 wakaba 1.1
2606     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2607     $self->{line_prev} = $self->{line};
2608     $self->{column_prev} = $self->{column};
2609     $self->{column}++;
2610     $self->{nc}
2611     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2612     } else {
2613     $self->{set_nc}->($self);
2614     }
2615    
2616    
2617     return ($self->{ct}); # DOCTYPE (quirks)
2618    
2619     redo A;
2620     } elsif ($self->{nc} == -1) {
2621    
2622     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no DOCTYPE name');
2623     $self->{state} = DATA_STATE;
2624 wakaba 1.5 $self->{s_kwd} = '';
2625 wakaba 1.1 ## reconsume
2626    
2627     return ($self->{ct}); # DOCTYPE (quirks)
2628    
2629     redo A;
2630     } else {
2631    
2632     $self->{ct}->{name} = chr $self->{nc};
2633     delete $self->{ct}->{quirks};
2634     $self->{state} = DOCTYPE_NAME_STATE;
2635    
2636     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2637     $self->{line_prev} = $self->{line};
2638     $self->{column_prev} = $self->{column};
2639     $self->{column}++;
2640     $self->{nc}
2641     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2642     } else {
2643     $self->{set_nc}->($self);
2644     }
2645    
2646     redo A;
2647     }
2648     } elsif ($self->{state} == DOCTYPE_NAME_STATE) {
2649     ## ISSUE: Redundant "First," in the spec.
2650     if ($is_space->{$self->{nc}}) {
2651    
2652     $self->{state} = AFTER_DOCTYPE_NAME_STATE;
2653    
2654     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2655     $self->{line_prev} = $self->{line};
2656     $self->{column_prev} = $self->{column};
2657     $self->{column}++;
2658     $self->{nc}
2659     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2660     } else {
2661     $self->{set_nc}->($self);
2662     }
2663    
2664     redo A;
2665     } elsif ($self->{nc} == 0x003E) { # >
2666    
2667     $self->{state} = DATA_STATE;
2668 wakaba 1.5 $self->{s_kwd} = '';
2669 wakaba 1.1
2670     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2671     $self->{line_prev} = $self->{line};
2672     $self->{column_prev} = $self->{column};
2673     $self->{column}++;
2674     $self->{nc}
2675     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2676     } else {
2677     $self->{set_nc}->($self);
2678     }
2679    
2680    
2681     return ($self->{ct}); # DOCTYPE
2682    
2683     redo A;
2684     } elsif ($self->{nc} == -1) {
2685    
2686     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
2687     $self->{state} = DATA_STATE;
2688 wakaba 1.5 $self->{s_kwd} = '';
2689 wakaba 1.1 ## reconsume
2690    
2691     $self->{ct}->{quirks} = 1;
2692     return ($self->{ct}); # DOCTYPE
2693    
2694     redo A;
2695     } else {
2696    
2697     $self->{ct}->{name}
2698     .= chr ($self->{nc}); # DOCTYPE
2699     ## Stay in the state
2700    
2701     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2702     $self->{line_prev} = $self->{line};
2703     $self->{column_prev} = $self->{column};
2704     $self->{column}++;
2705     $self->{nc}
2706     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2707     } else {
2708     $self->{set_nc}->($self);
2709     }
2710    
2711     redo A;
2712     }
2713     } elsif ($self->{state} == AFTER_DOCTYPE_NAME_STATE) {
2714     if ($is_space->{$self->{nc}}) {
2715    
2716     ## Stay in the state
2717    
2718     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2719     $self->{line_prev} = $self->{line};
2720     $self->{column_prev} = $self->{column};
2721     $self->{column}++;
2722     $self->{nc}
2723     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2724     } else {
2725     $self->{set_nc}->($self);
2726     }
2727    
2728     redo A;
2729     } elsif ($self->{nc} == 0x003E) { # >
2730    
2731     $self->{state} = DATA_STATE;
2732 wakaba 1.5 $self->{s_kwd} = '';
2733 wakaba 1.1
2734     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2735     $self->{line_prev} = $self->{line};
2736     $self->{column_prev} = $self->{column};
2737     $self->{column}++;
2738     $self->{nc}
2739     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2740     } else {
2741     $self->{set_nc}->($self);
2742     }
2743    
2744    
2745     return ($self->{ct}); # DOCTYPE
2746    
2747     redo A;
2748     } elsif ($self->{nc} == -1) {
2749    
2750     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
2751     $self->{state} = DATA_STATE;
2752 wakaba 1.5 $self->{s_kwd} = '';
2753 wakaba 1.1 ## reconsume
2754    
2755     $self->{ct}->{quirks} = 1;
2756     return ($self->{ct}); # DOCTYPE
2757    
2758     redo A;
2759     } elsif ($self->{nc} == 0x0050 or # P
2760     $self->{nc} == 0x0070) { # p
2761     $self->{state} = PUBLIC_STATE;
2762     $self->{s_kwd} = chr $self->{nc};
2763    
2764     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2765     $self->{line_prev} = $self->{line};
2766     $self->{column_prev} = $self->{column};
2767     $self->{column}++;
2768     $self->{nc}
2769     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2770     } else {
2771     $self->{set_nc}->($self);
2772     }
2773    
2774     redo A;
2775     } elsif ($self->{nc} == 0x0053 or # S
2776     $self->{nc} == 0x0073) { # s
2777     $self->{state} = SYSTEM_STATE;
2778     $self->{s_kwd} = chr $self->{nc};
2779    
2780     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2781     $self->{line_prev} = $self->{line};
2782     $self->{column_prev} = $self->{column};
2783     $self->{column}++;
2784     $self->{nc}
2785     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2786     } else {
2787     $self->{set_nc}->($self);
2788     }
2789    
2790     redo A;
2791     } else {
2792    
2793     $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after DOCTYPE name');
2794     $self->{ct}->{quirks} = 1;
2795    
2796     $self->{state} = BOGUS_DOCTYPE_STATE;
2797    
2798     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2799     $self->{line_prev} = $self->{line};
2800     $self->{column_prev} = $self->{column};
2801     $self->{column}++;
2802     $self->{nc}
2803     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2804     } else {
2805     $self->{set_nc}->($self);
2806     }
2807    
2808     redo A;
2809     }
2810     } elsif ($self->{state} == PUBLIC_STATE) {
2811     ## ASCII case-insensitive
2812     if ($self->{nc} == [
2813     undef,
2814     0x0055, # U
2815     0x0042, # B
2816     0x004C, # L
2817     0x0049, # I
2818     ]->[length $self->{s_kwd}] or
2819     $self->{nc} == [
2820     undef,
2821     0x0075, # u
2822     0x0062, # b
2823     0x006C, # l
2824     0x0069, # i
2825     ]->[length $self->{s_kwd}]) {
2826    
2827     ## Stay in the state.
2828     $self->{s_kwd} .= chr $self->{nc};
2829    
2830     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2831     $self->{line_prev} = $self->{line};
2832     $self->{column_prev} = $self->{column};
2833     $self->{column}++;
2834     $self->{nc}
2835     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2836     } else {
2837     $self->{set_nc}->($self);
2838     }
2839    
2840     redo A;
2841     } elsif ((length $self->{s_kwd}) == 5 and
2842     ($self->{nc} == 0x0043 or # C
2843     $self->{nc} == 0x0063)) { # c
2844    
2845     $self->{state} = BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
2846    
2847     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2848     $self->{line_prev} = $self->{line};
2849     $self->{column_prev} = $self->{column};
2850     $self->{column}++;
2851     $self->{nc}
2852     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2853     } else {
2854     $self->{set_nc}->($self);
2855     }
2856    
2857     redo A;
2858     } else {
2859    
2860     $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after DOCTYPE name',
2861     line => $self->{line_prev},
2862     column => $self->{column_prev} + 1 - length $self->{s_kwd});
2863     $self->{ct}->{quirks} = 1;
2864    
2865     $self->{state} = BOGUS_DOCTYPE_STATE;
2866     ## Reconsume.
2867     redo A;
2868     }
2869     } elsif ($self->{state} == SYSTEM_STATE) {
2870     ## ASCII case-insensitive
2871     if ($self->{nc} == [
2872     undef,
2873     0x0059, # Y
2874     0x0053, # S
2875     0x0054, # T
2876     0x0045, # E
2877     ]->[length $self->{s_kwd}] or
2878     $self->{nc} == [
2879     undef,
2880     0x0079, # y
2881     0x0073, # s
2882     0x0074, # t
2883     0x0065, # e
2884     ]->[length $self->{s_kwd}]) {
2885    
2886     ## Stay in the state.
2887     $self->{s_kwd} .= chr $self->{nc};
2888    
2889     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2890     $self->{line_prev} = $self->{line};
2891     $self->{column_prev} = $self->{column};
2892     $self->{column}++;
2893     $self->{nc}
2894     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2895     } else {
2896     $self->{set_nc}->($self);
2897     }
2898    
2899     redo A;
2900     } elsif ((length $self->{s_kwd}) == 5 and
2901     ($self->{nc} == 0x004D or # M
2902     $self->{nc} == 0x006D)) { # m
2903    
2904     $self->{state} = BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
2905    
2906     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2907     $self->{line_prev} = $self->{line};
2908     $self->{column_prev} = $self->{column};
2909     $self->{column}++;
2910     $self->{nc}
2911     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2912     } else {
2913     $self->{set_nc}->($self);
2914     }
2915    
2916     redo A;
2917     } else {
2918    
2919     $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after DOCTYPE name',
2920     line => $self->{line_prev},
2921     column => $self->{column_prev} + 1 - length $self->{s_kwd});
2922     $self->{ct}->{quirks} = 1;
2923    
2924     $self->{state} = BOGUS_DOCTYPE_STATE;
2925     ## Reconsume.
2926     redo A;
2927     }
2928     } elsif ($self->{state} == BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE) {
2929     if ($is_space->{$self->{nc}}) {
2930    
2931     ## Stay in the state
2932    
2933     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2934     $self->{line_prev} = $self->{line};
2935     $self->{column_prev} = $self->{column};
2936     $self->{column}++;
2937     $self->{nc}
2938     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2939     } else {
2940     $self->{set_nc}->($self);
2941     }
2942    
2943     redo A;
2944     } elsif ($self->{nc} eq 0x0022) { # "
2945    
2946     $self->{ct}->{pubid} = ''; # DOCTYPE
2947     $self->{state} = DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE;
2948    
2949     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2950     $self->{line_prev} = $self->{line};
2951     $self->{column_prev} = $self->{column};
2952     $self->{column}++;
2953     $self->{nc}
2954     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2955     } else {
2956     $self->{set_nc}->($self);
2957     }
2958    
2959     redo A;
2960     } elsif ($self->{nc} eq 0x0027) { # '
2961    
2962     $self->{ct}->{pubid} = ''; # DOCTYPE
2963     $self->{state} = DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE;
2964    
2965     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2966     $self->{line_prev} = $self->{line};
2967     $self->{column_prev} = $self->{column};
2968     $self->{column}++;
2969     $self->{nc}
2970     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2971     } else {
2972     $self->{set_nc}->($self);
2973     }
2974    
2975     redo A;
2976     } elsif ($self->{nc} eq 0x003E) { # >
2977    
2978     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no PUBLIC literal');
2979    
2980     $self->{state} = DATA_STATE;
2981 wakaba 1.5 $self->{s_kwd} = '';
2982 wakaba 1.1
2983     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2984     $self->{line_prev} = $self->{line};
2985     $self->{column_prev} = $self->{column};
2986     $self->{column}++;
2987     $self->{nc}
2988     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2989     } else {
2990     $self->{set_nc}->($self);
2991     }
2992    
2993    
2994     $self->{ct}->{quirks} = 1;
2995     return ($self->{ct}); # DOCTYPE
2996    
2997     redo A;
2998     } elsif ($self->{nc} == -1) {
2999    
3000     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
3001    
3002     $self->{state} = DATA_STATE;
3003 wakaba 1.5 $self->{s_kwd} = '';
3004 wakaba 1.1 ## reconsume
3005    
3006     $self->{ct}->{quirks} = 1;
3007     return ($self->{ct}); # DOCTYPE
3008    
3009     redo A;
3010     } else {
3011    
3012     $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after PUBLIC');
3013     $self->{ct}->{quirks} = 1;
3014    
3015     $self->{state} = BOGUS_DOCTYPE_STATE;
3016    
3017     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3018     $self->{line_prev} = $self->{line};
3019     $self->{column_prev} = $self->{column};
3020     $self->{column}++;
3021     $self->{nc}
3022     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3023     } else {
3024     $self->{set_nc}->($self);
3025     }
3026    
3027     redo A;
3028     }
3029     } elsif ($self->{state} == DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE) {
3030     if ($self->{nc} == 0x0022) { # "
3031    
3032     $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
3033    
3034     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3035     $self->{line_prev} = $self->{line};
3036     $self->{column_prev} = $self->{column};
3037     $self->{column}++;
3038     $self->{nc}
3039     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3040     } else {
3041     $self->{set_nc}->($self);
3042     }
3043    
3044     redo A;
3045     } elsif ($self->{nc} == 0x003E) { # >
3046    
3047     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');
3048    
3049     $self->{state} = DATA_STATE;
3050 wakaba 1.5 $self->{s_kwd} = '';
3051 wakaba 1.1
3052     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3053     $self->{line_prev} = $self->{line};
3054     $self->{column_prev} = $self->{column};
3055     $self->{column}++;
3056     $self->{nc}
3057     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3058     } else {
3059     $self->{set_nc}->($self);
3060     }
3061    
3062    
3063     $self->{ct}->{quirks} = 1;
3064     return ($self->{ct}); # DOCTYPE
3065    
3066     redo A;
3067     } elsif ($self->{nc} == -1) {
3068    
3069     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');
3070    
3071     $self->{state} = DATA_STATE;
3072 wakaba 1.5 $self->{s_kwd} = '';
3073 wakaba 1.1 ## reconsume
3074    
3075     $self->{ct}->{quirks} = 1;
3076     return ($self->{ct}); # DOCTYPE
3077    
3078     redo A;
3079     } else {
3080    
3081     $self->{ct}->{pubid} # DOCTYPE
3082     .= chr $self->{nc};
3083     $self->{read_until}->($self->{ct}->{pubid}, q[">],
3084     length $self->{ct}->{pubid});
3085    
3086     ## Stay in the state
3087    
3088     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3089     $self->{line_prev} = $self->{line};
3090     $self->{column_prev} = $self->{column};
3091     $self->{column}++;
3092     $self->{nc}
3093     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3094     } else {
3095     $self->{set_nc}->($self);
3096     }
3097    
3098     redo A;
3099     }
3100     } elsif ($self->{state} == DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE) {
3101     if ($self->{nc} == 0x0027) { # '
3102    
3103     $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
3104    
3105     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3106     $self->{line_prev} = $self->{line};
3107     $self->{column_prev} = $self->{column};
3108     $self->{column}++;
3109     $self->{nc}
3110     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3111     } else {
3112     $self->{set_nc}->($self);
3113     }
3114    
3115     redo A;
3116     } elsif ($self->{nc} == 0x003E) { # >
3117    
3118     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');
3119    
3120     $self->{state} = DATA_STATE;
3121 wakaba 1.5 $self->{s_kwd} = '';
3122 wakaba 1.1
3123     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3124     $self->{line_prev} = $self->{line};
3125     $self->{column_prev} = $self->{column};
3126     $self->{column}++;
3127     $self->{nc}
3128     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3129     } else {
3130     $self->{set_nc}->($self);
3131     }
3132    
3133    
3134     $self->{ct}->{quirks} = 1;
3135     return ($self->{ct}); # DOCTYPE
3136    
3137     redo A;
3138     } elsif ($self->{nc} == -1) {
3139    
3140     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');
3141    
3142     $self->{state} = DATA_STATE;
3143 wakaba 1.5 $self->{s_kwd} = '';
3144 wakaba 1.1 ## reconsume
3145    
3146     $self->{ct}->{quirks} = 1;
3147     return ($self->{ct}); # DOCTYPE
3148    
3149     redo A;
3150     } else {
3151    
3152     $self->{ct}->{pubid} # DOCTYPE
3153     .= chr $self->{nc};
3154     $self->{read_until}->($self->{ct}->{pubid}, q['>],
3155     length $self->{ct}->{pubid});
3156    
3157     ## Stay in the state
3158    
3159     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3160     $self->{line_prev} = $self->{line};
3161     $self->{column_prev} = $self->{column};
3162     $self->{column}++;
3163     $self->{nc}
3164     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3165     } else {
3166     $self->{set_nc}->($self);
3167     }
3168    
3169     redo A;
3170     }
3171     } elsif ($self->{state} == AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE) {
3172     if ($is_space->{$self->{nc}}) {
3173    
3174     ## Stay in the state
3175    
3176     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3177     $self->{line_prev} = $self->{line};
3178     $self->{column_prev} = $self->{column};
3179     $self->{column}++;
3180     $self->{nc}
3181     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3182     } else {
3183     $self->{set_nc}->($self);
3184     }
3185    
3186     redo A;
3187     } elsif ($self->{nc} == 0x0022) { # "
3188    
3189     $self->{ct}->{sysid} = ''; # DOCTYPE
3190     $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
3191    
3192     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3193     $self->{line_prev} = $self->{line};
3194     $self->{column_prev} = $self->{column};
3195     $self->{column}++;
3196     $self->{nc}
3197     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3198     } else {
3199     $self->{set_nc}->($self);
3200     }
3201    
3202     redo A;
3203     } elsif ($self->{nc} == 0x0027) { # '
3204    
3205     $self->{ct}->{sysid} = ''; # DOCTYPE
3206     $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
3207    
3208     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3209     $self->{line_prev} = $self->{line};
3210     $self->{column_prev} = $self->{column};
3211     $self->{column}++;
3212     $self->{nc}
3213     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3214     } else {
3215     $self->{set_nc}->($self);
3216     }
3217    
3218     redo A;
3219     } elsif ($self->{nc} == 0x003E) { # >
3220    
3221     $self->{state} = DATA_STATE;
3222 wakaba 1.5 $self->{s_kwd} = '';
3223 wakaba 1.1
3224     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3225     $self->{line_prev} = $self->{line};
3226     $self->{column_prev} = $self->{column};
3227     $self->{column}++;
3228     $self->{nc}
3229     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3230     } else {
3231     $self->{set_nc}->($self);
3232     }
3233    
3234    
3235     return ($self->{ct}); # DOCTYPE
3236    
3237     redo A;
3238     } elsif ($self->{nc} == -1) {
3239    
3240     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
3241    
3242     $self->{state} = DATA_STATE;
3243 wakaba 1.5 $self->{s_kwd} = '';
3244 wakaba 1.1 ## reconsume
3245    
3246     $self->{ct}->{quirks} = 1;
3247     return ($self->{ct}); # DOCTYPE
3248    
3249     redo A;
3250     } else {
3251    
3252     $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after PUBLIC literal');
3253     $self->{ct}->{quirks} = 1;
3254    
3255     $self->{state} = BOGUS_DOCTYPE_STATE;
3256    
3257     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3258     $self->{line_prev} = $self->{line};
3259     $self->{column_prev} = $self->{column};
3260     $self->{column}++;
3261     $self->{nc}
3262     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3263     } else {
3264     $self->{set_nc}->($self);
3265     }
3266    
3267     redo A;
3268     }
3269     } elsif ($self->{state} == BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
3270     if ($is_space->{$self->{nc}}) {
3271    
3272     ## Stay in the state
3273    
3274     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3275     $self->{line_prev} = $self->{line};
3276     $self->{column_prev} = $self->{column};
3277     $self->{column}++;
3278     $self->{nc}
3279     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3280     } else {
3281     $self->{set_nc}->($self);
3282     }
3283    
3284     redo A;
3285     } elsif ($self->{nc} == 0x0022) { # "
3286    
3287     $self->{ct}->{sysid} = ''; # DOCTYPE
3288     $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
3289    
3290     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3291     $self->{line_prev} = $self->{line};
3292     $self->{column_prev} = $self->{column};
3293     $self->{column}++;
3294     $self->{nc}
3295     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3296     } else {
3297     $self->{set_nc}->($self);
3298     }
3299    
3300     redo A;
3301     } elsif ($self->{nc} == 0x0027) { # '
3302    
3303     $self->{ct}->{sysid} = ''; # DOCTYPE
3304     $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
3305    
3306     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3307     $self->{line_prev} = $self->{line};
3308     $self->{column_prev} = $self->{column};
3309     $self->{column}++;
3310     $self->{nc}
3311     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3312     } else {
3313     $self->{set_nc}->($self);
3314     }
3315    
3316     redo A;
3317     } elsif ($self->{nc} == 0x003E) { # >
3318    
3319     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal');
3320     $self->{state} = DATA_STATE;
3321 wakaba 1.5 $self->{s_kwd} = '';
3322 wakaba 1.1
3323     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3324     $self->{line_prev} = $self->{line};
3325     $self->{column_prev} = $self->{column};
3326     $self->{column}++;
3327     $self->{nc}
3328     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3329     } else {
3330     $self->{set_nc}->($self);
3331     }
3332    
3333    
3334     $self->{ct}->{quirks} = 1;
3335     return ($self->{ct}); # DOCTYPE
3336    
3337     redo A;
3338     } elsif ($self->{nc} == -1) {
3339    
3340     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
3341    
3342     $self->{state} = DATA_STATE;
3343 wakaba 1.5 $self->{s_kwd} = '';
3344 wakaba 1.1 ## reconsume
3345    
3346     $self->{ct}->{quirks} = 1;
3347     return ($self->{ct}); # DOCTYPE
3348    
3349     redo A;
3350     } else {
3351    
3352     $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after SYSTEM');
3353     $self->{ct}->{quirks} = 1;
3354    
3355     $self->{state} = BOGUS_DOCTYPE_STATE;
3356    
3357     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3358     $self->{line_prev} = $self->{line};
3359     $self->{column_prev} = $self->{column};
3360     $self->{column}++;
3361     $self->{nc}
3362     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3363     } else {
3364     $self->{set_nc}->($self);
3365     }
3366    
3367     redo A;
3368     }
3369     } elsif ($self->{state} == DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE) {
3370     if ($self->{nc} == 0x0022) { # "
3371    
3372     $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
3373    
3374     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3375     $self->{line_prev} = $self->{line};
3376     $self->{column_prev} = $self->{column};
3377     $self->{column}++;
3378     $self->{nc}
3379     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3380     } else {
3381     $self->{set_nc}->($self);
3382     }
3383    
3384     redo A;
3385     } elsif ($self->{nc} == 0x003E) { # >
3386    
3387     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');
3388    
3389     $self->{state} = DATA_STATE;
3390 wakaba 1.5 $self->{s_kwd} = '';
3391 wakaba 1.1
3392     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3393     $self->{line_prev} = $self->{line};
3394     $self->{column_prev} = $self->{column};
3395     $self->{column}++;
3396     $self->{nc}
3397     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3398     } else {
3399     $self->{set_nc}->($self);
3400     }
3401    
3402    
3403     $self->{ct}->{quirks} = 1;
3404     return ($self->{ct}); # DOCTYPE
3405    
3406     redo A;
3407     } elsif ($self->{nc} == -1) {
3408    
3409     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');
3410    
3411     $self->{state} = DATA_STATE;
3412 wakaba 1.5 $self->{s_kwd} = '';
3413 wakaba 1.1 ## reconsume
3414    
3415     $self->{ct}->{quirks} = 1;
3416     return ($self->{ct}); # DOCTYPE
3417    
3418     redo A;
3419     } else {
3420    
3421     $self->{ct}->{sysid} # DOCTYPE
3422     .= chr $self->{nc};
3423     $self->{read_until}->($self->{ct}->{sysid}, q[">],
3424     length $self->{ct}->{sysid});
3425    
3426     ## Stay in the state
3427    
3428     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3429     $self->{line_prev} = $self->{line};
3430     $self->{column_prev} = $self->{column};
3431     $self->{column}++;
3432     $self->{nc}
3433     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3434     } else {
3435     $self->{set_nc}->($self);
3436     }
3437    
3438     redo A;
3439     }
3440     } elsif ($self->{state} == DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE) {
3441     if ($self->{nc} == 0x0027) { # '
3442    
3443     $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
3444    
3445     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3446     $self->{line_prev} = $self->{line};
3447     $self->{column_prev} = $self->{column};
3448     $self->{column}++;
3449     $self->{nc}
3450     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3451     } else {
3452     $self->{set_nc}->($self);
3453     }
3454    
3455     redo A;
3456     } elsif ($self->{nc} == 0x003E) { # >
3457    
3458     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');
3459    
3460     $self->{state} = DATA_STATE;
3461 wakaba 1.5 $self->{s_kwd} = '';
3462 wakaba 1.1
3463     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3464     $self->{line_prev} = $self->{line};
3465     $self->{column_prev} = $self->{column};
3466     $self->{column}++;
3467     $self->{nc}
3468     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3469     } else {
3470     $self->{set_nc}->($self);
3471     }
3472    
3473    
3474     $self->{ct}->{quirks} = 1;
3475     return ($self->{ct}); # DOCTYPE
3476    
3477     redo A;
3478     } elsif ($self->{nc} == -1) {
3479    
3480     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');
3481    
3482     $self->{state} = DATA_STATE;
3483 wakaba 1.5 $self->{s_kwd} = '';
3484 wakaba 1.1 ## reconsume
3485    
3486     $self->{ct}->{quirks} = 1;
3487     return ($self->{ct}); # DOCTYPE
3488    
3489     redo A;
3490     } else {
3491    
3492     $self->{ct}->{sysid} # DOCTYPE
3493     .= chr $self->{nc};
3494     $self->{read_until}->($self->{ct}->{sysid}, q['>],
3495     length $self->{ct}->{sysid});
3496    
3497     ## Stay in the state
3498    
3499     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3500     $self->{line_prev} = $self->{line};
3501     $self->{column_prev} = $self->{column};
3502     $self->{column}++;
3503     $self->{nc}
3504     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3505     } else {
3506     $self->{set_nc}->($self);
3507     }
3508    
3509     redo A;
3510     }
3511     } elsif ($self->{state} == AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
3512     if ($is_space->{$self->{nc}}) {
3513    
3514     ## Stay in the state
3515    
3516     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3517     $self->{line_prev} = $self->{line};
3518     $self->{column_prev} = $self->{column};
3519     $self->{column}++;
3520     $self->{nc}
3521     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3522     } else {
3523     $self->{set_nc}->($self);
3524     }
3525    
3526     redo A;
3527     } elsif ($self->{nc} == 0x003E) { # >
3528    
3529     $self->{state} = DATA_STATE;
3530 wakaba 1.5 $self->{s_kwd} = '';
3531 wakaba 1.1
3532     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3533     $self->{line_prev} = $self->{line};
3534     $self->{column_prev} = $self->{column};
3535     $self->{column}++;
3536     $self->{nc}
3537     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3538     } else {
3539     $self->{set_nc}->($self);
3540     }
3541    
3542    
3543     return ($self->{ct}); # DOCTYPE
3544    
3545     redo A;
3546     } elsif ($self->{nc} == -1) {
3547    
3548     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
3549     $self->{state} = DATA_STATE;
3550 wakaba 1.5 $self->{s_kwd} = '';
3551 wakaba 1.1 ## reconsume
3552    
3553     $self->{ct}->{quirks} = 1;
3554     return ($self->{ct}); # DOCTYPE
3555    
3556     redo A;
3557     } else {
3558    
3559     $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after SYSTEM literal');
3560     #$self->{ct}->{quirks} = 1;
3561    
3562     $self->{state} = BOGUS_DOCTYPE_STATE;
3563    
3564     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3565     $self->{line_prev} = $self->{line};
3566     $self->{column_prev} = $self->{column};
3567     $self->{column}++;
3568     $self->{nc}
3569     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3570     } else {
3571     $self->{set_nc}->($self);
3572     }
3573    
3574     redo A;
3575     }
3576     } elsif ($self->{state} == BOGUS_DOCTYPE_STATE) {
3577     if ($self->{nc} == 0x003E) { # >
3578    
3579     $self->{state} = DATA_STATE;
3580 wakaba 1.5 $self->{s_kwd} = '';
3581 wakaba 1.1
3582     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3583     $self->{line_prev} = $self->{line};
3584     $self->{column_prev} = $self->{column};
3585     $self->{column}++;
3586     $self->{nc}
3587     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3588     } else {
3589     $self->{set_nc}->($self);
3590     }
3591    
3592    
3593     return ($self->{ct}); # DOCTYPE
3594    
3595     redo A;
3596     } elsif ($self->{nc} == -1) {
3597    
3598     $self->{state} = DATA_STATE;
3599 wakaba 1.5 $self->{s_kwd} = '';
3600 wakaba 1.1 ## reconsume
3601    
3602     return ($self->{ct}); # DOCTYPE
3603    
3604     redo A;
3605     } else {
3606    
3607     my $s = '';
3608     $self->{read_until}->($s, q[>], 0);
3609    
3610     ## Stay in the state
3611    
3612     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3613     $self->{line_prev} = $self->{line};
3614     $self->{column_prev} = $self->{column};
3615     $self->{column}++;
3616     $self->{nc}
3617     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3618     } else {
3619     $self->{set_nc}->($self);
3620     }
3621    
3622     redo A;
3623     }
3624     } elsif ($self->{state} == CDATA_SECTION_STATE) {
3625     ## NOTE: "CDATA section state" in the state is jointly implemented
3626     ## by three states, |CDATA_SECTION_STATE|, |CDATA_SECTION_MSE1_STATE|,
3627     ## and |CDATA_SECTION_MSE2_STATE|.
3628    
3629     if ($self->{nc} == 0x005D) { # ]
3630    
3631     $self->{state} = CDATA_SECTION_MSE1_STATE;
3632    
3633     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3634     $self->{line_prev} = $self->{line};
3635     $self->{column_prev} = $self->{column};
3636     $self->{column}++;
3637     $self->{nc}
3638     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3639     } else {
3640     $self->{set_nc}->($self);
3641     }
3642    
3643     redo A;
3644     } elsif ($self->{nc} == -1) {
3645 wakaba 1.6 if ($self->{is_xml}) {
3646     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no mse'); ## TODO: type
3647     }
3648    
3649 wakaba 1.1 $self->{state} = DATA_STATE;
3650 wakaba 1.5 $self->{s_kwd} = '';
3651 wakaba 1.1
3652     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3653     $self->{line_prev} = $self->{line};
3654     $self->{column_prev} = $self->{column};
3655     $self->{column}++;
3656     $self->{nc}
3657     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3658     } else {
3659     $self->{set_nc}->($self);
3660     }
3661    
3662     if (length $self->{ct}->{data}) { # character
3663    
3664     return ($self->{ct}); # character
3665     } else {
3666    
3667     ## No token to emit. $self->{ct} is discarded.
3668     }
3669     redo A;
3670     } else {
3671    
3672     $self->{ct}->{data} .= chr $self->{nc};
3673     $self->{read_until}->($self->{ct}->{data},
3674     q<]>,
3675     length $self->{ct}->{data});
3676    
3677     ## Stay in the state.
3678    
3679     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3680     $self->{line_prev} = $self->{line};
3681     $self->{column_prev} = $self->{column};
3682     $self->{column}++;
3683     $self->{nc}
3684     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3685     } else {
3686     $self->{set_nc}->($self);
3687     }
3688    
3689     redo A;
3690     }
3691    
3692     ## ISSUE: "text tokens" in spec.
3693     } elsif ($self->{state} == CDATA_SECTION_MSE1_STATE) {
3694     if ($self->{nc} == 0x005D) { # ]
3695    
3696     $self->{state} = CDATA_SECTION_MSE2_STATE;
3697    
3698     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3699     $self->{line_prev} = $self->{line};
3700     $self->{column_prev} = $self->{column};
3701     $self->{column}++;
3702     $self->{nc}
3703     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3704     } else {
3705     $self->{set_nc}->($self);
3706     }
3707    
3708     redo A;
3709     } else {
3710    
3711     $self->{ct}->{data} .= ']';
3712     $self->{state} = CDATA_SECTION_STATE;
3713     ## Reconsume.
3714     redo A;
3715     }
3716     } elsif ($self->{state} == CDATA_SECTION_MSE2_STATE) {
3717     if ($self->{nc} == 0x003E) { # >
3718     $self->{state} = DATA_STATE;
3719 wakaba 1.5 $self->{s_kwd} = '';
3720 wakaba 1.1
3721     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3722     $self->{line_prev} = $self->{line};
3723     $self->{column_prev} = $self->{column};
3724     $self->{column}++;
3725     $self->{nc}
3726     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3727     } else {
3728     $self->{set_nc}->($self);
3729     }
3730    
3731     if (length $self->{ct}->{data}) { # character
3732    
3733     return ($self->{ct}); # character
3734     } else {
3735    
3736     ## No token to emit. $self->{ct} is discarded.
3737     }
3738     redo A;
3739     } elsif ($self->{nc} == 0x005D) { # ]
3740     # character
3741     $self->{ct}->{data} .= ']'; ## Add first "]" of "]]]".
3742     ## Stay in the state.
3743    
3744     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3745     $self->{line_prev} = $self->{line};
3746     $self->{column_prev} = $self->{column};
3747     $self->{column}++;
3748     $self->{nc}
3749     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3750     } else {
3751     $self->{set_nc}->($self);
3752     }
3753    
3754     redo A;
3755     } else {
3756    
3757     $self->{ct}->{data} .= ']]'; # character
3758     $self->{state} = CDATA_SECTION_STATE;
3759     ## Reconsume.
3760     redo A;
3761     }
3762     } elsif ($self->{state} == ENTITY_STATE) {
3763     if ($is_space->{$self->{nc}} or
3764     {
3765     0x003C => 1, 0x0026 => 1, -1 => 1, # <, &
3766     $self->{entity_add} => 1,
3767     }->{$self->{nc}}) {
3768    
3769     ## Don't consume
3770     ## No error
3771     ## Return nothing.
3772     #
3773     } elsif ($self->{nc} == 0x0023) { # #
3774    
3775     $self->{state} = ENTITY_HASH_STATE;
3776     $self->{s_kwd} = '#';
3777    
3778     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3779     $self->{line_prev} = $self->{line};
3780     $self->{column_prev} = $self->{column};
3781     $self->{column}++;
3782     $self->{nc}
3783     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3784     } else {
3785     $self->{set_nc}->($self);
3786     }
3787    
3788     redo A;
3789     } elsif ((0x0041 <= $self->{nc} and
3790     $self->{nc} <= 0x005A) or # A..Z
3791     (0x0061 <= $self->{nc} and
3792     $self->{nc} <= 0x007A)) { # a..z
3793    
3794     require Whatpm::_NamedEntityList;
3795     $self->{state} = ENTITY_NAME_STATE;
3796     $self->{s_kwd} = chr $self->{nc};
3797     $self->{entity__value} = $self->{s_kwd};
3798     $self->{entity__match} = 0;
3799    
3800     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3801     $self->{line_prev} = $self->{line};
3802     $self->{column_prev} = $self->{column};
3803     $self->{column}++;
3804     $self->{nc}
3805     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3806     } else {
3807     $self->{set_nc}->($self);
3808     }
3809    
3810     redo A;
3811     } else {
3812    
3813     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare ero');
3814     ## Return nothing.
3815     #
3816     }
3817    
3818     ## NOTE: No character is consumed by the "consume a character
3819     ## reference" algorithm. In other word, there is an "&" character
3820     ## that does not introduce a character reference, which would be
3821     ## appended to the parent element or the attribute value in later
3822     ## process of the tokenizer.
3823    
3824     if ($self->{prev_state} == DATA_STATE) {
3825    
3826     $self->{state} = $self->{prev_state};
3827 wakaba 1.5 $self->{s_kwd} = '';
3828 wakaba 1.1 ## Reconsume.
3829     return ({type => CHARACTER_TOKEN, data => '&',
3830     line => $self->{line_prev},
3831     column => $self->{column_prev},
3832     });
3833     redo A;
3834     } else {
3835    
3836     $self->{ca}->{value} .= '&';
3837     $self->{state} = $self->{prev_state};
3838 wakaba 1.5 $self->{s_kwd} = '';
3839 wakaba 1.1 ## Reconsume.
3840     redo A;
3841     }
3842     } elsif ($self->{state} == ENTITY_HASH_STATE) {
3843     if ($self->{nc} == 0x0078 or # x
3844     $self->{nc} == 0x0058) { # X
3845    
3846     $self->{state} = HEXREF_X_STATE;
3847     $self->{s_kwd} .= chr $self->{nc};
3848    
3849     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3850     $self->{line_prev} = $self->{line};
3851     $self->{column_prev} = $self->{column};
3852     $self->{column}++;
3853     $self->{nc}
3854     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3855     } else {
3856     $self->{set_nc}->($self);
3857     }
3858    
3859     redo A;
3860     } elsif (0x0030 <= $self->{nc} and
3861     $self->{nc} <= 0x0039) { # 0..9
3862    
3863     $self->{state} = NCR_NUM_STATE;
3864     $self->{s_kwd} = $self->{nc} - 0x0030;
3865    
3866     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3867     $self->{line_prev} = $self->{line};
3868     $self->{column_prev} = $self->{column};
3869     $self->{column}++;
3870     $self->{nc}
3871     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3872     } else {
3873     $self->{set_nc}->($self);
3874     }
3875    
3876     redo A;
3877     } else {
3878     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare nero',
3879     line => $self->{line_prev},
3880     column => $self->{column_prev} - 1);
3881    
3882     ## NOTE: According to the spec algorithm, nothing is returned,
3883     ## and then "&#" is appended to the parent element or the attribute
3884     ## value in the later processing.
3885    
3886     if ($self->{prev_state} == DATA_STATE) {
3887    
3888     $self->{state} = $self->{prev_state};
3889 wakaba 1.5 $self->{s_kwd} = '';
3890 wakaba 1.1 ## Reconsume.
3891     return ({type => CHARACTER_TOKEN,
3892     data => '&#',
3893     line => $self->{line_prev},
3894     column => $self->{column_prev} - 1,
3895     });
3896     redo A;
3897     } else {
3898    
3899     $self->{ca}->{value} .= '&#';
3900     $self->{state} = $self->{prev_state};
3901 wakaba 1.5 $self->{s_kwd} = '';
3902 wakaba 1.1 ## Reconsume.
3903     redo A;
3904     }
3905     }
3906     } elsif ($self->{state} == NCR_NUM_STATE) {
3907     if (0x0030 <= $self->{nc} and
3908     $self->{nc} <= 0x0039) { # 0..9
3909    
3910     $self->{s_kwd} *= 10;
3911     $self->{s_kwd} += $self->{nc} - 0x0030;
3912    
3913     ## Stay in the state.
3914    
3915     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3916     $self->{line_prev} = $self->{line};
3917     $self->{column_prev} = $self->{column};
3918     $self->{column}++;
3919     $self->{nc}
3920     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3921     } else {
3922     $self->{set_nc}->($self);
3923     }
3924    
3925     redo A;
3926     } elsif ($self->{nc} == 0x003B) { # ;
3927    
3928    
3929     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3930     $self->{line_prev} = $self->{line};
3931     $self->{column_prev} = $self->{column};
3932     $self->{column}++;
3933     $self->{nc}
3934     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3935     } else {
3936     $self->{set_nc}->($self);
3937     }
3938    
3939     #
3940     } else {
3941    
3942     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no refc');
3943     ## Reconsume.
3944     #
3945     }
3946    
3947     my $code = $self->{s_kwd};
3948     my $l = $self->{line_prev};
3949     my $c = $self->{column_prev};
3950     if ($charref_map->{$code}) {
3951    
3952     $self->{parse_error}->(level => $self->{level}->{must}, type => 'invalid character reference',
3953     text => (sprintf 'U+%04X', $code),
3954     line => $l, column => $c);
3955     $code = $charref_map->{$code};
3956     } elsif ($code > 0x10FFFF) {
3957    
3958     $self->{parse_error}->(level => $self->{level}->{must}, type => 'invalid character reference',
3959     text => (sprintf 'U-%08X', $code),
3960     line => $l, column => $c);
3961     $code = 0xFFFD;
3962     }
3963    
3964     if ($self->{prev_state} == DATA_STATE) {
3965    
3966     $self->{state} = $self->{prev_state};
3967 wakaba 1.5 $self->{s_kwd} = '';
3968 wakaba 1.1 ## Reconsume.
3969     return ({type => CHARACTER_TOKEN, data => chr $code,
3970 wakaba 1.7 has_reference => 1,
3971 wakaba 1.1 line => $l, column => $c,
3972     });
3973     redo A;
3974     } else {
3975    
3976     $self->{ca}->{value} .= chr $code;
3977     $self->{ca}->{has_reference} = 1;
3978     $self->{state} = $self->{prev_state};
3979 wakaba 1.5 $self->{s_kwd} = '';
3980 wakaba 1.1 ## Reconsume.
3981     redo A;
3982     }
3983     } elsif ($self->{state} == HEXREF_X_STATE) {
3984     if ((0x0030 <= $self->{nc} and $self->{nc} <= 0x0039) or
3985     (0x0041 <= $self->{nc} and $self->{nc} <= 0x0046) or
3986     (0x0061 <= $self->{nc} and $self->{nc} <= 0x0066)) {
3987     # 0..9, A..F, a..f
3988    
3989     $self->{state} = HEXREF_HEX_STATE;
3990     $self->{s_kwd} = 0;
3991     ## Reconsume.
3992     redo A;
3993     } else {
3994     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare hcro',
3995     line => $self->{line_prev},
3996     column => $self->{column_prev} - 2);
3997    
3998     ## NOTE: According to the spec algorithm, nothing is returned,
3999     ## and then "&#" followed by "X" or "x" is appended to the parent
4000     ## element or the attribute value in the later processing.
4001    
4002     if ($self->{prev_state} == DATA_STATE) {
4003    
4004     $self->{state} = $self->{prev_state};
4005 wakaba 1.5 $self->{s_kwd} = '';
4006 wakaba 1.1 ## Reconsume.
4007     return ({type => CHARACTER_TOKEN,
4008     data => '&' . $self->{s_kwd},
4009     line => $self->{line_prev},
4010     column => $self->{column_prev} - length $self->{s_kwd},
4011     });
4012     redo A;
4013     } else {
4014    
4015     $self->{ca}->{value} .= '&' . $self->{s_kwd};
4016     $self->{state} = $self->{prev_state};
4017 wakaba 1.5 $self->{s_kwd} = '';
4018 wakaba 1.1 ## Reconsume.
4019     redo A;
4020     }
4021     }
4022     } elsif ($self->{state} == HEXREF_HEX_STATE) {
4023     if (0x0030 <= $self->{nc} and $self->{nc} <= 0x0039) {
4024     # 0..9
4025    
4026     $self->{s_kwd} *= 0x10;
4027     $self->{s_kwd} += $self->{nc} - 0x0030;
4028     ## Stay in the state.
4029    
4030     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4031     $self->{line_prev} = $self->{line};
4032     $self->{column_prev} = $self->{column};
4033     $self->{column}++;
4034     $self->{nc}
4035     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4036     } else {
4037     $self->{set_nc}->($self);
4038     }
4039    
4040     redo A;
4041     } elsif (0x0061 <= $self->{nc} and
4042     $self->{nc} <= 0x0066) { # a..f
4043    
4044     $self->{s_kwd} *= 0x10;
4045     $self->{s_kwd} += $self->{nc} - 0x0060 + 9;
4046     ## Stay in the state.
4047    
4048     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4049     $self->{line_prev} = $self->{line};
4050     $self->{column_prev} = $self->{column};
4051     $self->{column}++;
4052     $self->{nc}
4053     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4054     } else {
4055     $self->{set_nc}->($self);
4056     }
4057    
4058     redo A;
4059     } elsif (0x0041 <= $self->{nc} and
4060     $self->{nc} <= 0x0046) { # A..F
4061    
4062     $self->{s_kwd} *= 0x10;
4063     $self->{s_kwd} += $self->{nc} - 0x0040 + 9;
4064     ## Stay in the state.
4065    
4066     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4067     $self->{line_prev} = $self->{line};
4068     $self->{column_prev} = $self->{column};
4069     $self->{column}++;
4070     $self->{nc}
4071     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4072     } else {
4073     $self->{set_nc}->($self);
4074     }
4075    
4076     redo A;
4077     } elsif ($self->{nc} == 0x003B) { # ;
4078    
4079    
4080     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4081     $self->{line_prev} = $self->{line};
4082     $self->{column_prev} = $self->{column};
4083     $self->{column}++;
4084     $self->{nc}
4085     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4086     } else {
4087     $self->{set_nc}->($self);
4088     }
4089    
4090     #
4091     } else {
4092    
4093     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no refc',
4094     line => $self->{line},
4095     column => $self->{column});
4096     ## Reconsume.
4097     #
4098     }
4099    
4100     my $code = $self->{s_kwd};
4101     my $l = $self->{line_prev};
4102     my $c = $self->{column_prev};
4103     if ($charref_map->{$code}) {
4104    
4105     $self->{parse_error}->(level => $self->{level}->{must}, type => 'invalid character reference',
4106     text => (sprintf 'U+%04X', $code),
4107     line => $l, column => $c);
4108     $code = $charref_map->{$code};
4109     } elsif ($code > 0x10FFFF) {
4110    
4111     $self->{parse_error}->(level => $self->{level}->{must}, type => 'invalid character reference',
4112     text => (sprintf 'U-%08X', $code),
4113     line => $l, column => $c);
4114     $code = 0xFFFD;
4115     }
4116    
4117     if ($self->{prev_state} == DATA_STATE) {
4118    
4119     $self->{state} = $self->{prev_state};
4120 wakaba 1.5 $self->{s_kwd} = '';
4121 wakaba 1.1 ## Reconsume.
4122     return ({type => CHARACTER_TOKEN, data => chr $code,
4123 wakaba 1.7 has_reference => 1,
4124 wakaba 1.1 line => $l, column => $c,
4125     });
4126     redo A;
4127     } else {
4128    
4129     $self->{ca}->{value} .= chr $code;
4130     $self->{ca}->{has_reference} = 1;
4131     $self->{state} = $self->{prev_state};
4132 wakaba 1.5 $self->{s_kwd} = '';
4133 wakaba 1.1 ## Reconsume.
4134     redo A;
4135     }
4136     } elsif ($self->{state} == ENTITY_NAME_STATE) {
4137     if (length $self->{s_kwd} < 30 and
4138     ## NOTE: Some number greater than the maximum length of entity name
4139     ((0x0041 <= $self->{nc} and # a
4140     $self->{nc} <= 0x005A) or # x
4141     (0x0061 <= $self->{nc} and # a
4142     $self->{nc} <= 0x007A) or # z
4143     (0x0030 <= $self->{nc} and # 0
4144     $self->{nc} <= 0x0039) or # 9
4145     $self->{nc} == 0x003B)) { # ;
4146     our $EntityChar;
4147     $self->{s_kwd} .= chr $self->{nc};
4148     if (defined $EntityChar->{$self->{s_kwd}}) {
4149     if ($self->{nc} == 0x003B) { # ;
4150    
4151     $self->{entity__value} = $EntityChar->{$self->{s_kwd}};
4152     $self->{entity__match} = 1;
4153    
4154     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4155     $self->{line_prev} = $self->{line};
4156     $self->{column_prev} = $self->{column};
4157     $self->{column}++;
4158     $self->{nc}
4159     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4160     } else {
4161     $self->{set_nc}->($self);
4162     }
4163    
4164     #
4165     } else {
4166    
4167     $self->{entity__value} = $EntityChar->{$self->{s_kwd}};
4168     $self->{entity__match} = -1;
4169     ## Stay in the state.
4170    
4171     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4172     $self->{line_prev} = $self->{line};
4173     $self->{column_prev} = $self->{column};
4174     $self->{column}++;
4175     $self->{nc}
4176     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4177     } else {
4178     $self->{set_nc}->($self);
4179     }
4180    
4181     redo A;
4182     }
4183     } else {
4184    
4185     $self->{entity__value} .= chr $self->{nc};
4186     $self->{entity__match} *= 2;
4187     ## Stay in the state.
4188    
4189     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4190     $self->{line_prev} = $self->{line};
4191     $self->{column_prev} = $self->{column};
4192     $self->{column}++;
4193     $self->{nc}
4194     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4195     } else {
4196     $self->{set_nc}->($self);
4197     }
4198    
4199     redo A;
4200     }
4201     }
4202    
4203     my $data;
4204     my $has_ref;
4205     if ($self->{entity__match} > 0) {
4206    
4207     $data = $self->{entity__value};
4208     $has_ref = 1;
4209     #
4210     } elsif ($self->{entity__match} < 0) {
4211     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no refc');
4212     if ($self->{prev_state} != DATA_STATE and # in attribute
4213     $self->{entity__match} < -1) {
4214    
4215     $data = '&' . $self->{s_kwd};
4216     #
4217     } else {
4218    
4219     $data = $self->{entity__value};
4220     $has_ref = 1;
4221     #
4222     }
4223     } else {
4224    
4225     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare ero',
4226     line => $self->{line_prev},
4227     column => $self->{column_prev} - length $self->{s_kwd});
4228     $data = '&' . $self->{s_kwd};
4229     #
4230     }
4231    
4232     ## NOTE: In these cases, when a character reference is found,
4233     ## it is consumed and a character token is returned, or, otherwise,
4234     ## nothing is consumed and returned, according to the spec algorithm.
4235     ## In this implementation, anything that has been examined by the
4236     ## tokenizer is appended to the parent element or the attribute value
4237     ## as string, either literal string when no character reference or
4238     ## entity-replaced string otherwise, in this stage, since any characters
4239     ## that would not be consumed are appended in the data state or in an
4240     ## appropriate attribute value state anyway.
4241    
4242     if ($self->{prev_state} == DATA_STATE) {
4243    
4244     $self->{state} = $self->{prev_state};
4245 wakaba 1.5 $self->{s_kwd} = '';
4246 wakaba 1.1 ## Reconsume.
4247     return ({type => CHARACTER_TOKEN,
4248     data => $data,
4249 wakaba 1.7 has_reference => $has_ref,
4250 wakaba 1.1 line => $self->{line_prev},
4251     column => $self->{column_prev} + 1 - length $self->{s_kwd},
4252     });
4253     redo A;
4254     } else {
4255    
4256     $self->{ca}->{value} .= $data;
4257     $self->{ca}->{has_reference} = 1 if $has_ref;
4258     $self->{state} = $self->{prev_state};
4259 wakaba 1.5 $self->{s_kwd} = '';
4260 wakaba 1.1 ## Reconsume.
4261     redo A;
4262     }
4263     } else {
4264     die "$0: $self->{state}: Unknown state";
4265     }
4266     } # A
4267    
4268     die "$0: _get_next_token: unexpected case";
4269     } # _get_next_token
4270    
4271     1;
4272 wakaba 1.7 ## $Date: 2008/10/14 14:57:52 $

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24