/[suikacvs]/markup/html/whatpm/Whatpm/HTML/Tokenizer.pm
Suika

Contents of /markup/html/whatpm/Whatpm/HTML/Tokenizer.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.18 - (hide annotations) (download)
Sun Oct 19 06:14:57 2008 UTC (16 years, 8 months ago) by wakaba
Branch: MAIN
Changes since 1.17: +406 -6 lines
++ whatpm/t/ChangeLog	19 Oct 2008 06:14:42 -0000
2008-10-19  Wakaba  <wakaba@suika.fam.cx>

	* XML-Parser.t: "xml/entities-1.dat" and "xml/entities-2.dat"
	added.  Support for the "#entities" directive.

++ whatpm/t/xml/ChangeLog	19 Oct 2008 06:11:59 -0000
	* entities-1.dat, entities-2.dat: New test data files.

2008-10-19  Wakaba  <wakaba@suika.fam.cx>

++ whatpm/Whatpm/ChangeLog	19 Oct 2008 06:12:27 -0000
2008-10-19  Wakaba  <wakaba@suika.fam.cx>

	* NanoDOM.pm (notation_name): New attribute.

	* NanoDOM.pm (public_id, system_id): New attributes.
++ whatpm/Whatpm/HTML/ChangeLog	19 Oct 2008 06:13:03 -0000
	* Dumper.pm: Dump text content of Entity nodes.

	* Tokenizer.pm.src: Support for <!ENTITY ... NDATA>.

2008-10-19  Wakaba  <wakaba@suika.fam.cx>

++ whatpm/Whatpm/XML/ChangeLog	19 Oct 2008 06:14:05 -0000
2008-10-19  Wakaba  <wakaba@suika.fam.cx>

	* Parser.pm.src (_tree_in_subset): General and parameter entities
	implemented.

1 wakaba 1.1 package Whatpm::HTML::Tokenizer;
2     use strict;
3 wakaba 1.18 our $VERSION=do{my @r=(q$Revision: 1.17 $=~/\d+/g);sprintf "%d."."%02d" x $#r,@r};
4 wakaba 1.2
5     BEGIN { # export setup, done at compile time
6     require Exporter;
7     push our @ISA, 'Exporter';
8     ## Every token type constant may be imported individually by name ...
9     our @EXPORT_OK = qw(
10     DOCTYPE_TOKEN
11     COMMENT_TOKEN
12     START_TAG_TOKEN
13     END_TAG_TOKEN
14     END_OF_FILE_TOKEN
15     CHARACTER_TOKEN
16     PI_TOKEN
17     ABORT_TOKEN
18 wakaba 1.13 END_OF_DOCTYPE_TOKEN
19 wakaba 1.14 ATTLIST_TOKEN
20     ELEMENT_TOKEN
21     GENERAL_ENTITY_TOKEN
22     PARAMETER_ENTITY_TOKEN
23     NOTATION_TOKEN
24 wakaba 1.2 );
25     ## ... or all at once via the |:token| tag.  Both lists hold the same fourteen names.
26     our %EXPORT_TAGS = (
27     token => [qw(
28     DOCTYPE_TOKEN
29     COMMENT_TOKEN
30     START_TAG_TOKEN
31     END_TAG_TOKEN
32     END_OF_FILE_TOKEN
33     CHARACTER_TOKEN
34     PI_TOKEN
35     ABORT_TOKEN
36 wakaba 1.13 END_OF_DOCTYPE_TOKEN
37 wakaba 1.14 ATTLIST_TOKEN
38     ELEMENT_TOKEN
39     GENERAL_ENTITY_TOKEN
40     PARAMETER_ENTITY_TOKEN
41     NOTATION_TOKEN
42 wakaba 1.2 )],
43     );
44     }
45    
46 wakaba 1.12 ## NOTE: Differences from the XML5 draft are marked as "XML5:".
47    
48 wakaba 1.2 ## Token types: integer codes stored in the |type| field of a token.
49     ## Each is a constant sub with an empty prototype that returns a literal.
50 wakaba 1.12 sub DOCTYPE_TOKEN () { 1 } ## XML5: No DOCTYPE token.
51 wakaba 1.2 sub COMMENT_TOKEN () { 2 }
52     sub START_TAG_TOKEN () { 3 }
53     sub END_TAG_TOKEN () { 4 }
54     sub END_OF_FILE_TOKEN () { 5 }
55     sub CHARACTER_TOKEN () { 6 }
56 wakaba 1.12 sub PI_TOKEN () { 7 } ## NOTE: XML only.
57     sub ABORT_TOKEN () { 8 } ## NOTE: For internal processing.
58 wakaba 1.14 sub END_OF_DOCTYPE_TOKEN () { 9 } ## NOTE: XML only.
59     sub ATTLIST_TOKEN () { 10 } ## NOTE: XML only.
60     sub ELEMENT_TOKEN () { 11 } ## NOTE: XML only.
61     sub GENERAL_ENTITY_TOKEN () { 12 } ## NOTE: XML only.
62     sub PARAMETER_ENTITY_TOKEN () { 13 } ## NOTE: XML only.
63     sub NOTATION_TOKEN () { 14 } ## NOTE: XML only.
64 wakaba 1.12
65     ## XML5: XML5 has "empty tag token". In this implementation, it is
66     ## represented as a start tag token with $self->{self_closing} flag
67     ## set to true.
68    
69     ## XML5: XML5 has "short end tag token". In this implementation, it
70     ## is represented as an end tag token whose $token->{tag_name} is set
71     ## to an empty string.
72 wakaba 1.1
73     package Whatpm::HTML;
74     ## Pull the token type constants (the |:token| export tag) into Whatpm::HTML at compile time.
75 wakaba 1.2 BEGIN { Whatpm::HTML::Tokenizer->import (':token') }
76    
77 wakaba 1.1 ## Content model flags
78     ## CM_* are single-bit flags; each *_CONTENT_MODEL value is a bitwise OR of them and is tested with |&|.
79     sub CM_ENTITY () { 0b001 } # & markup in data
80     sub CM_LIMITED_MARKUP () { 0b010 } # < markup in data (limited)
81     sub CM_FULL_MARKUP () { 0b100 } # < markup in data (any)
82    
83     sub PLAINTEXT_CONTENT_MODEL () { 0 } # no flag set
84     sub CDATA_CONTENT_MODEL () { CM_LIMITED_MARKUP } # limited < markup only
85     sub RCDATA_CONTENT_MODEL () { CM_ENTITY | CM_LIMITED_MARKUP } # & markup and limited < markup
86     sub PCDATA_CONTENT_MODEL () { CM_ENTITY | CM_FULL_MARKUP } # & markup and any < markup
87    
88     ## Tokenizer states: integer values held in |$self->{state}|.
89     ## Values 1 and 12 are unused; the constants that had them are commented out.
90     sub DATA_STATE () { 0 }
91     #sub ENTITY_DATA_STATE () { 1 }
92     sub TAG_OPEN_STATE () { 2 }
93     sub CLOSE_TAG_OPEN_STATE () { 3 }
94     sub TAG_NAME_STATE () { 4 }
95     sub BEFORE_ATTRIBUTE_NAME_STATE () { 5 }
96     sub ATTRIBUTE_NAME_STATE () { 6 }
97     sub AFTER_ATTRIBUTE_NAME_STATE () { 7 }
98     sub BEFORE_ATTRIBUTE_VALUE_STATE () { 8 }
99     sub ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE () { 9 }
100     sub ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE () { 10 }
101     sub ATTRIBUTE_VALUE_UNQUOTED_STATE () { 11 }
102     #sub ENTITY_IN_ATTRIBUTE_VALUE_STATE () { 12 }
103     sub MARKUP_DECLARATION_OPEN_STATE () { 13 }
104     sub COMMENT_START_STATE () { 14 }
105     sub COMMENT_START_DASH_STATE () { 15 }
106     sub COMMENT_STATE () { 16 }
107     sub COMMENT_END_STATE () { 17 }
108     sub COMMENT_END_DASH_STATE () { 18 }
109     sub BOGUS_COMMENT_STATE () { 19 }
110     sub DOCTYPE_STATE () { 20 }
111     sub BEFORE_DOCTYPE_NAME_STATE () { 21 }
112     sub DOCTYPE_NAME_STATE () { 22 }
113     sub AFTER_DOCTYPE_NAME_STATE () { 23 }
114     sub BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE () { 24 }
115     sub DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE () { 25 }
116     sub DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE () { 26 }
117     sub AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE () { 27 }
118     sub BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE () { 28 }
119     sub DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE () { 29 }
120     sub DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE () { 30 }
121     sub AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE () { 31 }
122     sub BOGUS_DOCTYPE_STATE () { 32 }
123     sub AFTER_ATTRIBUTE_VALUE_QUOTED_STATE () { 33 }
124     sub SELF_CLOSING_START_TAG_STATE () { 34 }
125     sub CDATA_SECTION_STATE () { 35 }
126     sub MD_HYPHEN_STATE () { 36 } # "markup declaration open state" in the spec
127     sub MD_DOCTYPE_STATE () { 37 } # "markup declaration open state" in the spec
128     sub MD_CDATA_STATE () { 38 } # "markup declaration open state" in the spec
129     sub CDATA_RCDATA_CLOSE_TAG_STATE () { 39 } # "close tag open state" in the spec
130     sub CDATA_SECTION_MSE1_STATE () { 40 } # "CDATA section state" in the spec
131     sub CDATA_SECTION_MSE2_STATE () { 41 } # "CDATA section state" in the spec
132     sub PUBLIC_STATE () { 42 } # "after DOCTYPE name state" in the spec
133     sub SYSTEM_STATE () { 43 } # "after DOCTYPE name state" in the spec
134     ## NOTE: "Entity data state", "entity in attribute value state", and
135     ## "consume a character reference" algorithm are jointly implemented
136     ## using the following six states:
137     sub ENTITY_STATE () { 44 }
138     sub ENTITY_HASH_STATE () { 45 }
139     sub NCR_NUM_STATE () { 46 }
140     sub HEXREF_X_STATE () { 47 }
141     sub HEXREF_HEX_STATE () { 48 }
142     sub ENTITY_NAME_STATE () { 49 }
143     sub PCDATA_STATE () { 50 } # "data state" in the spec
144    
145 wakaba 1.12 ## XML-only states (values 51..91, continuing the numbering of the states above)
146 wakaba 1.8 sub PI_STATE () { 51 }
147     sub PI_TARGET_STATE () { 52 }
148     sub PI_TARGET_AFTER_STATE () { 53 }
149     sub PI_DATA_STATE () { 54 }
150     sub PI_AFTER_STATE () { 55 }
151     sub PI_DATA_AFTER_STATE () { 56 }
152 wakaba 1.12 sub DOCTYPE_INTERNAL_SUBSET_STATE () { 57 }
153     sub DOCTYPE_INTERNAL_SUBSET_AFTER_STATE () { 58 }
154 wakaba 1.14 sub BOGUS_DOCTYPE_INTERNAL_SUBSET_AFTER_STATE () { 59 }
155     sub DOCTYPE_TAG_STATE () { 60 }
156     sub DOCTYPE_MARKUP_DECLARATION_OPEN_STATE () { 61 }
157     sub MD_ATTLIST_STATE () { 62 }
158     sub MD_E_STATE () { 63 }
159     sub MD_ELEMENT_STATE () { 64 }
160     sub MD_ENTITY_STATE () { 65 }
161     sub MD_NOTATION_STATE () { 66 }
162     sub DOCTYPE_MD_STATE () { 67 }
163     sub BEFORE_MD_NAME_STATE () { 68 }
164     sub MD_NAME_STATE () { 69 }
165     sub DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE () { 70 }
166     sub DOCTYPE_ATTLIST_NAME_AFTER_STATE () { 71 }
167 wakaba 1.15 sub DOCTYPE_ATTLIST_ATTRIBUTE_NAME_STATE () { 72 }
168     sub DOCTYPE_ATTLIST_ATTRIBUTE_NAME_AFTER_STATE () { 73 }
169     sub DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_STATE () { 74 }
170     sub DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_AFTER_STATE () { 75 }
171     sub BEFORE_ALLOWED_TOKEN_STATE () { 76 }
172     sub ALLOWED_TOKEN_STATE () { 77 }
173     sub AFTER_ALLOWED_TOKEN_STATE () { 78 }
174     sub AFTER_ALLOWED_TOKENS_STATE () { 79 }
175     sub BEFORE_ATTR_DEFAULT_STATE () { 80 }
176     sub DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE () { 81 }
177     sub DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_STATE () { 82 }
178     sub DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_AFTER_STATE () { 83 }
179     sub AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE () { 84 }
180 wakaba 1.18 sub BEFORE_NDATA_STATE () { 85 }
181     sub NDATA_STATE () { 86 }
182     sub AFTER_NDATA_STATE () { 87 }
183     sub BEFORE_NOTATION_NAME_STATE () { 88 }
184     sub NOTATION_NAME_STATE () { 89 }
185     sub AFTER_NOTATION_NAME_STATE () { 90 }
186     sub BOGUS_MD_STATE () { 91 }
187 wakaba 1.8
188 wakaba 1.1 ## Tree constructor state constants (see Whatpm::HTML for the full
189     ## list and descriptions)
190     ## NOTE: Both literals below are a 1 bit followed by eleven 0 bits (2048); |_| is only a digit separator.
191     sub IN_FOREIGN_CONTENT_IM () { 0b100000000000 }
192     sub FOREIGN_EL () { 0b1_00000000000 }
193    
194     ## Character reference mappings
195     ## Hash of code point (key) => replacement code point (value).
196     my $charref_map = { # NOTE(review): presumably consulted when a numeric character reference is resolved; the use site is not in this part of the file
197     0x0D => 0x000A, # CR => LF
198     0x80 => 0x20AC, # 0x80..0x9F: looks like the Windows-1252 reading of these bytes - TODO confirm
199     0x81 => 0xFFFD,
200     0x82 => 0x201A,
201     0x83 => 0x0192,
202     0x84 => 0x201E,
203     0x85 => 0x2026,
204     0x86 => 0x2020,
205     0x87 => 0x2021,
206     0x88 => 0x02C6,
207     0x89 => 0x2030,
208     0x8A => 0x0160,
209     0x8B => 0x2039,
210     0x8C => 0x0152,
211     0x8D => 0xFFFD,
212     0x8E => 0x017D,
213     0x8F => 0xFFFD,
214     0x90 => 0xFFFD,
215     0x91 => 0x2018,
216     0x92 => 0x2019,
217     0x93 => 0x201C,
218     0x94 => 0x201D,
219     0x95 => 0x2022,
220     0x96 => 0x2013,
221     0x97 => 0x2014,
222     0x98 => 0x02DC,
223     0x99 => 0x2122,
224     0x9A => 0x0161,
225     0x9B => 0x203A,
226     0x9C => 0x0153,
227     0x9D => 0xFFFD,
228     0x9E => 0x017E,
229     0x9F => 0x0178,
230     }; # $charref_map
231     $charref_map->{$_} = 0xFFFD # every code point listed in the |for| below maps to U+FFFD
232     for 0x0000..0x0008, 0x000B, 0x000E..0x001F, 0x007F,
233     0xD800..0xDFFF, 0xFDD0..0xFDDF, ## ISSUE: 0xFDEF
234     0xFFFE, 0xFFFF, 0x1FFFE, 0x1FFFF, 0x2FFFE, 0x2FFFF, 0x3FFFE, 0x3FFFF,
235     0x4FFFE, 0x4FFFF, 0x5FFFE, 0x5FFFF, 0x6FFFE, 0x6FFFF, 0x7FFFE,
236     0x7FFFF, 0x8FFFE, 0x8FFFF, 0x9FFFE, 0x9FFFF, 0xAFFFE, 0xAFFFF,
237     0xBFFFE, 0xBFFFF, 0xCFFFE, 0xCFFFF, 0xDFFFE, 0xDFFFF, 0xEFFFE,
238     0xEFFFF, 0xFFFFE, 0xFFFFF, 0x10FFFE, 0x10FFFF;
239    
240     ## Implementations MUST act as if they used the state machine in the spec.
241    
242     sub _initialize_tokenizer ($) {
243     my $self = shift;
244     ## Resets the tokenizer fields on $self to their start-of-input values and then loads the first input character into |nc|.
245     ## NOTE: Fields set by |new| constructor:
246     #$self->{level}
247     #$self->{set_nc}
248     #$self->{parse_error}
249 wakaba 1.3 #$self->{is_xml} (if XML)
250 wakaba 1.1
251     $self->{state} = DATA_STATE; # MUST
252 wakaba 1.12 $self->{s_kwd} = ''; # Data state keyword
253     #$self->{kwd} = ''; # State-dependent keyword; initialized when used
254 wakaba 1.1 #$self->{entity__value}; # initialized when used
255     #$self->{entity__match}; # initialized when used
256     $self->{content_model} = PCDATA_CONTENT_MODEL; # start in the PCDATA content model
257     undef $self->{ct}; # current token
258     undef $self->{ca}; # current attribute
259     undef $self->{last_stag_name}; # last emitted start tag name
260     #$self->{prev_state}; # initialized when used
261     delete $self->{self_closing};
262     $self->{char_buffer} = '';
263     $self->{char_buffer_pos} = 0;
264     $self->{nc} = -1; # next input character
265     #$self->{next_nc}
266     ## Read the first character.  |char_buffer| was just emptied, so the test below is false here and the |set_nc| callback runs.
267     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
268     $self->{line_prev} = $self->{line};
269     $self->{column_prev} = $self->{column};
270     $self->{column}++;
271     $self->{nc}
272     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
273     } else {
274     $self->{set_nc}->($self);
275     }
276    
277     $self->{token} = []; # queue of pending tokens; |_get_next_token| returns from it first when non-empty
278     # $self->{escape}
279     } # _initialize_tokenizer
280    
281     ## A token has:
282     ## ->{type} == DOCTYPE_TOKEN, START_TAG_TOKEN, END_TAG_TOKEN, COMMENT_TOKEN,
283 wakaba 1.11 ## CHARACTER_TOKEN, END_OF_FILE_TOKEN, PI_TOKEN, or ABORT_TOKEN
284 wakaba 1.1 ## ->{name} (DOCTYPE_TOKEN)
285     ## ->{tag_name} (START_TAG_TOKEN, END_TAG_TOKEN)
286 wakaba 1.11 ## ->{target} (PI_TOKEN)
287 wakaba 1.1 ## ->{pubid} (DOCTYPE_TOKEN)
288     ## ->{sysid} (DOCTYPE_TOKEN)
289     ## ->{quirks} == 1 or 0 (DOCTYPE_TOKEN): "force-quirks" flag
290     ## ->{attributes} isa HASH (START_TAG_TOKEN, END_TAG_TOKEN)
291     ## ->{name}
292     ## ->{value}
293     ## ->{has_reference} == 1 or 0
294 wakaba 1.11 ## ->{index}: Index of the attribute in a tag.
295     ## ->{data} (COMMENT_TOKEN, CHARACTER_TOKEN, PI_TOKEN)
296 wakaba 1.7 ## ->{has_reference} == 1 or 0 (CHARACTER_TOKEN)
297 wakaba 1.11 ## ->{last_index} (ELEMENT_TOKEN): Next attribute's index - 1.
298 wakaba 1.12 ## ->{has_internal_subset} = 1 or 0 (DOCTYPE_TOKEN)
299    
300 wakaba 1.1 ## NOTE: The "self-closing flag" is held as |$self->{self_closing}|.
301     ## The token's own |->{self_closing}| is used to save the value of
302     ## |$self->{self_closing}| while the token is pushed back to the stack.
303    
304     ## Emitted token MUST immediately be handled by the tree construction state.
305    
306     ## Before each step, UA MAY check to see if either one of the scripts in
307     ## "list of scripts that will execute as soon as possible" or the first
308     ## script in the "list of scripts that will execute asynchronously",
309     ## has completed loading. If one has, then it MUST be executed
310     ## and removed from the list.
311    
312     ## TODO: Polytheistic slash SHOULD NOT be used. (Applied only to atheists.)
313     ## (This requirement was dropped from HTML5 spec, unfortunately.)
314    
315     my $is_space = { # code point => 1 for space characters; the VT and CR entries are commented out, so lookups for them are false
316     0x0009 => 1, # CHARACTER TABULATION (HT)
317     0x000A => 1, # LINE FEED (LF)
318     #0x000B => 0, # LINE TABULATION (VT)
319 wakaba 1.12 0x000C => 1, # FORM FEED (FF) ## XML5: Not a space character.
320 wakaba 1.1 #0x000D => 1, # CARRIAGE RETURN (CR)
321     0x0020 => 1, # SPACE (SP)
322     };
323    
324     sub _get_next_token ($) {
325     my $self = shift;
326    
327     if ($self->{self_closing}) {
328     $self->{parse_error}->(level => $self->{level}->{must}, type => 'nestc', token => $self->{ct});
329     ## NOTE: The |self_closing| flag is only set by start tag token.
330     ## In addition, when a start tag token is emitted, it is always set to
331     ## |ct|.
332     delete $self->{self_closing};
333     }
334    
335     if (@{$self->{token}}) {
336     $self->{self_closing} = $self->{token}->[0]->{self_closing};
337     return shift @{$self->{token}};
338     }
339    
340     A: {
341     if ($self->{state} == PCDATA_STATE) {
342     ## NOTE: Same as |DATA_STATE|, but only for |PCDATA| content model.
343    
344     if ($self->{nc} == 0x0026) { # &
345    
346     ## NOTE: In the spec, the tokenizer is switched to the
347     ## "entity data state". In this implementation, the tokenizer
348     ## is switched to the |ENTITY_STATE|, which is an implementation
349     ## of the "consume a character reference" algorithm.
350     $self->{entity_add} = -1;
351     $self->{prev_state} = DATA_STATE;
352     $self->{state} = ENTITY_STATE;
353    
354     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
355     $self->{line_prev} = $self->{line};
356     $self->{column_prev} = $self->{column};
357     $self->{column}++;
358     $self->{nc}
359     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
360     } else {
361     $self->{set_nc}->($self);
362     }
363    
364     redo A;
365     } elsif ($self->{nc} == 0x003C) { # <
366    
367     $self->{state} = TAG_OPEN_STATE;
368    
369     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
370     $self->{line_prev} = $self->{line};
371     $self->{column_prev} = $self->{column};
372     $self->{column}++;
373     $self->{nc}
374     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
375     } else {
376     $self->{set_nc}->($self);
377     }
378    
379     redo A;
380     } elsif ($self->{nc} == -1) {
381    
382     return ({type => END_OF_FILE_TOKEN,
383     line => $self->{line}, column => $self->{column}});
384     last A; ## TODO: ok?
385     } else {
386    
387     #
388     }
389    
390     # Anything else
391     my $token = {type => CHARACTER_TOKEN,
392     data => chr $self->{nc},
393     line => $self->{line}, column => $self->{column},
394     };
395     $self->{read_until}->($token->{data}, q[<&], length $token->{data});
396    
397     ## Stay in the state.
398    
399     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
400     $self->{line_prev} = $self->{line};
401     $self->{column_prev} = $self->{column};
402     $self->{column}++;
403     $self->{nc}
404     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
405     } else {
406     $self->{set_nc}->($self);
407     }
408    
409     return ($token);
410     redo A;
411     } elsif ($self->{state} == DATA_STATE) {
412     $self->{s_kwd} = '' unless defined $self->{s_kwd};
413     if ($self->{nc} == 0x0026) { # &
414     $self->{s_kwd} = '';
415     if ($self->{content_model} & CM_ENTITY and # PCDATA | RCDATA
416     not $self->{escape}) {
417    
418     ## NOTE: In the spec, the tokenizer is switched to the
419     ## "entity data state". In this implementation, the tokenizer
420     ## is switched to the |ENTITY_STATE|, which is an implementation
421     ## of the "consume a character reference" algorithm.
422     $self->{entity_add} = -1;
423     $self->{prev_state} = DATA_STATE;
424     $self->{state} = ENTITY_STATE;
425    
426     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
427     $self->{line_prev} = $self->{line};
428     $self->{column_prev} = $self->{column};
429     $self->{column}++;
430     $self->{nc}
431     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
432     } else {
433     $self->{set_nc}->($self);
434     }
435    
436     redo A;
437     } else {
438    
439     #
440     }
441     } elsif ($self->{nc} == 0x002D) { # -
442     if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
443 wakaba 1.5 if ($self->{s_kwd} eq '<!-') {
444 wakaba 1.1
445     $self->{escape} = 1; # unless $self->{escape};
446     $self->{s_kwd} = '--';
447     #
448 wakaba 1.5 } elsif ($self->{s_kwd} eq '-') {
449 wakaba 1.1
450     $self->{s_kwd} = '--';
451     #
452 wakaba 1.5 } elsif ($self->{s_kwd} eq '<!' or $self->{s_kwd} eq '-') {
453    
454     $self->{s_kwd} .= '-';
455     #
456 wakaba 1.1 } else {
457    
458 wakaba 1.5 $self->{s_kwd} = '-';
459 wakaba 1.1 #
460     }
461     }
462    
463     #
464     } elsif ($self->{nc} == 0x0021) { # !
465     if (length $self->{s_kwd}) {
466    
467     $self->{s_kwd} .= '!';
468     #
469     } else {
470    
471     #$self->{s_kwd} = '';
472     #
473     }
474     #
475     } elsif ($self->{nc} == 0x003C) { # <
476     if ($self->{content_model} & CM_FULL_MARKUP or # PCDATA
477     (($self->{content_model} & CM_LIMITED_MARKUP) and # CDATA | RCDATA
478     not $self->{escape})) {
479    
480     $self->{state} = TAG_OPEN_STATE;
481    
482     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
483     $self->{line_prev} = $self->{line};
484     $self->{column_prev} = $self->{column};
485     $self->{column}++;
486     $self->{nc}
487     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
488     } else {
489     $self->{set_nc}->($self);
490     }
491    
492     redo A;
493     } else {
494    
495     $self->{s_kwd} = '';
496     #
497     }
498     } elsif ($self->{nc} == 0x003E) { # >
499     if ($self->{escape} and
500     ($self->{content_model} & CM_LIMITED_MARKUP)) { # RCDATA | CDATA
501     if ($self->{s_kwd} eq '--') {
502    
503     delete $self->{escape};
504 wakaba 1.5 #
505 wakaba 1.1 } else {
506    
507 wakaba 1.5 #
508 wakaba 1.1 }
509 wakaba 1.5 } elsif ($self->{is_xml} and $self->{s_kwd} eq ']]') {
510    
511     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unmatched mse', ## TODO: type
512     line => $self->{line_prev},
513     column => $self->{column_prev} - 1);
514     #
515 wakaba 1.1 } else {
516    
517 wakaba 1.5 #
518 wakaba 1.1 }
519    
520     $self->{s_kwd} = '';
521     #
522 wakaba 1.5 } elsif ($self->{nc} == 0x005D) { # ]
523     if ($self->{s_kwd} eq ']' or $self->{s_kwd} eq '') {
524    
525     $self->{s_kwd} .= ']';
526     } elsif ($self->{s_kwd} eq ']]') {
527    
528     #
529     } else {
530    
531     $self->{s_kwd} = '';
532     }
533     #
534 wakaba 1.1 } elsif ($self->{nc} == -1) {
535    
536     $self->{s_kwd} = '';
537     return ({type => END_OF_FILE_TOKEN,
538     line => $self->{line}, column => $self->{column}});
539     last A; ## TODO: ok?
540     } else {
541    
542     $self->{s_kwd} = '';
543     #
544     }
545    
546     # Anything else
547     my $token = {type => CHARACTER_TOKEN,
548     data => chr $self->{nc},
549     line => $self->{line}, column => $self->{column},
550     };
551 wakaba 1.5 if ($self->{read_until}->($token->{data}, q{-!<>&\]},
552 wakaba 1.1 length $token->{data})) {
553     $self->{s_kwd} = '';
554     }
555    
556     ## Stay in the data state.
557 wakaba 1.5 if (not $self->{is_xml} and
558     $self->{content_model} == PCDATA_CONTENT_MODEL) {
559 wakaba 1.1
560     $self->{state} = PCDATA_STATE;
561     } else {
562    
563     ## Stay in the state.
564     }
565    
566     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
567     $self->{line_prev} = $self->{line};
568     $self->{column_prev} = $self->{column};
569     $self->{column}++;
570     $self->{nc}
571     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
572     } else {
573     $self->{set_nc}->($self);
574     }
575    
576     return ($token);
577     redo A;
578     } elsif ($self->{state} == TAG_OPEN_STATE) {
579 wakaba 1.10 ## XML5: "tag state".
580    
581 wakaba 1.1 if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
582     if ($self->{nc} == 0x002F) { # /
583    
584    
585     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
586     $self->{line_prev} = $self->{line};
587     $self->{column_prev} = $self->{column};
588     $self->{column}++;
589     $self->{nc}
590     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
591     } else {
592     $self->{set_nc}->($self);
593     }
594    
595     $self->{state} = CLOSE_TAG_OPEN_STATE;
596     redo A;
597     } elsif ($self->{nc} == 0x0021) { # !
598    
599 wakaba 1.12 $self->{s_kwd} = $self->{escaped} ? '' : '<';
600 wakaba 1.1 #
601     } else {
602    
603 wakaba 1.12 $self->{s_kwd} = '';
604 wakaba 1.1 #
605     }
606    
607     ## reconsume
608     $self->{state} = DATA_STATE;
609     return ({type => CHARACTER_TOKEN, data => '<',
610     line => $self->{line_prev},
611     column => $self->{column_prev},
612     });
613     redo A;
614     } elsif ($self->{content_model} & CM_FULL_MARKUP) { # PCDATA
615     if ($self->{nc} == 0x0021) { # !
616    
617     $self->{state} = MARKUP_DECLARATION_OPEN_STATE;
618    
619     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
620     $self->{line_prev} = $self->{line};
621     $self->{column_prev} = $self->{column};
622     $self->{column}++;
623     $self->{nc}
624     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
625     } else {
626     $self->{set_nc}->($self);
627     }
628    
629     redo A;
630     } elsif ($self->{nc} == 0x002F) { # /
631    
632     $self->{state} = CLOSE_TAG_OPEN_STATE;
633    
634     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
635     $self->{line_prev} = $self->{line};
636     $self->{column_prev} = $self->{column};
637     $self->{column}++;
638     $self->{nc}
639     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
640     } else {
641     $self->{set_nc}->($self);
642     }
643    
644     redo A;
645     } elsif (0x0041 <= $self->{nc} and
646     $self->{nc} <= 0x005A) { # A..Z
647    
648     $self->{ct}
649     = {type => START_TAG_TOKEN,
650 wakaba 1.4 tag_name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
651 wakaba 1.1 line => $self->{line_prev},
652     column => $self->{column_prev}};
653     $self->{state} = TAG_NAME_STATE;
654    
655     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
656     $self->{line_prev} = $self->{line};
657     $self->{column_prev} = $self->{column};
658     $self->{column}++;
659     $self->{nc}
660     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
661     } else {
662     $self->{set_nc}->($self);
663     }
664    
665     redo A;
666     } elsif (0x0061 <= $self->{nc} and
667     $self->{nc} <= 0x007A) { # a..z
668    
669     $self->{ct} = {type => START_TAG_TOKEN,
670     tag_name => chr ($self->{nc}),
671     line => $self->{line_prev},
672     column => $self->{column_prev}};
673     $self->{state} = TAG_NAME_STATE;
674    
675     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
676     $self->{line_prev} = $self->{line};
677     $self->{column_prev} = $self->{column};
678     $self->{column}++;
679     $self->{nc}
680     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
681     } else {
682     $self->{set_nc}->($self);
683     }
684    
685     redo A;
686     } elsif ($self->{nc} == 0x003E) { # >
687    
688     $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty start tag',
689     line => $self->{line_prev},
690     column => $self->{column_prev});
691     $self->{state} = DATA_STATE;
692 wakaba 1.5 $self->{s_kwd} = '';
693 wakaba 1.1
694     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
695     $self->{line_prev} = $self->{line};
696     $self->{column_prev} = $self->{column};
697     $self->{column}++;
698     $self->{nc}
699     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
700     } else {
701     $self->{set_nc}->($self);
702     }
703    
704    
705     return ({type => CHARACTER_TOKEN, data => '<>',
706     line => $self->{line_prev},
707     column => $self->{column_prev},
708     });
709    
710     redo A;
711     } elsif ($self->{nc} == 0x003F) { # ?
712 wakaba 1.8 if ($self->{is_xml}) {
713    
714     $self->{state} = PI_STATE;
715    
716     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
717     $self->{line_prev} = $self->{line};
718     $self->{column_prev} = $self->{column};
719     $self->{column}++;
720     $self->{nc}
721     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
722     } else {
723     $self->{set_nc}->($self);
724     }
725    
726     redo A;
727     } else {
728    
729     $self->{parse_error}->(level => $self->{level}->{must}, type => 'pio',
730     line => $self->{line_prev},
731     column => $self->{column_prev});
732     $self->{state} = BOGUS_COMMENT_STATE;
733     $self->{ct} = {type => COMMENT_TOKEN, data => '',
734     line => $self->{line_prev},
735     column => $self->{column_prev},
736     };
737     ## $self->{nc} is intentionally left as is
738     redo A;
739     }
740 wakaba 1.9 } elsif (not $self->{is_xml} or $is_space->{$self->{nc}}) {
741 wakaba 1.1
742     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare stago',
743     line => $self->{line_prev},
744     column => $self->{column_prev});
745     $self->{state} = DATA_STATE;
746 wakaba 1.5 $self->{s_kwd} = '';
747 wakaba 1.1 ## reconsume
748    
749     return ({type => CHARACTER_TOKEN, data => '<',
750     line => $self->{line_prev},
751     column => $self->{column_prev},
752     });
753    
754     redo A;
755 wakaba 1.9 } else {
756     ## XML5: "<:" is a parse error.
757    
758     $self->{ct} = {type => START_TAG_TOKEN,
759     tag_name => chr ($self->{nc}),
760     line => $self->{line_prev},
761     column => $self->{column_prev}};
762     $self->{state} = TAG_NAME_STATE;
763    
764     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
765     $self->{line_prev} = $self->{line};
766     $self->{column_prev} = $self->{column};
767     $self->{column}++;
768     $self->{nc}
769     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
770     } else {
771     $self->{set_nc}->($self);
772     }
773    
774     redo A;
775 wakaba 1.1 }
776     } else {
777     die "$0: $self->{content_model} in tag open";
778     }
779     } elsif ($self->{state} == CLOSE_TAG_OPEN_STATE) {
780     ## NOTE: The "close tag open state" in the spec is implemented as
781     ## |CLOSE_TAG_OPEN_STATE| and |CDATA_RCDATA_CLOSE_TAG_STATE|.
782    
783 wakaba 1.10 ## XML5: "end tag state".
784    
785 wakaba 1.1 my ($l, $c) = ($self->{line_prev}, $self->{column_prev} - 1); # "<"of"</"
786     if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
787     if (defined $self->{last_stag_name}) {
788     $self->{state} = CDATA_RCDATA_CLOSE_TAG_STATE;
789 wakaba 1.12 $self->{kwd} = '';
790 wakaba 1.1 ## Reconsume.
791     redo A;
792     } else {
793     ## No start tag token has ever been emitted
794     ## NOTE: See <http://krijnhoetmer.nl/irc-logs/whatwg/20070626#l-564>.
795    
796     $self->{state} = DATA_STATE;
797 wakaba 1.5 $self->{s_kwd} = '';
798 wakaba 1.1 ## Reconsume.
799     return ({type => CHARACTER_TOKEN, data => '</',
800     line => $l, column => $c,
801     });
802     redo A;
803     }
804     }
805    
806     if (0x0041 <= $self->{nc} and
807     $self->{nc} <= 0x005A) { # A..Z
808    
809     $self->{ct}
810     = {type => END_TAG_TOKEN,
811 wakaba 1.4 tag_name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
812 wakaba 1.1 line => $l, column => $c};
813     $self->{state} = TAG_NAME_STATE;
814    
815     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
816     $self->{line_prev} = $self->{line};
817     $self->{column_prev} = $self->{column};
818     $self->{column}++;
819     $self->{nc}
820     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
821     } else {
822     $self->{set_nc}->($self);
823     }
824    
825     redo A;
826     } elsif (0x0061 <= $self->{nc} and
827     $self->{nc} <= 0x007A) { # a..z
828    
829     $self->{ct} = {type => END_TAG_TOKEN,
830     tag_name => chr ($self->{nc}),
831     line => $l, column => $c};
832     $self->{state} = TAG_NAME_STATE;
833    
834     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
835     $self->{line_prev} = $self->{line};
836     $self->{column_prev} = $self->{column};
837     $self->{column}++;
838     $self->{nc}
839     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
840     } else {
841     $self->{set_nc}->($self);
842     }
843    
844     redo A;
845     } elsif ($self->{nc} == 0x003E) { # >
846     $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty end tag',
847     line => $self->{line_prev}, ## "<" in "</>"
848     column => $self->{column_prev} - 1);
849     $self->{state} = DATA_STATE;
850 wakaba 1.5 $self->{s_kwd} = '';
851 wakaba 1.10 if ($self->{is_xml}) {
852    
853     ## XML5: No parse error.
854    
855     ## NOTE: This parser raises a parse error, since it supports
856     ## XML1, not XML5.
857    
858     ## NOTE: A short end tag token.
859     my $ct = {type => END_TAG_TOKEN,
860     tag_name => '',
861     line => $self->{line_prev},
862     column => $self->{column_prev} - 1,
863     };
864    
865     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
866     $self->{line_prev} = $self->{line};
867     $self->{column_prev} = $self->{column};
868     $self->{column}++;
869     $self->{nc}
870     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
871     } else {
872     $self->{set_nc}->($self);
873     }
874    
875     return ($ct);
876     } else {
877    
878    
879 wakaba 1.1 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
880     $self->{line_prev} = $self->{line};
881     $self->{column_prev} = $self->{column};
882     $self->{column}++;
883     $self->{nc}
884     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
885     } else {
886     $self->{set_nc}->($self);
887     }
888    
889 wakaba 1.10 }
890 wakaba 1.1 redo A;
891     } elsif ($self->{nc} == -1) {
892    
893     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare etago');
894 wakaba 1.5 $self->{s_kwd} = '';
895 wakaba 1.1 $self->{state} = DATA_STATE;
896     # reconsume
897    
898     return ({type => CHARACTER_TOKEN, data => '</',
899     line => $l, column => $c,
900     });
901    
902     redo A;
903 wakaba 1.10 } elsif (not $self->{is_xml} or
904     $is_space->{$self->{nc}}) {
905 wakaba 1.1
906 wakaba 1.10 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus end tag',
907     line => $self->{line_prev}, # "<" of "</"
908     column => $self->{column_prev} - 1);
909 wakaba 1.1 $self->{state} = BOGUS_COMMENT_STATE;
910     $self->{ct} = {type => COMMENT_TOKEN, data => '',
911     line => $self->{line_prev}, # "<" of "</"
912     column => $self->{column_prev} - 1,
913     };
914     ## NOTE: $self->{nc} is intentionally left as is.
915     ## Although the "anything else" case of the spec does not explicitly
916     ## state that the next input character is to be reconsumed,
917     ## it will be included in the |data| of the comment token
918     ## generated from the bogus end tag, as defined in the
919     ## "bogus comment state" entry.
920     redo A;
921 wakaba 1.10 } else {
922     ## XML5: "</:" is a parse error.
923    
924     $self->{ct} = {type => END_TAG_TOKEN,
925     tag_name => chr ($self->{nc}),
926     line => $l, column => $c};
927     $self->{state} = TAG_NAME_STATE; ## XML5: "end tag name state".
928    
929     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
930     $self->{line_prev} = $self->{line};
931     $self->{column_prev} = $self->{column};
932     $self->{column}++;
933     $self->{nc}
934     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
935     } else {
936     $self->{set_nc}->($self);
937     }
938    
939     redo A;
940 wakaba 1.1 }
941     } elsif ($self->{state} == CDATA_RCDATA_CLOSE_TAG_STATE) {
942 wakaba 1.12 my $ch = substr $self->{last_stag_name}, length $self->{kwd}, 1;
943 wakaba 1.1 if (length $ch) {
944     my $CH = $ch;
945     $ch =~ tr/a-z/A-Z/;
946     my $nch = chr $self->{nc};
947     if ($nch eq $ch or $nch eq $CH) {
948    
949     ## Stay in the state.
950 wakaba 1.12 $self->{kwd} .= $nch;
951 wakaba 1.1
952     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
953     $self->{line_prev} = $self->{line};
954     $self->{column_prev} = $self->{column};
955     $self->{column}++;
956     $self->{nc}
957     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
958     } else {
959     $self->{set_nc}->($self);
960     }
961    
962     redo A;
963     } else {
964    
965     $self->{state} = DATA_STATE;
966 wakaba 1.5 $self->{s_kwd} = '';
967 wakaba 1.1 ## Reconsume.
968     return ({type => CHARACTER_TOKEN,
969 wakaba 1.12 data => '</' . $self->{kwd},
970 wakaba 1.1 line => $self->{line_prev},
971 wakaba 1.12 column => $self->{column_prev} - 1 - length $self->{kwd},
972 wakaba 1.1 });
973     redo A;
974     }
975     } else { # after "<{tag-name}"
976     unless ($is_space->{$self->{nc}} or
977     {
978     0x003E => 1, # >
979     0x002F => 1, # /
980     -1 => 1, # EOF
981     }->{$self->{nc}}) {
982    
983     ## Reconsume.
984     $self->{state} = DATA_STATE;
985 wakaba 1.5 $self->{s_kwd} = '';
986 wakaba 1.1 return ({type => CHARACTER_TOKEN,
987 wakaba 1.12 data => '</' . $self->{kwd},
988 wakaba 1.1 line => $self->{line_prev},
989 wakaba 1.12 column => $self->{column_prev} - 1 - length $self->{kwd},
990 wakaba 1.1 });
991     redo A;
992     } else {
993    
994     $self->{ct}
995     = {type => END_TAG_TOKEN,
996     tag_name => $self->{last_stag_name},
997     line => $self->{line_prev},
998 wakaba 1.12 column => $self->{column_prev} - 1 - length $self->{kwd}};
999 wakaba 1.1 $self->{state} = TAG_NAME_STATE;
1000     ## Reconsume.
1001     redo A;
1002     }
1003     }
1004     } elsif ($self->{state} == TAG_NAME_STATE) {
1005     if ($is_space->{$self->{nc}}) {
1006    
1007     $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1008    
1009     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1010     $self->{line_prev} = $self->{line};
1011     $self->{column_prev} = $self->{column};
1012     $self->{column}++;
1013     $self->{nc}
1014     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1015     } else {
1016     $self->{set_nc}->($self);
1017     }
1018    
1019     redo A;
1020     } elsif ($self->{nc} == 0x003E) { # >
1021     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1022    
1023     $self->{last_stag_name} = $self->{ct}->{tag_name};
1024     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1025     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1026     #if ($self->{ct}->{attributes}) {
1027     # ## NOTE: This should never be reached.
1028     # !!! cp (36);
1029     # !!! parse-error (type => 'end tag attribute');
1030     #} else {
1031    
1032     #}
1033     } else {
1034     die "$0: $self->{ct}->{type}: Unknown token type";
1035     }
1036     $self->{state} = DATA_STATE;
1037 wakaba 1.5 $self->{s_kwd} = '';
1038 wakaba 1.1
1039     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1040     $self->{line_prev} = $self->{line};
1041     $self->{column_prev} = $self->{column};
1042     $self->{column}++;
1043     $self->{nc}
1044     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1045     } else {
1046     $self->{set_nc}->($self);
1047     }
1048    
1049    
1050     return ($self->{ct}); # start tag or end tag
1051    
1052     redo A;
1053     } elsif (0x0041 <= $self->{nc} and
1054     $self->{nc} <= 0x005A) { # A..Z
1055    
1056 wakaba 1.4 $self->{ct}->{tag_name}
1057     .= chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020));
1058 wakaba 1.1 # start tag or end tag
1059     ## Stay in this state
1060    
1061     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1062     $self->{line_prev} = $self->{line};
1063     $self->{column_prev} = $self->{column};
1064     $self->{column}++;
1065     $self->{nc}
1066     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1067     } else {
1068     $self->{set_nc}->($self);
1069     }
1070    
1071     redo A;
1072     } elsif ($self->{nc} == -1) {
1073     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1074     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1075    
1076     $self->{last_stag_name} = $self->{ct}->{tag_name};
1077     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1078     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1079     #if ($self->{ct}->{attributes}) {
1080     # ## NOTE: This state should never be reached.
1081     # !!! cp (40);
1082     # !!! parse-error (type => 'end tag attribute');
1083     #} else {
1084    
1085     #}
1086     } else {
1087     die "$0: $self->{ct}->{type}: Unknown token type";
1088     }
1089     $self->{state} = DATA_STATE;
1090 wakaba 1.5 $self->{s_kwd} = '';
1091 wakaba 1.1 # reconsume
1092    
1093     return ($self->{ct}); # start tag or end tag
1094    
1095     redo A;
1096     } elsif ($self->{nc} == 0x002F) { # /
1097    
1098     $self->{state} = SELF_CLOSING_START_TAG_STATE;
1099    
1100     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1101     $self->{line_prev} = $self->{line};
1102     $self->{column_prev} = $self->{column};
1103     $self->{column}++;
1104     $self->{nc}
1105     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1106     } else {
1107     $self->{set_nc}->($self);
1108     }
1109    
1110     redo A;
1111     } else {
1112    
1113     $self->{ct}->{tag_name} .= chr $self->{nc};
1114     # start tag or end tag
1115     ## Stay in the state
1116    
1117     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1118     $self->{line_prev} = $self->{line};
1119     $self->{column_prev} = $self->{column};
1120     $self->{column}++;
1121     $self->{nc}
1122     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1123     } else {
1124     $self->{set_nc}->($self);
1125     }
1126    
1127     redo A;
1128     }
1129     } elsif ($self->{state} == BEFORE_ATTRIBUTE_NAME_STATE) {
1130 wakaba 1.11 ## XML5: "Tag attribute name before state".
1131    
1132 wakaba 1.1 if ($is_space->{$self->{nc}}) {
1133    
1134     ## Stay in the state
1135    
1136     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1137     $self->{line_prev} = $self->{line};
1138     $self->{column_prev} = $self->{column};
1139     $self->{column}++;
1140     $self->{nc}
1141     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1142     } else {
1143     $self->{set_nc}->($self);
1144     }
1145    
1146     redo A;
1147     } elsif ($self->{nc} == 0x003E) { # >
1148     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1149    
1150     $self->{last_stag_name} = $self->{ct}->{tag_name};
1151     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1152     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1153     if ($self->{ct}->{attributes}) {
1154    
1155     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1156     } else {
1157    
1158     }
1159     } else {
1160     die "$0: $self->{ct}->{type}: Unknown token type";
1161     }
1162     $self->{state} = DATA_STATE;
1163 wakaba 1.5 $self->{s_kwd} = '';
1164 wakaba 1.1
1165     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1166     $self->{line_prev} = $self->{line};
1167     $self->{column_prev} = $self->{column};
1168     $self->{column}++;
1169     $self->{nc}
1170     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1171     } else {
1172     $self->{set_nc}->($self);
1173     }
1174    
1175    
1176     return ($self->{ct}); # start tag or end tag
1177    
1178     redo A;
1179     } elsif (0x0041 <= $self->{nc} and
1180     $self->{nc} <= 0x005A) { # A..Z
1181    
1182     $self->{ca}
1183 wakaba 1.4 = {name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
1184 wakaba 1.1 value => '',
1185     line => $self->{line}, column => $self->{column}};
1186     $self->{state} = ATTRIBUTE_NAME_STATE;
1187    
1188     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1189     $self->{line_prev} = $self->{line};
1190     $self->{column_prev} = $self->{column};
1191     $self->{column}++;
1192     $self->{nc}
1193     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1194     } else {
1195     $self->{set_nc}->($self);
1196     }
1197    
1198     redo A;
1199     } elsif ($self->{nc} == 0x002F) { # /
1200    
1201     $self->{state} = SELF_CLOSING_START_TAG_STATE;
1202    
1203     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1204     $self->{line_prev} = $self->{line};
1205     $self->{column_prev} = $self->{column};
1206     $self->{column}++;
1207     $self->{nc}
1208     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1209     } else {
1210     $self->{set_nc}->($self);
1211     }
1212    
1213     redo A;
1214     } elsif ($self->{nc} == -1) {
1215     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1216     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1217    
1218     $self->{last_stag_name} = $self->{ct}->{tag_name};
1219     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1220     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1221     if ($self->{ct}->{attributes}) {
1222    
1223     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1224     } else {
1225    
1226     }
1227     } else {
1228     die "$0: $self->{ct}->{type}: Unknown token type";
1229     }
1230     $self->{state} = DATA_STATE;
1231 wakaba 1.5 $self->{s_kwd} = '';
1232 wakaba 1.1 # reconsume
1233    
1234     return ($self->{ct}); # start tag or end tag
1235    
1236     redo A;
1237     } else {
1238     if ({
1239     0x0022 => 1, # "
1240     0x0027 => 1, # '
1241     0x003D => 1, # =
1242     }->{$self->{nc}}) {
1243    
1244 wakaba 1.11 ## XML5: Not a parse error.
1245 wakaba 1.1 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name');
1246     } else {
1247    
1248 wakaba 1.11 ## XML5: ":" raises a parse error and is ignored.
1249 wakaba 1.1 }
1250     $self->{ca}
1251     = {name => chr ($self->{nc}),
1252     value => '',
1253     line => $self->{line}, column => $self->{column}};
1254     $self->{state} = ATTRIBUTE_NAME_STATE;
1255    
1256     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1257     $self->{line_prev} = $self->{line};
1258     $self->{column_prev} = $self->{column};
1259     $self->{column}++;
1260     $self->{nc}
1261     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1262     } else {
1263     $self->{set_nc}->($self);
1264     }
1265    
1266     redo A;
1267     }
1268     } elsif ($self->{state} == ATTRIBUTE_NAME_STATE) {
1269 wakaba 1.11 ## XML5: "Tag attribute name state".
1270    
1271 wakaba 1.1 my $before_leave = sub {
         ## Commit the attribute currently being built ($self->{ca}) to the
         ## current tag token ($self->{ct}).  The branches of this state
         ## call it just before switching to another state or emitting
         ## the token.
         ##
         ## - If the token already has an attribute with the same name, a
         ##   'duplicate attribute' parse error is reported at the position
         ##   recorded in $self->{ca} and the new attribute is not stored.
         ## - Otherwise the attribute is stored under its name and receives
         ##   the next value of the token's |last_index| counter as its
         ##   |index|.
         ##
         ## NOTE: The nested |exists| test below autovivifies
         ## $self->{ct}->{attributes} as an empty hash reference when it is
         ## not defined yet, so the token's |attributes| member is always a
         ## (true) hash reference after this closure has run.
1272     if (exists $self->{ct}->{attributes} # start tag or end tag
1273     ->{$self->{ca}->{name}}) { # MUST
1274    
1275     $self->{parse_error}->(level => $self->{level}->{must}, type => 'duplicate attribute', text => $self->{ca}->{name}, line => $self->{ca}->{line}, column => $self->{ca}->{column});
1276     ## Discard $self->{ca} # MUST
1277     } else {
1278    
         ## First occurrence of this name: register the attribute on the
         ## token, then number it (pre-increment, so the counter is bumped
         ## before being assigned).
1279     $self->{ct}->{attributes}->{$self->{ca}->{name}}
1280     = $self->{ca};
1281 wakaba 1.11 $self->{ca}->{index} = ++$self->{ct}->{last_index};
1282 wakaba 1.1 }
1283     }; # $before_leave
1284    
1285     if ($is_space->{$self->{nc}}) {
1286    
1287     $before_leave->();
1288     $self->{state} = AFTER_ATTRIBUTE_NAME_STATE;
1289    
1290     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1291     $self->{line_prev} = $self->{line};
1292     $self->{column_prev} = $self->{column};
1293     $self->{column}++;
1294     $self->{nc}
1295     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1296     } else {
1297     $self->{set_nc}->($self);
1298     }
1299    
1300     redo A;
1301     } elsif ($self->{nc} == 0x003D) { # =
1302    
1303     $before_leave->();
1304     $self->{state} = BEFORE_ATTRIBUTE_VALUE_STATE;
1305    
1306     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1307     $self->{line_prev} = $self->{line};
1308     $self->{column_prev} = $self->{column};
1309     $self->{column}++;
1310     $self->{nc}
1311     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1312     } else {
1313     $self->{set_nc}->($self);
1314     }
1315    
1316     redo A;
1317     } elsif ($self->{nc} == 0x003E) { # >
1318 wakaba 1.11 if ($self->{is_xml}) {
1319    
1320     ## XML5: Not a parse error.
1321     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr value'); ## TODO: type
1322     } else {
1323    
1324     }
1325    
1326 wakaba 1.1 $before_leave->();
1327     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1328    
1329     $self->{last_stag_name} = $self->{ct}->{tag_name};
1330     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1331    
1332     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1333     if ($self->{ct}->{attributes}) {
1334     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1335     }
1336     } else {
1337     die "$0: $self->{ct}->{type}: Unknown token type";
1338     }
1339     $self->{state} = DATA_STATE;
1340 wakaba 1.5 $self->{s_kwd} = '';
1341 wakaba 1.1
1342     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1343     $self->{line_prev} = $self->{line};
1344     $self->{column_prev} = $self->{column};
1345     $self->{column}++;
1346     $self->{nc}
1347     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1348     } else {
1349     $self->{set_nc}->($self);
1350     }
1351    
1352    
1353     return ($self->{ct}); # start tag or end tag
1354    
1355     redo A;
1356     } elsif (0x0041 <= $self->{nc} and
1357     $self->{nc} <= 0x005A) { # A..Z
1358    
1359 wakaba 1.4 $self->{ca}->{name}
1360     .= chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020));
1361 wakaba 1.1 ## Stay in the state
1362    
1363     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1364     $self->{line_prev} = $self->{line};
1365     $self->{column_prev} = $self->{column};
1366     $self->{column}++;
1367     $self->{nc}
1368     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1369     } else {
1370     $self->{set_nc}->($self);
1371     }
1372    
1373     redo A;
1374     } elsif ($self->{nc} == 0x002F) { # /
1375 wakaba 1.11 if ($self->{is_xml}) {
1376    
1377     ## XML5: Not a parse error.
1378     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr value'); ## TODO: type
1379     } else {
1380    
1381     }
1382 wakaba 1.1
1383     $before_leave->();
1384     $self->{state} = SELF_CLOSING_START_TAG_STATE;
1385    
1386     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1387     $self->{line_prev} = $self->{line};
1388     $self->{column_prev} = $self->{column};
1389     $self->{column}++;
1390     $self->{nc}
1391     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1392     } else {
1393     $self->{set_nc}->($self);
1394     }
1395    
1396     redo A;
1397     } elsif ($self->{nc} == -1) {
1398     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1399     $before_leave->();
1400     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1401    
1402     $self->{last_stag_name} = $self->{ct}->{tag_name};
1403     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1404     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1405     if ($self->{ct}->{attributes}) {
1406    
1407     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1408     } else {
1409     ## NOTE: This state should never be reached.
1410    
1411     }
1412     } else {
1413     die "$0: $self->{ct}->{type}: Unknown token type";
1414     }
1415     $self->{state} = DATA_STATE;
1416 wakaba 1.5 $self->{s_kwd} = '';
1417 wakaba 1.1 # reconsume
1418    
1419     return ($self->{ct}); # start tag or end tag
1420    
1421     redo A;
1422     } else {
1423     if ($self->{nc} == 0x0022 or # "
1424     $self->{nc} == 0x0027) { # '
1425    
1426 wakaba 1.11 ## XML5: Not a parse error.
1427 wakaba 1.1 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name');
1428     } else {
1429    
1430     }
1431     $self->{ca}->{name} .= chr ($self->{nc});
1432     ## Stay in the state
1433    
1434     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1435     $self->{line_prev} = $self->{line};
1436     $self->{column_prev} = $self->{column};
1437     $self->{column}++;
1438     $self->{nc}
1439     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1440     } else {
1441     $self->{set_nc}->($self);
1442     }
1443    
1444     redo A;
1445     }
1446     } elsif ($self->{state} == AFTER_ATTRIBUTE_NAME_STATE) {
1447 wakaba 1.11 ## XML5: "Tag attribute name after state".
1448    
1449 wakaba 1.1 if ($is_space->{$self->{nc}}) {
1450    
1451     ## Stay in the state
1452    
1453     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1454     $self->{line_prev} = $self->{line};
1455     $self->{column_prev} = $self->{column};
1456     $self->{column}++;
1457     $self->{nc}
1458     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1459     } else {
1460     $self->{set_nc}->($self);
1461     }
1462    
1463     redo A;
1464     } elsif ($self->{nc} == 0x003D) { # =
1465    
1466     $self->{state} = BEFORE_ATTRIBUTE_VALUE_STATE;
1467    
1468     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1469     $self->{line_prev} = $self->{line};
1470     $self->{column_prev} = $self->{column};
1471     $self->{column}++;
1472     $self->{nc}
1473     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1474     } else {
1475     $self->{set_nc}->($self);
1476     }
1477    
1478     redo A;
1479     } elsif ($self->{nc} == 0x003E) { # >
1480 wakaba 1.11 if ($self->{is_xml}) {
1481    
1482     ## XML5: Not a parse error.
1483     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr value'); ## TODO: type
1484     } else {
1485    
1486     }
1487    
1488 wakaba 1.1 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1489    
1490     $self->{last_stag_name} = $self->{ct}->{tag_name};
1491     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1492     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1493     if ($self->{ct}->{attributes}) {
1494    
1495     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1496     } else {
1497     ## NOTE: This state should never be reached.
1498    
1499     }
1500     } else {
1501     die "$0: $self->{ct}->{type}: Unknown token type";
1502     }
1503     $self->{state} = DATA_STATE;
1504 wakaba 1.5 $self->{s_kwd} = '';
1505 wakaba 1.1
1506     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1507     $self->{line_prev} = $self->{line};
1508     $self->{column_prev} = $self->{column};
1509     $self->{column}++;
1510     $self->{nc}
1511     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1512     } else {
1513     $self->{set_nc}->($self);
1514     }
1515    
1516    
1517     return ($self->{ct}); # start tag or end tag
1518    
1519     redo A;
1520     } elsif (0x0041 <= $self->{nc} and
1521     $self->{nc} <= 0x005A) { # A..Z
1522    
1523     $self->{ca}
1524 wakaba 1.4 = {name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
1525 wakaba 1.1 value => '',
1526     line => $self->{line}, column => $self->{column}};
1527     $self->{state} = ATTRIBUTE_NAME_STATE;
1528    
1529     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1530     $self->{line_prev} = $self->{line};
1531     $self->{column_prev} = $self->{column};
1532     $self->{column}++;
1533     $self->{nc}
1534     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1535     } else {
1536     $self->{set_nc}->($self);
1537     }
1538    
1539     redo A;
1540     } elsif ($self->{nc} == 0x002F) { # /
1541 wakaba 1.11 if ($self->{is_xml}) {
1542    
1543     ## XML5: Not a parse error.
1544     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr value'); ## TODO: type
1545     } else {
1546    
1547     }
1548 wakaba 1.1
1549     $self->{state} = SELF_CLOSING_START_TAG_STATE;
1550    
1551     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1552     $self->{line_prev} = $self->{line};
1553     $self->{column_prev} = $self->{column};
1554     $self->{column}++;
1555     $self->{nc}
1556     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1557     } else {
1558     $self->{set_nc}->($self);
1559     }
1560    
1561     redo A;
1562     } elsif ($self->{nc} == -1) {
1563     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1564     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1565    
1566     $self->{last_stag_name} = $self->{ct}->{tag_name};
1567     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1568     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1569     if ($self->{ct}->{attributes}) {
1570    
1571     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1572     } else {
1573     ## NOTE: This state should never be reached.
1574    
1575     }
1576     } else {
1577     die "$0: $self->{ct}->{type}: Unknown token type";
1578     }
1579 wakaba 1.5 $self->{s_kwd} = '';
1580 wakaba 1.1 $self->{state} = DATA_STATE;
1581     # reconsume
1582    
1583     return ($self->{ct}); # start tag or end tag
1584    
1585     redo A;
1586     } else {
1587 wakaba 1.11 if ($self->{is_xml}) {
1588    
1589     ## XML5: Not a parse error.
1590     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr value'); ## TODO: type
1591     } else {
1592    
1593     }
1594    
1595 wakaba 1.1 if ($self->{nc} == 0x0022 or # "
1596     $self->{nc} == 0x0027) { # '
1597    
1598 wakaba 1.11 ## XML5: Not a parse error.
1599 wakaba 1.1 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name');
1600     } else {
1601    
1602     }
1603     $self->{ca}
1604     = {name => chr ($self->{nc}),
1605     value => '',
1606     line => $self->{line}, column => $self->{column}};
1607     $self->{state} = ATTRIBUTE_NAME_STATE;
1608    
1609     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1610     $self->{line_prev} = $self->{line};
1611     $self->{column_prev} = $self->{column};
1612     $self->{column}++;
1613     $self->{nc}
1614     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1615     } else {
1616     $self->{set_nc}->($self);
1617     }
1618    
1619     redo A;
1620     }
1621     } elsif ($self->{state} == BEFORE_ATTRIBUTE_VALUE_STATE) {
1622 wakaba 1.11 ## XML5: "Tag attribute value before state".
1623    
1624 wakaba 1.1 if ($is_space->{$self->{nc}}) {
1625    
1626     ## Stay in the state
1627    
1628     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1629     $self->{line_prev} = $self->{line};
1630     $self->{column_prev} = $self->{column};
1631     $self->{column}++;
1632     $self->{nc}
1633     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1634     } else {
1635     $self->{set_nc}->($self);
1636     }
1637    
1638     redo A;
1639     } elsif ($self->{nc} == 0x0022) { # "
1640    
1641     $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
1642    
1643     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1644     $self->{line_prev} = $self->{line};
1645     $self->{column_prev} = $self->{column};
1646     $self->{column}++;
1647     $self->{nc}
1648     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1649     } else {
1650     $self->{set_nc}->($self);
1651     }
1652    
1653     redo A;
1654     } elsif ($self->{nc} == 0x0026) { # &
1655    
1656     $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
1657     ## reconsume
1658     redo A;
1659     } elsif ($self->{nc} == 0x0027) { # '
1660    
1661     $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
1662    
1663     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1664     $self->{line_prev} = $self->{line};
1665     $self->{column_prev} = $self->{column};
1666     $self->{column}++;
1667     $self->{nc}
1668     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1669     } else {
1670     $self->{set_nc}->($self);
1671     }
1672    
1673     redo A;
1674     } elsif ($self->{nc} == 0x003E) { # >
1675     $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty unquoted attribute value');
1676     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1677    
1678     $self->{last_stag_name} = $self->{ct}->{tag_name};
1679     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1680     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1681     if ($self->{ct}->{attributes}) {
1682    
1683     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1684     } else {
1685     ## NOTE: This state should never be reached.
1686    
1687     }
1688     } else {
1689     die "$0: $self->{ct}->{type}: Unknown token type";
1690     }
1691     $self->{state} = DATA_STATE;
1692 wakaba 1.5 $self->{s_kwd} = '';
1693 wakaba 1.1
1694     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1695     $self->{line_prev} = $self->{line};
1696     $self->{column_prev} = $self->{column};
1697     $self->{column}++;
1698     $self->{nc}
1699     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1700     } else {
1701     $self->{set_nc}->($self);
1702     }
1703    
1704    
1705     return ($self->{ct}); # start tag or end tag
1706    
1707     redo A;
1708     } elsif ($self->{nc} == -1) {
1709     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1710     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1711    
1712     $self->{last_stag_name} = $self->{ct}->{tag_name};
1713     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1714     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1715     if ($self->{ct}->{attributes}) {
1716    
1717     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1718     } else {
1719     ## NOTE: This state should never be reached.
1720    
1721     }
1722     } else {
1723     die "$0: $self->{ct}->{type}: Unknown token type";
1724     }
1725     $self->{state} = DATA_STATE;
1726 wakaba 1.5 $self->{s_kwd} = '';
1727 wakaba 1.1 ## reconsume
1728    
1729     return ($self->{ct}); # start tag or end tag
1730    
1731     redo A;
1732     } else {
1733     if ($self->{nc} == 0x003D) { # =
1734    
1735 wakaba 1.11 ## XML5: Not a parse error.
1736 wakaba 1.1 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute value');
1737 wakaba 1.11 } elsif ($self->{is_xml}) {
1738    
1739     ## XML5: No parse error.
1740     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unquoted attr value'); ## TODO
1741 wakaba 1.1 } else {
1742    
1743     }
1744     $self->{ca}->{value} .= chr ($self->{nc});
1745     $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
1746    
1747     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1748     $self->{line_prev} = $self->{line};
1749     $self->{column_prev} = $self->{column};
1750     $self->{column}++;
1751     $self->{nc}
1752     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1753     } else {
1754     $self->{set_nc}->($self);
1755     }
1756    
1757     redo A;
1758     }
1759     } elsif ($self->{state} == ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE) {
1760 wakaba 1.15 ## XML5: "Tag attribute value double quoted state" and "DOCTYPE
1761     ## ATTLIST attribute value double quoted state".
1762 wakaba 1.11
1763 wakaba 1.1 if ($self->{nc} == 0x0022) { # "
1764 wakaba 1.15 if ($self->{ct}->{type} == ATTLIST_TOKEN) {
1765    
1766     ## XML5: "DOCTYPE ATTLIST name after state".
1767     push @{$self->{ct}->{attrdefs}}, $self->{ca};
1768     $self->{state} = AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE;
1769     } else {
1770    
1771     ## XML5: "Tag attribute name before state".
1772     $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1773     }
1774 wakaba 1.1
1775     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1776     $self->{line_prev} = $self->{line};
1777     $self->{column_prev} = $self->{column};
1778     $self->{column}++;
1779     $self->{nc}
1780     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1781     } else {
1782     $self->{set_nc}->($self);
1783     }
1784    
1785     redo A;
1786     } elsif ($self->{nc} == 0x0026) { # &
1787    
1788 wakaba 1.11 ## XML5: Not defined yet.
1789    
1790 wakaba 1.1 ## NOTE: In the spec, the tokenizer is switched to the
1791     ## "entity in attribute value state". In this implementation, the
1792     ## tokenizer is switched to the |ENTITY_STATE|, which is an
1793     ## implementation of the "consume a character reference" algorithm.
1794     $self->{prev_state} = $self->{state};
1795     $self->{entity_add} = 0x0022; # "
1796     $self->{state} = ENTITY_STATE;
1797    
1798     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1799     $self->{line_prev} = $self->{line};
1800     $self->{column_prev} = $self->{column};
1801     $self->{column}++;
1802     $self->{nc}
1803     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1804     } else {
1805     $self->{set_nc}->($self);
1806     }
1807    
1808     redo A;
1809     } elsif ($self->{nc} == -1) {
1810     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed attribute value');
1811     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1812    
1813     $self->{last_stag_name} = $self->{ct}->{tag_name};
1814 wakaba 1.15
1815     $self->{state} = DATA_STATE;
1816     $self->{s_kwd} = '';
1817     ## reconsume
1818     return ($self->{ct}); # start tag
1819     redo A;
1820 wakaba 1.1 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1821     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1822     if ($self->{ct}->{attributes}) {
1823    
1824     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1825     } else {
1826     ## NOTE: This state should never be reached.
1827    
1828     }
1829 wakaba 1.15
1830     $self->{state} = DATA_STATE;
1831     $self->{s_kwd} = '';
1832     ## reconsume
1833     return ($self->{ct}); # end tag
1834     redo A;
1835     } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
1836     ## XML5: No parse error above; not defined yet.
1837     push @{$self->{ct}->{attrdefs}}, $self->{ca};
1838     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1839     ## Reconsume.
1840     return ($self->{ct}); # ATTLIST
1841     redo A;
1842 wakaba 1.1 } else {
1843     die "$0: $self->{ct}->{type}: Unknown token type";
1844     }
1845     } else {
1846 wakaba 1.15 ## XML5 [ATTLIST]: Not defined yet.
1847 wakaba 1.11 if ($self->{is_xml} and $self->{nc} == 0x003C) { # <
1848    
1849     ## XML5: Not a parse error.
1850     $self->{parse_error}->(level => $self->{level}->{must}, type => 'lt in attr value'); ## TODO: type
1851     } else {
1852    
1853     }
1854 wakaba 1.1 $self->{ca}->{value} .= chr ($self->{nc});
1855     $self->{read_until}->($self->{ca}->{value},
1856 wakaba 1.11 q["&<],
1857 wakaba 1.1 length $self->{ca}->{value});
1858    
1859     ## Stay in the state
1860    
1861     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1862     $self->{line_prev} = $self->{line};
1863     $self->{column_prev} = $self->{column};
1864     $self->{column}++;
1865     $self->{nc}
1866     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1867     } else {
1868     $self->{set_nc}->($self);
1869     }
1870    
1871     redo A;
1872     }
1873     } elsif ($self->{state} == ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE) {
1874 wakaba 1.15 ## XML5: "Tag attribute value single quoted state" and "DOCTYPE
1875     ## ATTLIST attribute value single quoted state".
1876 wakaba 1.11
1877 wakaba 1.1 if ($self->{nc} == 0x0027) { # '
1878 wakaba 1.15 if ($self->{ct}->{type} == ATTLIST_TOKEN) {
1879    
1880     ## XML5: "DOCTYPE ATTLIST name after state".
1881     push @{$self->{ct}->{attrdefs}}, $self->{ca};
1882     $self->{state} = AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE;
1883     } else {
1884    
1885     ## XML5: "Before attribute name state" (sic).
1886     $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1887     }
1888 wakaba 1.1
1889     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1890     $self->{line_prev} = $self->{line};
1891     $self->{column_prev} = $self->{column};
1892     $self->{column}++;
1893     $self->{nc}
1894     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1895     } else {
1896     $self->{set_nc}->($self);
1897     }
1898    
1899     redo A;
1900     } elsif ($self->{nc} == 0x0026) { # &
1901    
1902 wakaba 1.11 ## XML5: Not defined yet.
1903    
1904 wakaba 1.1 ## NOTE: In the spec, the tokenizer is switched to the
1905     ## "entity in attribute value state". In this implementation, the
1906     ## tokenizer is switched to the |ENTITY_STATE|, which is an
1907     ## implementation of the "consume a character reference" algorithm.
1908     $self->{entity_add} = 0x0027; # '
1909     $self->{prev_state} = $self->{state};
1910     $self->{state} = ENTITY_STATE;
1911    
1912     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1913     $self->{line_prev} = $self->{line};
1914     $self->{column_prev} = $self->{column};
1915     $self->{column}++;
1916     $self->{nc}
1917     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1918     } else {
1919     $self->{set_nc}->($self);
1920     }
1921    
1922     redo A;
1923     } elsif ($self->{nc} == -1) {
1924     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed attribute value');
1925     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1926    
1927     $self->{last_stag_name} = $self->{ct}->{tag_name};
1928 wakaba 1.15
1929     $self->{state} = DATA_STATE;
1930     $self->{s_kwd} = '';
1931     ## reconsume
1932     return ($self->{ct}); # start tag
1933     redo A;
1934 wakaba 1.1 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1935     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1936     if ($self->{ct}->{attributes}) {
1937    
1938     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1939     } else {
1940     ## NOTE: This state should never be reached.
1941    
1942     }
1943 wakaba 1.15
1944     $self->{state} = DATA_STATE;
1945     $self->{s_kwd} = '';
1946     ## reconsume
1947     return ($self->{ct}); # end tag
1948     redo A;
1949     } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
1950     ## XML5: No parse error above; not defined yet.
1951     push @{$self->{ct}->{attrdefs}}, $self->{ca};
1952     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1953     ## Reconsume.
1954     return ($self->{ct}); # ATTLIST
1955     redo A;
1956 wakaba 1.1 } else {
1957     die "$0: $self->{ct}->{type}: Unknown token type";
1958     }
1959     } else {
1960 wakaba 1.15 ## XML5 [ATTLIST]: Not defined yet.
1961 wakaba 1.11 if ($self->{is_xml} and $self->{nc} == 0x003C) { # <
1962    
1963     ## XML5: Not a parse error.
1964     $self->{parse_error}->(level => $self->{level}->{must}, type => 'lt in attr value'); ## TODO: type
1965     } else {
1966    
1967     }
1968 wakaba 1.1 $self->{ca}->{value} .= chr ($self->{nc});
1969     $self->{read_until}->($self->{ca}->{value},
1970 wakaba 1.11 q['&<],
1971 wakaba 1.1 length $self->{ca}->{value});
1972    
1973     ## Stay in the state
1974    
1975     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1976     $self->{line_prev} = $self->{line};
1977     $self->{column_prev} = $self->{column};
1978     $self->{column}++;
1979     $self->{nc}
1980     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1981     } else {
1982     $self->{set_nc}->($self);
1983     }
1984    
1985     redo A;
1986     }
1987     } elsif ($self->{state} == ATTRIBUTE_VALUE_UNQUOTED_STATE) {
1988 wakaba 1.11 ## XML5: "Tag attribute value unquoted state".
1989    
1990 wakaba 1.1 if ($is_space->{$self->{nc}}) {
1991 wakaba 1.15 if ($self->{ct}->{type} == ATTLIST_TOKEN) {
1992    
1993     push @{$self->{ct}->{attrdefs}}, $self->{ca};
1994     $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
1995     } else {
1996    
1997     ## XML5: "Tag attribute name before state".
1998     $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1999     }
2000 wakaba 1.1
2001     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2002     $self->{line_prev} = $self->{line};
2003     $self->{column_prev} = $self->{column};
2004     $self->{column}++;
2005     $self->{nc}
2006     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2007     } else {
2008     $self->{set_nc}->($self);
2009     }
2010    
2011     redo A;
2012     } elsif ($self->{nc} == 0x0026) { # &
2013    
2014 wakaba 1.11
2015     ## XML5: Not defined yet.
2016    
2017 wakaba 1.1 ## NOTE: In the spec, the tokenizer is switched to the
2018     ## "entity in attribute value state". In this implementation, the
2019     ## tokenizer is switched to the |ENTITY_STATE|, which is an
2020     ## implementation of the "consume a character reference" algorithm.
2021     $self->{entity_add} = -1;
2022     $self->{prev_state} = $self->{state};
2023     $self->{state} = ENTITY_STATE;
2024    
2025     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2026     $self->{line_prev} = $self->{line};
2027     $self->{column_prev} = $self->{column};
2028     $self->{column}++;
2029     $self->{nc}
2030     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2031     } else {
2032     $self->{set_nc}->($self);
2033     }
2034    
2035     redo A;
2036     } elsif ($self->{nc} == 0x003E) { # >
2037     if ($self->{ct}->{type} == START_TAG_TOKEN) {
2038    
2039     $self->{last_stag_name} = $self->{ct}->{tag_name};
2040 wakaba 1.15
2041     $self->{state} = DATA_STATE;
2042     $self->{s_kwd} = '';
2043    
2044     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2045     $self->{line_prev} = $self->{line};
2046     $self->{column_prev} = $self->{column};
2047     $self->{column}++;
2048     $self->{nc}
2049     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2050     } else {
2051     $self->{set_nc}->($self);
2052     }
2053    
2054     return ($self->{ct}); # start tag
2055     redo A;
2056 wakaba 1.1 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
2057     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
2058     if ($self->{ct}->{attributes}) {
2059    
2060     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
2061     } else {
2062     ## NOTE: This state should never be reached.
2063    
2064     }
2065 wakaba 1.15
2066     $self->{state} = DATA_STATE;
2067     $self->{s_kwd} = '';
2068    
2069     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2070     $self->{line_prev} = $self->{line};
2071     $self->{column_prev} = $self->{column};
2072     $self->{column}++;
2073     $self->{nc}
2074     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2075     } else {
2076     $self->{set_nc}->($self);
2077     }
2078    
2079     return ($self->{ct}); # end tag
2080     redo A;
2081     } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
2082     push @{$self->{ct}->{attrdefs}}, $self->{ca};
2083     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2084    
2085 wakaba 1.1 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2086     $self->{line_prev} = $self->{line};
2087     $self->{column_prev} = $self->{column};
2088     $self->{column}++;
2089     $self->{nc}
2090     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2091     } else {
2092     $self->{set_nc}->($self);
2093     }
2094    
2095 wakaba 1.15 return ($self->{ct}); # ATTLIST
2096     redo A;
2097     } else {
2098     die "$0: $self->{ct}->{type}: Unknown token type";
2099     }
2100 wakaba 1.1 } elsif ($self->{nc} == -1) {
2101     if ($self->{ct}->{type} == START_TAG_TOKEN) {
2102    
2103 wakaba 1.15 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
2104 wakaba 1.1 $self->{last_stag_name} = $self->{ct}->{tag_name};
2105 wakaba 1.15
2106     $self->{state} = DATA_STATE;
2107     $self->{s_kwd} = '';
2108     ## reconsume
2109     return ($self->{ct}); # start tag
2110     redo A;
2111 wakaba 1.1 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
2112 wakaba 1.15 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
2113 wakaba 1.1 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
2114     if ($self->{ct}->{attributes}) {
2115    
2116     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
2117     } else {
2118     ## NOTE: This state should never be reached.
2119    
2120     }
2121 wakaba 1.15
2122     $self->{state} = DATA_STATE;
2123     $self->{s_kwd} = '';
2124     ## reconsume
2125     return ($self->{ct}); # end tag
2126     redo A;
2127     } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
2128     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
2129     push @{$self->{ct}->{attrdefs}}, $self->{ca};
2130     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2131     ## Reconsume.
2132     return ($self->{ct}); # ATTLIST
2133     redo A;
2134 wakaba 1.1 } else {
2135     die "$0: $self->{ct}->{type}: Unknown token type";
2136     }
2137     } else {
2138     if ({
2139     0x0022 => 1, # "
2140     0x0027 => 1, # '
2141     0x003D => 1, # =
2142     }->{$self->{nc}}) {
2143    
2144 wakaba 1.11 ## XML5: Not a parse error.
2145 wakaba 1.1 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute value');
2146     } else {
2147    
2148     }
2149     $self->{ca}->{value} .= chr ($self->{nc});
2150     $self->{read_until}->($self->{ca}->{value},
2151     q["'=& >],
2152     length $self->{ca}->{value});
2153    
2154     ## Stay in the state
2155    
2156     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2157     $self->{line_prev} = $self->{line};
2158     $self->{column_prev} = $self->{column};
2159     $self->{column}++;
2160     $self->{nc}
2161     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2162     } else {
2163     $self->{set_nc}->($self);
2164     }
2165    
2166     redo A;
2167     }
2168     } elsif ($self->{state} == AFTER_ATTRIBUTE_VALUE_QUOTED_STATE) {
2169     if ($is_space->{$self->{nc}}) {
2170    
2171     $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
2172    
2173     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2174     $self->{line_prev} = $self->{line};
2175     $self->{column_prev} = $self->{column};
2176     $self->{column}++;
2177     $self->{nc}
2178     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2179     } else {
2180     $self->{set_nc}->($self);
2181     }
2182    
2183     redo A;
2184     } elsif ($self->{nc} == 0x003E) { # >
2185     if ($self->{ct}->{type} == START_TAG_TOKEN) {
2186    
2187     $self->{last_stag_name} = $self->{ct}->{tag_name};
2188     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
2189     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
2190     if ($self->{ct}->{attributes}) {
2191    
2192     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
2193     } else {
2194     ## NOTE: This state should never be reached.
2195    
2196     }
2197     } else {
2198     die "$0: $self->{ct}->{type}: Unknown token type";
2199     }
2200     $self->{state} = DATA_STATE;
2201 wakaba 1.5 $self->{s_kwd} = '';
2202 wakaba 1.1
2203     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2204     $self->{line_prev} = $self->{line};
2205     $self->{column_prev} = $self->{column};
2206     $self->{column}++;
2207     $self->{nc}
2208     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2209     } else {
2210     $self->{set_nc}->($self);
2211     }
2212    
2213    
2214     return ($self->{ct}); # start tag or end tag
2215    
2216     redo A;
2217     } elsif ($self->{nc} == 0x002F) { # /
2218    
2219     $self->{state} = SELF_CLOSING_START_TAG_STATE;
2220    
2221     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2222     $self->{line_prev} = $self->{line};
2223     $self->{column_prev} = $self->{column};
2224     $self->{column}++;
2225     $self->{nc}
2226     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2227     } else {
2228     $self->{set_nc}->($self);
2229     }
2230    
2231     redo A;
2232     } elsif ($self->{nc} == -1) {
2233     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
2234     if ($self->{ct}->{type} == START_TAG_TOKEN) {
2235    
2236     $self->{last_stag_name} = $self->{ct}->{tag_name};
2237     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
2238     if ($self->{ct}->{attributes}) {
2239    
2240     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
2241     } else {
2242     ## NOTE: This state should never be reached.
2243    
2244     }
2245     } else {
2246     die "$0: $self->{ct}->{type}: Unknown token type";
2247     }
2248     $self->{state} = DATA_STATE;
2249 wakaba 1.5 $self->{s_kwd} = '';
2250 wakaba 1.1 ## Reconsume.
2251     return ($self->{ct}); # start tag or end tag
2252     redo A;
2253     } else {
2254    
2255     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space between attributes');
2256     $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
2257     ## reconsume
2258     redo A;
2259     }
2260     } elsif ($self->{state} == SELF_CLOSING_START_TAG_STATE) {
2261 wakaba 1.11 ## XML5: "Empty tag state".
2262    
2263 wakaba 1.1 if ($self->{nc} == 0x003E) { # >
2264     if ($self->{ct}->{type} == END_TAG_TOKEN) {
2265    
2266     $self->{parse_error}->(level => $self->{level}->{must}, type => 'nestc', token => $self->{ct});
2267     ## TODO: Different type than slash in start tag
2268     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
2269     if ($self->{ct}->{attributes}) {
2270    
2271     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
2272     } else {
2273    
2274     }
2275     ## TODO: Test |<title></title/>|
2276     } else {
2277    
2278     $self->{self_closing} = 1;
2279     }
2280    
2281     $self->{state} = DATA_STATE;
2282 wakaba 1.5 $self->{s_kwd} = '';
2283 wakaba 1.1
2284     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2285     $self->{line_prev} = $self->{line};
2286     $self->{column_prev} = $self->{column};
2287     $self->{column}++;
2288     $self->{nc}
2289     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2290     } else {
2291     $self->{set_nc}->($self);
2292     }
2293    
2294    
2295     return ($self->{ct}); # start tag or end tag
2296    
2297     redo A;
2298     } elsif ($self->{nc} == -1) {
2299     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
2300     if ($self->{ct}->{type} == START_TAG_TOKEN) {
2301    
2302     $self->{last_stag_name} = $self->{ct}->{tag_name};
2303     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
2304     if ($self->{ct}->{attributes}) {
2305    
2306     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
2307     } else {
2308     ## NOTE: This state should never be reached.
2309    
2310     }
2311     } else {
2312     die "$0: $self->{ct}->{type}: Unknown token type";
2313     }
2314 wakaba 1.11 ## XML5: "Tag attribute name before state".
2315 wakaba 1.1 $self->{state} = DATA_STATE;
2316 wakaba 1.5 $self->{s_kwd} = '';
2317 wakaba 1.1 ## Reconsume.
2318     return ($self->{ct}); # start tag or end tag
2319     redo A;
2320     } else {
2321    
2322     $self->{parse_error}->(level => $self->{level}->{must}, type => 'nestc');
2323     ## TODO: This error type is wrong.
2324     $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
2325     ## Reconsume.
2326     redo A;
2327     }
2328     } elsif ($self->{state} == BOGUS_COMMENT_STATE) {
2329 wakaba 1.14 ## XML5: "Bogus comment state" and "DOCTYPE bogus comment state".
2330    
2331 wakaba 1.1 ## NOTE: Unlike spec's "bogus comment state", this implementation
2332     ## consumes characters on a one-by-one basis.
2333    
2334     if ($self->{nc} == 0x003E) { # >
2335 wakaba 1.13 if ($self->{in_subset}) {
2336    
2337     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2338     } else {
2339    
2340     $self->{state} = DATA_STATE;
2341     $self->{s_kwd} = '';
2342     }
2343 wakaba 1.1
2344     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2345     $self->{line_prev} = $self->{line};
2346     $self->{column_prev} = $self->{column};
2347     $self->{column}++;
2348     $self->{nc}
2349     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2350     } else {
2351     $self->{set_nc}->($self);
2352     }
2353    
2354    
2355     return ($self->{ct}); # comment
2356     redo A;
2357     } elsif ($self->{nc} == -1) {
2358 wakaba 1.13 if ($self->{in_subset}) {
2359    
2360     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2361     } else {
2362    
2363     $self->{state} = DATA_STATE;
2364     $self->{s_kwd} = '';
2365     }
2366 wakaba 1.1 ## reconsume
2367    
2368     return ($self->{ct}); # comment
2369     redo A;
2370     } else {
2371    
2372     $self->{ct}->{data} .= chr ($self->{nc}); # comment
2373     $self->{read_until}->($self->{ct}->{data},
2374     q[>],
2375     length $self->{ct}->{data});
2376    
2377     ## Stay in the state.
2378    
2379     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2380     $self->{line_prev} = $self->{line};
2381     $self->{column_prev} = $self->{column};
2382     $self->{column}++;
2383     $self->{nc}
2384     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2385     } else {
2386     $self->{set_nc}->($self);
2387     }
2388    
2389     redo A;
2390     }
2391     } elsif ($self->{state} == MARKUP_DECLARATION_OPEN_STATE) {
2392 wakaba 1.14 ## XML5: "Markup declaration state".
2393 wakaba 1.1
2394     if ($self->{nc} == 0x002D) { # -
2395    
2396     $self->{state} = MD_HYPHEN_STATE;
2397    
2398     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2399     $self->{line_prev} = $self->{line};
2400     $self->{column_prev} = $self->{column};
2401     $self->{column}++;
2402     $self->{nc}
2403     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2404     } else {
2405     $self->{set_nc}->($self);
2406     }
2407    
2408     redo A;
2409     } elsif ($self->{nc} == 0x0044 or # D
2410     $self->{nc} == 0x0064) { # d
2411     ## ASCII case-insensitive.
2412    
2413     $self->{state} = MD_DOCTYPE_STATE;
2414 wakaba 1.12 $self->{kwd} = chr $self->{nc};
2415 wakaba 1.1
2416     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2417     $self->{line_prev} = $self->{line};
2418     $self->{column_prev} = $self->{column};
2419     $self->{column}++;
2420     $self->{nc}
2421     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2422     } else {
2423     $self->{set_nc}->($self);
2424     }
2425    
2426     redo A;
2427 wakaba 1.3 } elsif ((($self->{insertion_mode} & IN_FOREIGN_CONTENT_IM and
2428     $self->{open_elements}->[-1]->[1] & FOREIGN_EL) or
2429     $self->{is_xml}) and
2430 wakaba 1.1 $self->{nc} == 0x005B) { # [
2431    
2432     $self->{state} = MD_CDATA_STATE;
2433 wakaba 1.12 $self->{kwd} = '[';
2434 wakaba 1.1
2435     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2436     $self->{line_prev} = $self->{line};
2437     $self->{column_prev} = $self->{column};
2438     $self->{column}++;
2439     $self->{nc}
2440     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2441     } else {
2442     $self->{set_nc}->($self);
2443     }
2444    
2445     redo A;
2446     } else {
2447    
2448     }
2449    
2450     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
2451     line => $self->{line_prev},
2452     column => $self->{column_prev} - 1);
2453     ## Reconsume.
2454     $self->{state} = BOGUS_COMMENT_STATE;
2455     $self->{ct} = {type => COMMENT_TOKEN, data => '',
2456     line => $self->{line_prev},
2457     column => $self->{column_prev} - 1,
2458     };
2459     redo A;
2460     } elsif ($self->{state} == MD_HYPHEN_STATE) {
2461     if ($self->{nc} == 0x002D) { # -
2462    
2463     $self->{ct} = {type => COMMENT_TOKEN, data => '',
2464     line => $self->{line_prev},
2465     column => $self->{column_prev} - 2,
2466     };
2467 wakaba 1.10 $self->{state} = COMMENT_START_STATE; ## XML5: "comment state".
2468 wakaba 1.1
2469     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2470     $self->{line_prev} = $self->{line};
2471     $self->{column_prev} = $self->{column};
2472     $self->{column}++;
2473     $self->{nc}
2474     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2475     } else {
2476     $self->{set_nc}->($self);
2477     }
2478    
2479     redo A;
2480     } else {
2481    
2482     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
2483     line => $self->{line_prev},
2484     column => $self->{column_prev} - 2);
2485     $self->{state} = BOGUS_COMMENT_STATE;
2486     ## Reconsume.
2487     $self->{ct} = {type => COMMENT_TOKEN,
2488     data => '-',
2489     line => $self->{line_prev},
2490     column => $self->{column_prev} - 2,
2491     };
2492     redo A;
2493     }
2494     } elsif ($self->{state} == MD_DOCTYPE_STATE) {
2495     ## ASCII case-insensitive.
2496     if ($self->{nc} == [
2497     undef,
2498     0x004F, # O
2499     0x0043, # C
2500     0x0054, # T
2501     0x0059, # Y
2502     0x0050, # P
2503 wakaba 1.12 ]->[length $self->{kwd}] or
2504 wakaba 1.1 $self->{nc} == [
2505     undef,
2506     0x006F, # o
2507     0x0063, # c
2508     0x0074, # t
2509     0x0079, # y
2510     0x0070, # p
2511 wakaba 1.12 ]->[length $self->{kwd}]) {
2512 wakaba 1.1
2513     ## Stay in the state.
2514 wakaba 1.12 $self->{kwd} .= chr $self->{nc};
2515 wakaba 1.1
2516     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2517     $self->{line_prev} = $self->{line};
2518     $self->{column_prev} = $self->{column};
2519     $self->{column}++;
2520     $self->{nc}
2521     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2522     } else {
2523     $self->{set_nc}->($self);
2524     }
2525    
2526     redo A;
2527 wakaba 1.12 } elsif ((length $self->{kwd}) == 6 and
2528 wakaba 1.1 ($self->{nc} == 0x0045 or # E
2529     $self->{nc} == 0x0065)) { # e
2530 wakaba 1.12 if ($self->{is_xml} and
2531     ($self->{kwd} ne 'DOCTYP' or $self->{nc} == 0x0065)) {
2532 wakaba 1.10
2533     ## XML5: case-sensitive.
2534     $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO
2535     text => 'DOCTYPE',
2536     line => $self->{line_prev},
2537     column => $self->{column_prev} - 5);
2538     } else {
2539    
2540     }
2541 wakaba 1.1 $self->{state} = DOCTYPE_STATE;
2542     $self->{ct} = {type => DOCTYPE_TOKEN,
2543     quirks => 1,
2544     line => $self->{line_prev},
2545     column => $self->{column_prev} - 7,
2546     };
2547    
2548     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2549     $self->{line_prev} = $self->{line};
2550     $self->{column_prev} = $self->{column};
2551     $self->{column}++;
2552     $self->{nc}
2553     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2554     } else {
2555     $self->{set_nc}->($self);
2556     }
2557    
2558     redo A;
2559     } else {
2560    
2561     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
2562     line => $self->{line_prev},
2563 wakaba 1.12 column => $self->{column_prev} - 1 - length $self->{kwd});
2564 wakaba 1.1 $self->{state} = BOGUS_COMMENT_STATE;
2565     ## Reconsume.
2566     $self->{ct} = {type => COMMENT_TOKEN,
2567 wakaba 1.12 data => $self->{kwd},
2568 wakaba 1.1 line => $self->{line_prev},
2569 wakaba 1.12 column => $self->{column_prev} - 1 - length $self->{kwd},
2570 wakaba 1.1 };
2571     redo A;
2572     }
2573     } elsif ($self->{state} == MD_CDATA_STATE) {
2574     if ($self->{nc} == {
2575     '[' => 0x0043, # C
2576     '[C' => 0x0044, # D
2577     '[CD' => 0x0041, # A
2578     '[CDA' => 0x0054, # T
2579     '[CDAT' => 0x0041, # A
2580 wakaba 1.12 }->{$self->{kwd}}) {
2581 wakaba 1.1
2582     ## Stay in the state.
2583 wakaba 1.12 $self->{kwd} .= chr $self->{nc};
2584 wakaba 1.1
2585     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2586     $self->{line_prev} = $self->{line};
2587     $self->{column_prev} = $self->{column};
2588     $self->{column}++;
2589     $self->{nc}
2590     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2591     } else {
2592     $self->{set_nc}->($self);
2593     }
2594    
2595     redo A;
2596 wakaba 1.12 } elsif ($self->{kwd} eq '[CDATA' and
2597 wakaba 1.1 $self->{nc} == 0x005B) { # [
2598 wakaba 1.6 if ($self->{is_xml} and
2599     not $self->{tainted} and
2600     @{$self->{open_elements} or []} == 0) {
2601 wakaba 1.8
2602 wakaba 1.6 $self->{parse_error}->(level => $self->{level}->{must}, type => 'cdata outside of root element',
2603     line => $self->{line_prev},
2604     column => $self->{column_prev} - 7);
2605     $self->{tainted} = 1;
2606 wakaba 1.8 } else {
2607    
2608 wakaba 1.6 }
2609    
2610 wakaba 1.1 $self->{ct} = {type => CHARACTER_TOKEN,
2611     data => '',
2612     line => $self->{line_prev},
2613     column => $self->{column_prev} - 7};
2614     $self->{state} = CDATA_SECTION_STATE;
2615    
2616     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2617     $self->{line_prev} = $self->{line};
2618     $self->{column_prev} = $self->{column};
2619     $self->{column}++;
2620     $self->{nc}
2621     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2622     } else {
2623     $self->{set_nc}->($self);
2624     }
2625    
2626     redo A;
2627     } else {
2628    
2629     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
2630     line => $self->{line_prev},
2631 wakaba 1.12 column => $self->{column_prev} - 1 - length $self->{kwd});
2632 wakaba 1.1 $self->{state} = BOGUS_COMMENT_STATE;
2633     ## Reconsume.
2634     $self->{ct} = {type => COMMENT_TOKEN,
2635 wakaba 1.12 data => $self->{kwd},
2636 wakaba 1.1 line => $self->{line_prev},
2637 wakaba 1.12 column => $self->{column_prev} - 1 - length $self->{kwd},
2638 wakaba 1.1 };
2639     redo A;
2640     }
2641     } elsif ($self->{state} == COMMENT_START_STATE) {
2642     if ($self->{nc} == 0x002D) { # -
2643    
2644     $self->{state} = COMMENT_START_DASH_STATE;
2645    
2646     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2647     $self->{line_prev} = $self->{line};
2648     $self->{column_prev} = $self->{column};
2649     $self->{column}++;
2650     $self->{nc}
2651     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2652     } else {
2653     $self->{set_nc}->($self);
2654     }
2655    
2656     redo A;
2657     } elsif ($self->{nc} == 0x003E) { # >
2658     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment');
2659 wakaba 1.13 if ($self->{in_subset}) {
2660    
2661     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2662     } else {
2663    
2664     $self->{state} = DATA_STATE;
2665     $self->{s_kwd} = '';
2666     }
2667 wakaba 1.1
2668     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2669     $self->{line_prev} = $self->{line};
2670     $self->{column_prev} = $self->{column};
2671     $self->{column}++;
2672     $self->{nc}
2673     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2674     } else {
2675     $self->{set_nc}->($self);
2676     }
2677    
2678    
2679     return ($self->{ct}); # comment
2680    
2681     redo A;
2682     } elsif ($self->{nc} == -1) {
2683     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2684 wakaba 1.13 if ($self->{in_subset}) {
2685    
2686     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2687     } else {
2688    
2689     $self->{state} = DATA_STATE;
2690     $self->{s_kwd} = '';
2691     }
2692 wakaba 1.1 ## reconsume
2693    
2694     return ($self->{ct}); # comment
2695    
2696     redo A;
2697     } else {
2698    
2699     $self->{ct}->{data} # comment
2700     .= chr ($self->{nc});
2701     $self->{state} = COMMENT_STATE;
2702    
2703     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2704     $self->{line_prev} = $self->{line};
2705     $self->{column_prev} = $self->{column};
2706     $self->{column}++;
2707     $self->{nc}
2708     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2709     } else {
2710     $self->{set_nc}->($self);
2711     }
2712    
2713     redo A;
2714     }
2715     } elsif ($self->{state} == COMMENT_START_DASH_STATE) {
2716     if ($self->{nc} == 0x002D) { # -
2717    
2718     $self->{state} = COMMENT_END_STATE;
2719    
2720     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2721     $self->{line_prev} = $self->{line};
2722     $self->{column_prev} = $self->{column};
2723     $self->{column}++;
2724     $self->{nc}
2725     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2726     } else {
2727     $self->{set_nc}->($self);
2728     }
2729    
2730     redo A;
2731     } elsif ($self->{nc} == 0x003E) { # >
2732     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment');
2733 wakaba 1.13 if ($self->{in_subset}) {
2734    
2735     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2736     } else {
2737    
2738     $self->{state} = DATA_STATE;
2739     $self->{s_kwd} = '';
2740     }
2741 wakaba 1.1
2742     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2743     $self->{line_prev} = $self->{line};
2744     $self->{column_prev} = $self->{column};
2745     $self->{column}++;
2746     $self->{nc}
2747     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2748     } else {
2749     $self->{set_nc}->($self);
2750     }
2751    
2752    
2753     return ($self->{ct}); # comment
2754    
2755     redo A;
2756     } elsif ($self->{nc} == -1) {
2757     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2758 wakaba 1.13 if ($self->{in_subset}) {
2759    
2760     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2761     } else {
2762    
2763     $self->{state} = DATA_STATE;
2764     $self->{s_kwd} = '';
2765     }
2766 wakaba 1.1 ## reconsume
2767    
2768     return ($self->{ct}); # comment
2769    
2770     redo A;
2771     } else {
2772    
2773     $self->{ct}->{data} # comment
2774     .= '-' . chr ($self->{nc});
2775     $self->{state} = COMMENT_STATE;
2776    
2777     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2778     $self->{line_prev} = $self->{line};
2779     $self->{column_prev} = $self->{column};
2780     $self->{column}++;
2781     $self->{nc}
2782     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2783     } else {
2784     $self->{set_nc}->($self);
2785     }
2786    
2787     redo A;
2788     }
2789     } elsif ($self->{state} == COMMENT_STATE) {
2790 wakaba 1.14 ## XML5: "Comment state" and "DOCTYPE comment state".
2791    
2792 wakaba 1.1 if ($self->{nc} == 0x002D) { # -
2793    
2794     $self->{state} = COMMENT_END_DASH_STATE;
2795    
2796     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2797     $self->{line_prev} = $self->{line};
2798     $self->{column_prev} = $self->{column};
2799     $self->{column}++;
2800     $self->{nc}
2801     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2802     } else {
2803     $self->{set_nc}->($self);
2804     }
2805    
2806     redo A;
2807     } elsif ($self->{nc} == -1) {
2808     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2809 wakaba 1.13 if ($self->{in_subset}) {
2810    
2811     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2812     } else {
2813    
2814     $self->{state} = DATA_STATE;
2815     $self->{s_kwd} = '';
2816     }
2817 wakaba 1.1 ## reconsume
2818    
2819     return ($self->{ct}); # comment
2820    
2821     redo A;
2822     } else {
2823    
2824     $self->{ct}->{data} .= chr ($self->{nc}); # comment
2825     $self->{read_until}->($self->{ct}->{data},
2826     q[-],
2827     length $self->{ct}->{data});
2828    
2829     ## Stay in the state
2830    
2831     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2832     $self->{line_prev} = $self->{line};
2833     $self->{column_prev} = $self->{column};
2834     $self->{column}++;
2835     $self->{nc}
2836     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2837     } else {
2838     $self->{set_nc}->($self);
2839     }
2840    
2841     redo A;
2842     }
2843     } elsif ($self->{state} == COMMENT_END_DASH_STATE) {
2844 wakaba 1.14 ## XML5: "Comment dash state" and "DOCTYPE comment dash state".
2845 wakaba 1.10
2846 wakaba 1.1 if ($self->{nc} == 0x002D) { # -
2847    
2848     $self->{state} = COMMENT_END_STATE;
2849    
2850     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2851     $self->{line_prev} = $self->{line};
2852     $self->{column_prev} = $self->{column};
2853     $self->{column}++;
2854     $self->{nc}
2855     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2856     } else {
2857     $self->{set_nc}->($self);
2858     }
2859    
2860     redo A;
2861     } elsif ($self->{nc} == -1) {
2862     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2863 wakaba 1.13 if ($self->{in_subset}) {
2864    
2865     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2866     } else {
2867    
2868     $self->{state} = DATA_STATE;
2869     $self->{s_kwd} = '';
2870     }
2871 wakaba 1.1 ## reconsume
2872    
2873     return ($self->{ct}); # comment
2874    
2875     redo A;
2876     } else {
2877    
2878     $self->{ct}->{data} .= '-' . chr ($self->{nc}); # comment
2879     $self->{state} = COMMENT_STATE;
2880    
2881     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2882     $self->{line_prev} = $self->{line};
2883     $self->{column_prev} = $self->{column};
2884     $self->{column}++;
2885     $self->{nc}
2886     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2887     } else {
2888     $self->{set_nc}->($self);
2889     }
2890    
2891     redo A;
2892     }
2893     } elsif ($self->{state} == COMMENT_END_STATE) {
2894 wakaba 1.14 ## XML5: "Comment end state" and "DOCTYPE comment end state".
2895    
2896 wakaba 1.1 if ($self->{nc} == 0x003E) { # >
2897 wakaba 1.13 if ($self->{in_subset}) {
2898    
2899     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2900     } else {
2901    
2902     $self->{state} = DATA_STATE;
2903     $self->{s_kwd} = '';
2904     }
2905 wakaba 1.1
2906     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2907     $self->{line_prev} = $self->{line};
2908     $self->{column_prev} = $self->{column};
2909     $self->{column}++;
2910     $self->{nc}
2911     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2912     } else {
2913     $self->{set_nc}->($self);
2914     }
2915    
2916    
2917     return ($self->{ct}); # comment
2918    
2919     redo A;
2920     } elsif ($self->{nc} == 0x002D) { # -
2921    
2922 wakaba 1.10 ## XML5: Not a parse error.
2923 wakaba 1.1 $self->{parse_error}->(level => $self->{level}->{must}, type => 'dash in comment',
2924     line => $self->{line_prev},
2925     column => $self->{column_prev});
2926     $self->{ct}->{data} .= '-'; # comment
2927     ## Stay in the state
2928    
2929     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2930     $self->{line_prev} = $self->{line};
2931     $self->{column_prev} = $self->{column};
2932     $self->{column}++;
2933     $self->{nc}
2934     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2935     } else {
2936     $self->{set_nc}->($self);
2937     }
2938    
2939     redo A;
2940     } elsif ($self->{nc} == -1) {
2941     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2942 wakaba 1.13 if ($self->{in_subset}) {
2943    
2944     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2945     } else {
2946    
2947     $self->{state} = DATA_STATE;
2948     $self->{s_kwd} = '';
2949     }
2950 wakaba 1.1 ## reconsume
2951    
2952     return ($self->{ct}); # comment
2953    
2954     redo A;
2955     } else {
2956    
2957 wakaba 1.10 ## XML5: Not a parse error.
2958 wakaba 1.1 $self->{parse_error}->(level => $self->{level}->{must}, type => 'dash in comment',
2959     line => $self->{line_prev},
2960     column => $self->{column_prev});
2961     $self->{ct}->{data} .= '--' . chr ($self->{nc}); # comment
2962     $self->{state} = COMMENT_STATE;
2963    
2964     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2965     $self->{line_prev} = $self->{line};
2966     $self->{column_prev} = $self->{column};
2967     $self->{column}++;
2968     $self->{nc}
2969     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2970     } else {
2971     $self->{set_nc}->($self);
2972     }
2973    
2974     redo A;
2975     }
2976     } elsif ($self->{state} == DOCTYPE_STATE) {
2977     if ($is_space->{$self->{nc}}) {
2978    
2979     $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
2980    
2981     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2982     $self->{line_prev} = $self->{line};
2983     $self->{column_prev} = $self->{column};
2984     $self->{column}++;
2985     $self->{nc}
2986     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2987     } else {
2988     $self->{set_nc}->($self);
2989     }
2990    
2991     redo A;
2992     } else {
2993    
2994 wakaba 1.12 ## XML5: Unless EOF, switch to the bogus comment state.
2995 wakaba 1.1 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before DOCTYPE name');
2996     $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
2997     ## reconsume
2998     redo A;
2999     }
3000     } elsif ($self->{state} == BEFORE_DOCTYPE_NAME_STATE) {
3001 wakaba 1.12 ## XML5: "DOCTYPE root name before state".
3002    
3003 wakaba 1.1 if ($is_space->{$self->{nc}}) {
3004    
3005     ## Stay in the state
3006    
3007     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3008     $self->{line_prev} = $self->{line};
3009     $self->{column_prev} = $self->{column};
3010     $self->{column}++;
3011     $self->{nc}
3012     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3013     } else {
3014     $self->{set_nc}->($self);
3015     }
3016    
3017     redo A;
3018     } elsif ($self->{nc} == 0x003E) { # >
3019    
3020 wakaba 1.12 ## XML5: No parse error.
3021 wakaba 1.1 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no DOCTYPE name');
3022     $self->{state} = DATA_STATE;
3023 wakaba 1.5 $self->{s_kwd} = '';
3024 wakaba 1.1
3025     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3026     $self->{line_prev} = $self->{line};
3027     $self->{column_prev} = $self->{column};
3028     $self->{column}++;
3029     $self->{nc}
3030     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3031     } else {
3032     $self->{set_nc}->($self);
3033     }
3034    
3035    
3036     return ($self->{ct}); # DOCTYPE (quirks)
3037    
3038     redo A;
3039     } elsif ($self->{nc} == -1) {
3040    
3041     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no DOCTYPE name');
3042     $self->{state} = DATA_STATE;
3043 wakaba 1.5 $self->{s_kwd} = '';
3044 wakaba 1.1 ## reconsume
3045    
3046     return ($self->{ct}); # DOCTYPE (quirks)
3047    
3048     redo A;
3049 wakaba 1.12 } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
3050    
3051     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no DOCTYPE name');
3052     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3053 wakaba 1.13 $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
3054     $self->{in_subset} = 1;
3055 wakaba 1.12
3056     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3057     $self->{line_prev} = $self->{line};
3058     $self->{column_prev} = $self->{column};
3059     $self->{column}++;
3060     $self->{nc}
3061     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3062     } else {
3063     $self->{set_nc}->($self);
3064     }
3065    
3066 wakaba 1.13 return ($self->{ct}); # DOCTYPE
3067 wakaba 1.12 redo A;
3068 wakaba 1.1 } else {
3069    
3070     $self->{ct}->{name} = chr $self->{nc};
3071     delete $self->{ct}->{quirks};
3072     $self->{state} = DOCTYPE_NAME_STATE;
3073    
3074     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3075     $self->{line_prev} = $self->{line};
3076     $self->{column_prev} = $self->{column};
3077     $self->{column}++;
3078     $self->{nc}
3079     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3080     } else {
3081     $self->{set_nc}->($self);
3082     }
3083    
3084     redo A;
3085     }
3086     } elsif ($self->{state} == DOCTYPE_NAME_STATE) {
3087 wakaba 1.12 ## XML5: "DOCTYPE root name state".
3088    
3089     ## ISSUE: Redundant "First," in the spec.
3090    
3091 wakaba 1.1 if ($is_space->{$self->{nc}}) {
3092    
3093     $self->{state} = AFTER_DOCTYPE_NAME_STATE;
3094    
3095     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3096     $self->{line_prev} = $self->{line};
3097     $self->{column_prev} = $self->{column};
3098     $self->{column}++;
3099     $self->{nc}
3100     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3101     } else {
3102     $self->{set_nc}->($self);
3103     }
3104    
3105     redo A;
3106     } elsif ($self->{nc} == 0x003E) { # >
3107    
3108     $self->{state} = DATA_STATE;
3109 wakaba 1.5 $self->{s_kwd} = '';
3110 wakaba 1.1
3111     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3112     $self->{line_prev} = $self->{line};
3113     $self->{column_prev} = $self->{column};
3114     $self->{column}++;
3115     $self->{nc}
3116     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3117     } else {
3118     $self->{set_nc}->($self);
3119     }
3120    
3121    
3122     return ($self->{ct}); # DOCTYPE
3123    
3124     redo A;
3125     } elsif ($self->{nc} == -1) {
3126    
3127     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
3128     $self->{state} = DATA_STATE;
3129 wakaba 1.5 $self->{s_kwd} = '';
3130 wakaba 1.1 ## reconsume
3131    
3132     $self->{ct}->{quirks} = 1;
3133     return ($self->{ct}); # DOCTYPE
3134    
3135     redo A;
3136 wakaba 1.12 } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
3137    
3138     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3139 wakaba 1.13 $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
3140     $self->{in_subset} = 1;
3141 wakaba 1.12
3142     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3143     $self->{line_prev} = $self->{line};
3144     $self->{column_prev} = $self->{column};
3145     $self->{column}++;
3146     $self->{nc}
3147     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3148     } else {
3149     $self->{set_nc}->($self);
3150     }
3151    
3152 wakaba 1.13 return ($self->{ct}); # DOCTYPE
3153 wakaba 1.12 redo A;
3154 wakaba 1.1 } else {
3155    
3156     $self->{ct}->{name}
3157     .= chr ($self->{nc}); # DOCTYPE
3158     ## Stay in the state
3159    
3160     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3161     $self->{line_prev} = $self->{line};
3162     $self->{column_prev} = $self->{column};
3163     $self->{column}++;
3164     $self->{nc}
3165     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3166     } else {
3167     $self->{set_nc}->($self);
3168     }
3169    
3170     redo A;
3171     }
3172     } elsif ($self->{state} == AFTER_DOCTYPE_NAME_STATE) {
3173 wakaba 1.12 ## XML5: Corresponding to XML5's "DOCTYPE root name after
3174     ## state", but implemented differently.
3175    
3176 wakaba 1.1 if ($is_space->{$self->{nc}}) {
3177    
3178     ## Stay in the state
3179    
3180     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3181     $self->{line_prev} = $self->{line};
3182     $self->{column_prev} = $self->{column};
3183     $self->{column}++;
3184     $self->{nc}
3185     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3186     } else {
3187     $self->{set_nc}->($self);
3188     }
3189    
3190     redo A;
3191     } elsif ($self->{nc} == 0x003E) { # >
3192 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3193    
3194     $self->{state} = DATA_STATE;
3195     $self->{s_kwd} = '';
3196     } else {
3197    
3198     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no md def'); ## TODO: type
3199     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3200     }
3201 wakaba 1.1
3202    
3203     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3204     $self->{line_prev} = $self->{line};
3205     $self->{column_prev} = $self->{column};
3206     $self->{column}++;
3207     $self->{nc}
3208     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3209     } else {
3210     $self->{set_nc}->($self);
3211     }
3212    
3213 wakaba 1.16 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
3214 wakaba 1.1 redo A;
3215     } elsif ($self->{nc} == -1) {
3216 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3217    
3218     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
3219     $self->{state} = DATA_STATE;
3220     $self->{s_kwd} = '';
3221     $self->{ct}->{quirks} = 1;
3222     } else {
3223    
3224     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
3225     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3226     }
3227 wakaba 1.1
3228 wakaba 1.16 ## Reconsume.
3229     return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
3230 wakaba 1.1 redo A;
3231     } elsif ($self->{nc} == 0x0050 or # P
3232     $self->{nc} == 0x0070) { # p
3233 wakaba 1.12
3234 wakaba 1.1 $self->{state} = PUBLIC_STATE;
3235 wakaba 1.12 $self->{kwd} = chr $self->{nc};
3236 wakaba 1.1
3237     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3238     $self->{line_prev} = $self->{line};
3239     $self->{column_prev} = $self->{column};
3240     $self->{column}++;
3241     $self->{nc}
3242     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3243     } else {
3244     $self->{set_nc}->($self);
3245     }
3246    
3247     redo A;
3248     } elsif ($self->{nc} == 0x0053 or # S
3249     $self->{nc} == 0x0073) { # s
3250 wakaba 1.12
3251 wakaba 1.1 $self->{state} = SYSTEM_STATE;
3252 wakaba 1.12 $self->{kwd} = chr $self->{nc};
3253    
3254     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3255     $self->{line_prev} = $self->{line};
3256     $self->{column_prev} = $self->{column};
3257     $self->{column}++;
3258     $self->{nc}
3259     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3260     } else {
3261     $self->{set_nc}->($self);
3262     }
3263    
3264     redo A;
3265 wakaba 1.16 ## TODO: " and ' for ENTITY
3266     } elsif ($self->{is_xml} and
3267     $self->{ct}->{type} == DOCTYPE_TOKEN and
3268     $self->{nc} == 0x005B) { # [
3269 wakaba 1.12
3270     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3271     $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
3272 wakaba 1.13 $self->{in_subset} = 1;
3273 wakaba 1.1
3274     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3275     $self->{line_prev} = $self->{line};
3276     $self->{column_prev} = $self->{column};
3277     $self->{column}++;
3278     $self->{nc}
3279     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3280     } else {
3281     $self->{set_nc}->($self);
3282     }
3283    
3284 wakaba 1.13 return ($self->{ct}); # DOCTYPE
3285 wakaba 1.1 redo A;
3286     } else {
3287 wakaba 1.16 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after DOCTYPE name'); ## TODO: type
3288    
3289     if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3290    
3291     $self->{ct}->{quirks} = 1;
3292     $self->{state} = BOGUS_DOCTYPE_STATE;
3293     } else {
3294    
3295     $self->{state} = BOGUS_MD_STATE;
3296     }
3297 wakaba 1.1
3298    
3299     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3300     $self->{line_prev} = $self->{line};
3301     $self->{column_prev} = $self->{column};
3302     $self->{column}++;
3303     $self->{nc}
3304     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3305     } else {
3306     $self->{set_nc}->($self);
3307     }
3308    
3309     redo A;
3310     }
3311     } elsif ($self->{state} == PUBLIC_STATE) {
3312     ## ASCII case-insensitive
3313     if ($self->{nc} == [
3314     undef,
3315     0x0055, # U
3316     0x0042, # B
3317     0x004C, # L
3318     0x0049, # I
3319 wakaba 1.12 ]->[length $self->{kwd}] or
3320 wakaba 1.1 $self->{nc} == [
3321     undef,
3322     0x0075, # u
3323     0x0062, # b
3324     0x006C, # l
3325     0x0069, # i
3326 wakaba 1.12 ]->[length $self->{kwd}]) {
3327 wakaba 1.1
3328     ## Stay in the state.
3329 wakaba 1.12 $self->{kwd} .= chr $self->{nc};
3330 wakaba 1.1
3331     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3332     $self->{line_prev} = $self->{line};
3333     $self->{column_prev} = $self->{column};
3334     $self->{column}++;
3335     $self->{nc}
3336     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3337     } else {
3338     $self->{set_nc}->($self);
3339     }
3340    
3341     redo A;
3342 wakaba 1.12 } elsif ((length $self->{kwd}) == 5 and
3343 wakaba 1.1 ($self->{nc} == 0x0043 or # C
3344     $self->{nc} == 0x0063)) { # c
3345 wakaba 1.12 if ($self->{is_xml} and
3346     ($self->{kwd} ne 'PUBLI' or $self->{nc} == 0x0063)) { # c
3347    
3348     $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
3349     text => 'PUBLIC',
3350     line => $self->{line_prev},
3351     column => $self->{column_prev} - 4);
3352     } else {
3353    
3354     }
3355 wakaba 1.1 $self->{state} = BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
3356    
3357     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3358     $self->{line_prev} = $self->{line};
3359     $self->{column_prev} = $self->{column};
3360     $self->{column}++;
3361     $self->{nc}
3362     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3363     } else {
3364     $self->{set_nc}->($self);
3365     }
3366    
3367     redo A;
3368     } else {
3369 wakaba 1.16 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after DOCTYPE name', ## TODO: type
3370 wakaba 1.1 line => $self->{line_prev},
3371 wakaba 1.12 column => $self->{column_prev} + 1 - length $self->{kwd});
3372 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3373    
3374     $self->{ct}->{quirks} = 1;
3375     $self->{state} = BOGUS_DOCTYPE_STATE;
3376     } else {
3377    
3378     $self->{state} = BOGUS_MD_STATE;
3379     }
3380 wakaba 1.1 ## Reconsume.
3381     redo A;
3382     }
3383     } elsif ($self->{state} == SYSTEM_STATE) {
3384     ## ASCII case-insensitive
3385     if ($self->{nc} == [
3386     undef,
3387     0x0059, # Y
3388     0x0053, # S
3389     0x0054, # T
3390     0x0045, # E
3391 wakaba 1.12 ]->[length $self->{kwd}] or
3392 wakaba 1.1 $self->{nc} == [
3393     undef,
3394     0x0079, # y
3395     0x0073, # s
3396     0x0074, # t
3397     0x0065, # e
3398 wakaba 1.12 ]->[length $self->{kwd}]) {
3399 wakaba 1.1
3400     ## Stay in the state.
3401 wakaba 1.12 $self->{kwd} .= chr $self->{nc};
3402 wakaba 1.1
3403     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3404     $self->{line_prev} = $self->{line};
3405     $self->{column_prev} = $self->{column};
3406     $self->{column}++;
3407     $self->{nc}
3408     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3409     } else {
3410     $self->{set_nc}->($self);
3411     }
3412    
3413     redo A;
3414 wakaba 1.12 } elsif ((length $self->{kwd}) == 5 and
3415 wakaba 1.1 ($self->{nc} == 0x004D or # M
3416     $self->{nc} == 0x006D)) { # m
3417 wakaba 1.12 if ($self->{is_xml} and
3418     ($self->{kwd} ne 'SYSTE' or $self->{nc} == 0x006D)) { # m
3419    
3420     $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
3421     text => 'SYSTEM',
3422     line => $self->{line_prev},
3423     column => $self->{column_prev} - 4);
3424     } else {
3425    
3426     }
3427 wakaba 1.1 $self->{state} = BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
3428    
3429     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3430     $self->{line_prev} = $self->{line};
3431     $self->{column_prev} = $self->{column};
3432     $self->{column}++;
3433     $self->{nc}
3434     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3435     } else {
3436     $self->{set_nc}->($self);
3437     }
3438    
3439     redo A;
3440     } else {
3441 wakaba 1.16 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after DOCTYPE name', ## TODO: type
3442 wakaba 1.1 line => $self->{line_prev},
3443 wakaba 1.12 column => $self->{column_prev} + 1 - length $self->{kwd});
3444 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3445    
3446     $self->{ct}->{quirks} = 1;
3447     $self->{state} = BOGUS_DOCTYPE_STATE;
3448     } else {
3449    
3450     $self->{state} = BOGUS_MD_STATE;
3451     }
3452 wakaba 1.1 ## Reconsume.
3453     redo A;
3454     }
3455     } elsif ($self->{state} == BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE) {
3456     if ($is_space->{$self->{nc}}) {
3457    
3458     ## Stay in the state
3459    
3460     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3461     $self->{line_prev} = $self->{line};
3462     $self->{column_prev} = $self->{column};
3463     $self->{column}++;
3464     $self->{nc}
3465     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3466     } else {
3467     $self->{set_nc}->($self);
3468     }
3469    
3470     redo A;
3471     } elsif ($self->{nc} eq 0x0022) { # "
3472    
3473     $self->{ct}->{pubid} = ''; # DOCTYPE
3474     $self->{state} = DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE;
3475    
3476     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3477     $self->{line_prev} = $self->{line};
3478     $self->{column_prev} = $self->{column};
3479     $self->{column}++;
3480     $self->{nc}
3481     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3482     } else {
3483     $self->{set_nc}->($self);
3484     }
3485    
3486     redo A;
3487     } elsif ($self->{nc} eq 0x0027) { # '
3488    
3489     $self->{ct}->{pubid} = ''; # DOCTYPE
3490     $self->{state} = DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE;
3491    
3492     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3493     $self->{line_prev} = $self->{line};
3494     $self->{column_prev} = $self->{column};
3495     $self->{column}++;
3496     $self->{nc}
3497     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3498     } else {
3499     $self->{set_nc}->($self);
3500     }
3501    
3502     redo A;
3503     } elsif ($self->{nc} eq 0x003E) { # >
3504 wakaba 1.16 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no PUBLIC literal');
3505    
3506     if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3507    
3508     $self->{state} = DATA_STATE;
3509     $self->{s_kwd} = '';
3510     $self->{ct}->{quirks} = 1;
3511     } else {
3512    
3513     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3514     }
3515 wakaba 1.1
3516    
3517     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3518     $self->{line_prev} = $self->{line};
3519     $self->{column_prev} = $self->{column};
3520     $self->{column}++;
3521     $self->{nc}
3522     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3523     } else {
3524     $self->{set_nc}->($self);
3525     }
3526    
3527 wakaba 1.16 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
3528 wakaba 1.1 redo A;
3529     } elsif ($self->{nc} == -1) {
3530 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3531    
3532     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
3533     $self->{state} = DATA_STATE;
3534     $self->{s_kwd} = '';
3535     $self->{ct}->{quirks} = 1;
3536     } else {
3537    
3538     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
3539     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3540     }
3541 wakaba 1.1
3542     ## reconsume
3543     return ($self->{ct}); # DOCTYPE
3544     redo A;
3545 wakaba 1.16 } elsif ($self->{is_xml} and
3546     $self->{ct}->{type} == DOCTYPE_TOKEN and
3547     $self->{nc} == 0x005B) { # [
3548 wakaba 1.12
3549     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no PUBLIC literal');
3550     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3551     $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
3552 wakaba 1.13 $self->{in_subset} = 1;
3553 wakaba 1.12
3554     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3555     $self->{line_prev} = $self->{line};
3556     $self->{column_prev} = $self->{column};
3557     $self->{column}++;
3558     $self->{nc}
3559     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3560     } else {
3561     $self->{set_nc}->($self);
3562     }
3563    
3564 wakaba 1.13 return ($self->{ct}); # DOCTYPE
3565 wakaba 1.12 redo A;
3566 wakaba 1.1 } else {
3567     $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after PUBLIC');
3568    
3569 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3570    
3571     $self->{ct}->{quirks} = 1;
3572     $self->{state} = BOGUS_DOCTYPE_STATE;
3573     } else {
3574    
3575     $self->{state} = BOGUS_MD_STATE;
3576     }
3577    
3578 wakaba 1.1
3579     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3580     $self->{line_prev} = $self->{line};
3581     $self->{column_prev} = $self->{column};
3582     $self->{column}++;
3583     $self->{nc}
3584     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3585     } else {
3586     $self->{set_nc}->($self);
3587     }
3588    
3589     redo A;
3590     }
3591     } elsif ($self->{state} == DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE) {
3592     if ($self->{nc} == 0x0022) { # "
3593    
3594     $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
3595    
3596     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3597     $self->{line_prev} = $self->{line};
3598     $self->{column_prev} = $self->{column};
3599     $self->{column}++;
3600     $self->{nc}
3601     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3602     } else {
3603     $self->{set_nc}->($self);
3604     }
3605    
3606     redo A;
3607     } elsif ($self->{nc} == 0x003E) { # >
3608     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');
3609    
3610 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3611    
3612     $self->{state} = DATA_STATE;
3613     $self->{s_kwd} = '';
3614     $self->{ct}->{quirks} = 1;
3615     } else {
3616    
3617     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3618     }
3619    
3620 wakaba 1.1
3621     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3622     $self->{line_prev} = $self->{line};
3623     $self->{column_prev} = $self->{column};
3624     $self->{column}++;
3625     $self->{nc}
3626     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3627     } else {
3628     $self->{set_nc}->($self);
3629     }
3630    
3631 wakaba 1.16 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
3632 wakaba 1.1 redo A;
3633     } elsif ($self->{nc} == -1) {
3634     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');
3635    
3636 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3637    
3638     $self->{state} = DATA_STATE;
3639     $self->{s_kwd} = '';
3640     $self->{ct}->{quirks} = 1;
3641     } else {
3642    
3643     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3644     }
3645    
3646     ## Reconsume.
3647 wakaba 1.1 return ($self->{ct}); # DOCTYPE
3648     redo A;
3649     } else {
3650    
3651 wakaba 1.16 $self->{ct}->{pubid} .= chr $self->{nc}; # DOCTYPE/ENTITY/NOTATION
3652 wakaba 1.1 $self->{read_until}->($self->{ct}->{pubid}, q[">],
3653     length $self->{ct}->{pubid});
3654    
3655     ## Stay in the state
3656    
3657     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3658     $self->{line_prev} = $self->{line};
3659     $self->{column_prev} = $self->{column};
3660     $self->{column}++;
3661     $self->{nc}
3662     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3663     } else {
3664     $self->{set_nc}->($self);
3665     }
3666    
3667     redo A;
3668     }
3669     } elsif ($self->{state} == DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE) {
3670     if ($self->{nc} == 0x0027) { # '
3671    
3672     $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
3673    
3674     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3675     $self->{line_prev} = $self->{line};
3676     $self->{column_prev} = $self->{column};
3677     $self->{column}++;
3678     $self->{nc}
3679     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3680     } else {
3681     $self->{set_nc}->($self);
3682     }
3683    
3684     redo A;
3685     } elsif ($self->{nc} == 0x003E) { # >
3686     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');
3687    
3688 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3689    
3690     $self->{state} = DATA_STATE;
3691     $self->{s_kwd} = '';
3692     $self->{ct}->{quirks} = 1;
3693     } else {
3694    
3695     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3696     }
3697    
3698 wakaba 1.1
3699     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3700     $self->{line_prev} = $self->{line};
3701     $self->{column_prev} = $self->{column};
3702     $self->{column}++;
3703     $self->{nc}
3704     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3705     } else {
3706     $self->{set_nc}->($self);
3707     }
3708    
3709 wakaba 1.16 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
3710 wakaba 1.1 redo A;
3711     } elsif ($self->{nc} == -1) {
3712     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');
3713    
3714 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3715    
3716     $self->{state} = DATA_STATE;
3717     $self->{s_kwd} = '';
3718     $self->{ct}->{quirks} = 1;
3719     } else {
3720    
3721     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3722     }
3723    
3724 wakaba 1.1 ## reconsume
3725 wakaba 1.16 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
3726 wakaba 1.1 redo A;
3727     } else {
3728    
3729 wakaba 1.16 $self->{ct}->{pubid} .= chr $self->{nc}; # DOCTYPE/ENTITY/NOTATION
3730 wakaba 1.1 $self->{read_until}->($self->{ct}->{pubid}, q['>],
3731     length $self->{ct}->{pubid});
3732    
3733     ## Stay in the state
3734    
3735     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3736     $self->{line_prev} = $self->{line};
3737     $self->{column_prev} = $self->{column};
3738     $self->{column}++;
3739     $self->{nc}
3740     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3741     } else {
3742     $self->{set_nc}->($self);
3743     }
3744    
3745     redo A;
3746     }
3747     } elsif ($self->{state} == AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE) {
3748     if ($is_space->{$self->{nc}}) {
3749    
3750     ## Stay in the state
3751    
3752     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3753     $self->{line_prev} = $self->{line};
3754     $self->{column_prev} = $self->{column};
3755     $self->{column}++;
3756     $self->{nc}
3757     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3758     } else {
3759     $self->{set_nc}->($self);
3760     }
3761    
3762     redo A;
3763     } elsif ($self->{nc} == 0x0022) { # "
3764    
3765 wakaba 1.16 $self->{ct}->{sysid} = ''; # DOCTYPE/ENTITY/NOTATION
3766 wakaba 1.1 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
3767    
3768     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3769     $self->{line_prev} = $self->{line};
3770     $self->{column_prev} = $self->{column};
3771     $self->{column}++;
3772     $self->{nc}
3773     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3774     } else {
3775     $self->{set_nc}->($self);
3776     }
3777    
3778     redo A;
3779     } elsif ($self->{nc} == 0x0027) { # '
3780    
3781 wakaba 1.16 $self->{ct}->{sysid} = ''; # DOCTYPE/ENTITY/NOTATION
3782 wakaba 1.1 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
3783    
3784     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3785     $self->{line_prev} = $self->{line};
3786     $self->{column_prev} = $self->{column};
3787     $self->{column}++;
3788     $self->{nc}
3789     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3790     } else {
3791     $self->{set_nc}->($self);
3792     }
3793    
3794     redo A;
3795     } elsif ($self->{nc} == 0x003E) { # >
3796 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3797     if ($self->{is_xml}) {
3798    
3799     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal');
3800     } else {
3801    
3802     }
3803     $self->{state} = DATA_STATE;
3804     $self->{s_kwd} = '';
3805 wakaba 1.12 } else {
3806 wakaba 1.16 if ($self->{ct}->{type} == NOTATION_TOKEN) {
3807    
3808     } else {
3809    
3810     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal');
3811     }
3812     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3813 wakaba 1.12 }
3814 wakaba 1.16
3815 wakaba 1.1
3816     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3817     $self->{line_prev} = $self->{line};
3818     $self->{column_prev} = $self->{column};
3819     $self->{column}++;
3820     $self->{nc}
3821     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3822     } else {
3823     $self->{set_nc}->($self);
3824     }
3825    
3826 wakaba 1.16 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
3827 wakaba 1.1 redo A;
3828     } elsif ($self->{nc} == -1) {
3829 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3830    
3831     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
3832    
3833     $self->{state} = DATA_STATE;
3834     $self->{s_kwd} = '';
3835     $self->{ct}->{quirks} = 1;
3836     } else {
3837     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
3838     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3839     }
3840 wakaba 1.1
3841     ## reconsume
3842 wakaba 1.16 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
3843 wakaba 1.1 redo A;
3844 wakaba 1.16 } elsif ($self->{is_xml} and
3845     $self->{ct}->{type} == DOCTYPE_TOKEN and
3846     $self->{nc} == 0x005B) { # [
3847 wakaba 1.12
3848     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal');
3849     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3850     $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
3851 wakaba 1.13 $self->{in_subset} = 1;
3852 wakaba 1.12
3853     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3854     $self->{line_prev} = $self->{line};
3855     $self->{column_prev} = $self->{column};
3856     $self->{column}++;
3857     $self->{nc}
3858     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3859     } else {
3860     $self->{set_nc}->($self);
3861     }
3862    
3863 wakaba 1.13 return ($self->{ct}); # DOCTYPE
3864 wakaba 1.12 redo A;
3865 wakaba 1.1 } else {
3866     $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after PUBLIC literal');
3867    
3868 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3869    
3870     $self->{ct}->{quirks} = 1;
3871     $self->{state} = BOGUS_DOCTYPE_STATE;
3872     } else {
3873    
3874     $self->{state} = BOGUS_MD_STATE;
3875     }
3876    
3877 wakaba 1.1
3878     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3879     $self->{line_prev} = $self->{line};
3880     $self->{column_prev} = $self->{column};
3881     $self->{column}++;
3882     $self->{nc}
3883     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3884     } else {
3885     $self->{set_nc}->($self);
3886     }
3887    
3888     redo A;
3889     }
3890     } elsif ($self->{state} == BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
3891     if ($is_space->{$self->{nc}}) {
3892    
3893     ## Stay in the state
3894    
3895     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3896     $self->{line_prev} = $self->{line};
3897     $self->{column_prev} = $self->{column};
3898     $self->{column}++;
3899     $self->{nc}
3900     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3901     } else {
3902     $self->{set_nc}->($self);
3903     }
3904    
3905     redo A;
3906     } elsif ($self->{nc} == 0x0022) { # "
3907    
3908     $self->{ct}->{sysid} = ''; # DOCTYPE
3909     $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
3910    
3911     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3912     $self->{line_prev} = $self->{line};
3913     $self->{column_prev} = $self->{column};
3914     $self->{column}++;
3915     $self->{nc}
3916     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3917     } else {
3918     $self->{set_nc}->($self);
3919     }
3920    
3921     redo A;
3922     } elsif ($self->{nc} == 0x0027) { # '
3923    
3924     $self->{ct}->{sysid} = ''; # DOCTYPE
3925     $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
3926    
3927     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3928     $self->{line_prev} = $self->{line};
3929     $self->{column_prev} = $self->{column};
3930     $self->{column}++;
3931     $self->{nc}
3932     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3933     } else {
3934     $self->{set_nc}->($self);
3935     }
3936    
3937     redo A;
3938     } elsif ($self->{nc} == 0x003E) { # >
3939     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal');
3940    
3941     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3942     $self->{line_prev} = $self->{line};
3943     $self->{column_prev} = $self->{column};
3944     $self->{column}++;
3945     $self->{nc}
3946     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3947     } else {
3948     $self->{set_nc}->($self);
3949     }
3950    
3951    
3952 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3953    
3954     $self->{state} = DATA_STATE;
3955     $self->{s_kwd} = '';
3956     $self->{ct}->{quirks} = 1;
3957     } else {
3958    
3959     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3960     }
3961 wakaba 1.1
3962 wakaba 1.16 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
3963 wakaba 1.1 redo A;
3964     } elsif ($self->{nc} == -1) {
3965 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3966    
3967     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
3968     $self->{state} = DATA_STATE;
3969     $self->{s_kwd} = '';
3970     $self->{ct}->{quirks} = 1;
3971     } else {
3972    
3973     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
3974     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3975     }
3976 wakaba 1.1
3977     ## reconsume
3978 wakaba 1.16 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
3979 wakaba 1.1 redo A;
3980 wakaba 1.16 } elsif ($self->{is_xml} and
3981     $self->{ct}->{type} == DOCTYPE_TOKEN and
3982     $self->{nc} == 0x005B) { # [
3983 wakaba 1.12
3984     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal');
3985    
3986     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3987     $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
3988 wakaba 1.13 $self->{in_subset} = 1;
3989 wakaba 1.12
3990     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3991     $self->{line_prev} = $self->{line};
3992     $self->{column_prev} = $self->{column};
3993     $self->{column}++;
3994     $self->{nc}
3995     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3996     } else {
3997     $self->{set_nc}->($self);
3998     }
3999    
4000 wakaba 1.13 return ($self->{ct}); # DOCTYPE
4001 wakaba 1.12 redo A;
4002 wakaba 1.1 } else {
4003     $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after SYSTEM');
4004    
4005 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4006    
4007     $self->{ct}->{quirks} = 1;
4008     $self->{state} = BOGUS_DOCTYPE_STATE;
4009     } else {
4010    
4011     $self->{state} = BOGUS_MD_STATE;
4012     }
4013    
4014 wakaba 1.1
4015     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4016     $self->{line_prev} = $self->{line};
4017     $self->{column_prev} = $self->{column};
4018     $self->{column}++;
4019     $self->{nc}
4020     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4021     } else {
4022     $self->{set_nc}->($self);
4023     }
4024    
4025     redo A;
4026     }
4027     } elsif ($self->{state} == DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE) {
4028     if ($self->{nc} == 0x0022) { # "
4029    
4030     $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
4031    
4032     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4033     $self->{line_prev} = $self->{line};
4034     $self->{column_prev} = $self->{column};
4035     $self->{column}++;
4036     $self->{nc}
4037     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4038     } else {
4039     $self->{set_nc}->($self);
4040     }
4041    
4042     redo A;
4043 wakaba 1.12 } elsif (not $self->{is_xml} and $self->{nc} == 0x003E) { # >
4044 wakaba 1.1 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');
4045    
4046 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4047    
4048     $self->{state} = DATA_STATE;
4049     $self->{s_kwd} = '';
4050     $self->{ct}->{quirks} = 1;
4051     } else {
4052    
4053     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4054     }
4055    
4056 wakaba 1.1
4057     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4058     $self->{line_prev} = $self->{line};
4059     $self->{column_prev} = $self->{column};
4060     $self->{column}++;
4061     $self->{nc}
4062     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4063     } else {
4064     $self->{set_nc}->($self);
4065     }
4066    
4067 wakaba 1.16 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
4068 wakaba 1.1 redo A;
4069     } elsif ($self->{nc} == -1) {
4070     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');
4071    
4072 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4073    
4074     $self->{state} = DATA_STATE;
4075     $self->{s_kwd} = '';
4076     $self->{ct}->{quirks} = 1;
4077     } else {
4078    
4079     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4080     }
4081    
4082 wakaba 1.1 ## reconsume
4083 wakaba 1.16 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
4084 wakaba 1.1 redo A;
4085     } else {
4086    
4087 wakaba 1.16 $self->{ct}->{sysid} .= chr $self->{nc}; # DOCTYPE/ENTITY/NOTATION
4088 wakaba 1.1 $self->{read_until}->($self->{ct}->{sysid}, q[">],
4089     length $self->{ct}->{sysid});
4090    
4091     ## Stay in the state
4092    
4093     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4094     $self->{line_prev} = $self->{line};
4095     $self->{column_prev} = $self->{column};
4096     $self->{column}++;
4097     $self->{nc}
4098     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4099     } else {
4100     $self->{set_nc}->($self);
4101     }
4102    
4103     redo A;
4104     }
4105     } elsif ($self->{state} == DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE) {
4106     if ($self->{nc} == 0x0027) { # '
4107    
4108     $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
4109    
4110     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4111     $self->{line_prev} = $self->{line};
4112     $self->{column_prev} = $self->{column};
4113     $self->{column}++;
4114     $self->{nc}
4115     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4116     } else {
4117     $self->{set_nc}->($self);
4118     }
4119    
4120     redo A;
4121 wakaba 1.12 } elsif (not $self->{is_xml} and $self->{nc} == 0x003E) { # >
4122 wakaba 1.1
4123     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');
4124    
4125     $self->{state} = DATA_STATE;
4126 wakaba 1.5 $self->{s_kwd} = '';
4127 wakaba 1.1
4128     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4129     $self->{line_prev} = $self->{line};
4130     $self->{column_prev} = $self->{column};
4131     $self->{column}++;
4132     $self->{nc}
4133     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4134     } else {
4135     $self->{set_nc}->($self);
4136     }
4137    
4138    
4139     $self->{ct}->{quirks} = 1;
4140     return ($self->{ct}); # DOCTYPE
4141    
4142     redo A;
4143     } elsif ($self->{nc} == -1) {
4144     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');
4145    
4146 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4147    
4148     $self->{state} = DATA_STATE;
4149     $self->{s_kwd} = '';
4150     $self->{ct}->{quirks} = 1;
4151     } else {
4152    
4153     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4154     }
4155    
4156 wakaba 1.1 ## reconsume
4157 wakaba 1.16 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
4158 wakaba 1.1 redo A;
4159     } else {
4160    
4161 wakaba 1.16 $self->{ct}->{sysid} .= chr $self->{nc}; # DOCTYPE/ENTITY/NOTATION
4162 wakaba 1.1 $self->{read_until}->($self->{ct}->{sysid}, q['>],
4163     length $self->{ct}->{sysid});
4164    
4165     ## Stay in the state
4166    
4167     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4168     $self->{line_prev} = $self->{line};
4169     $self->{column_prev} = $self->{column};
4170     $self->{column}++;
4171     $self->{nc}
4172     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4173     } else {
4174     $self->{set_nc}->($self);
4175     }
4176    
4177     redo A;
4178     }
4179     } elsif ($self->{state} == AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
4180     if ($is_space->{$self->{nc}}) {
4181 wakaba 1.18 if ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN) {
4182    
4183     $self->{state} = BEFORE_NDATA_STATE;
4184     } else {
4185    
4186     ## Stay in the state
4187     }
4188 wakaba 1.1
4189     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4190     $self->{line_prev} = $self->{line};
4191     $self->{column_prev} = $self->{column};
4192     $self->{column}++;
4193     $self->{nc}
4194     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4195     } else {
4196     $self->{set_nc}->($self);
4197     }
4198    
4199     redo A;
4200     } elsif ($self->{nc} == 0x003E) { # >
4201 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4202    
4203     $self->{state} = DATA_STATE;
4204     $self->{s_kwd} = '';
4205     } else {
4206    
4207     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4208     }
4209    
4210 wakaba 1.1
4211     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4212     $self->{line_prev} = $self->{line};
4213     $self->{column_prev} = $self->{column};
4214     $self->{column}++;
4215     $self->{nc}
4216     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4217     } else {
4218     $self->{set_nc}->($self);
4219     }
4220    
4221 wakaba 1.16 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
4222 wakaba 1.1 redo A;
4223 wakaba 1.18 } elsif ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN and
4224     ($self->{nc} == 0x004E or # N
4225     $self->{nc} == 0x006E)) { # n
4226    
4227     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before NDATA'); ## TODO: type
4228     $self->{state} = NDATA_STATE;
4229     $self->{kwd} = chr $self->{nc};
4230    
4231     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4232     $self->{line_prev} = $self->{line};
4233     $self->{column_prev} = $self->{column};
4234     $self->{column}++;
4235     $self->{nc}
4236     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4237     } else {
4238     $self->{set_nc}->($self);
4239     }
4240    
4241     redo A;
4242 wakaba 1.1 } elsif ($self->{nc} == -1) {
4243 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4244    
4245     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
4246     $self->{state} = DATA_STATE;
4247     $self->{s_kwd} = '';
4248     $self->{ct}->{quirks} = 1;
4249     } else {
4250    
4251     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
4252     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4253     }
4254    
4255 wakaba 1.1 ## reconsume
4256 wakaba 1.16 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
4257 wakaba 1.1 redo A;
4258 wakaba 1.16 } elsif ($self->{is_xml} and
4259     $self->{ct}->{type} == DOCTYPE_TOKEN and
4260     $self->{nc} == 0x005B) { # [
4261 wakaba 1.12
4262     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4263     $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
4264 wakaba 1.13 $self->{in_subset} = 1;
4265 wakaba 1.12
4266     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4267     $self->{line_prev} = $self->{line};
4268     $self->{column_prev} = $self->{column};
4269     $self->{column}++;
4270     $self->{nc}
4271     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4272     } else {
4273     $self->{set_nc}->($self);
4274     }
4275    
4276 wakaba 1.13 return ($self->{ct}); # DOCTYPE
4277 wakaba 1.12 redo A;
4278 wakaba 1.1 } else {
4279     $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after SYSTEM literal');
4280    
4281 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4282    
4283     #$self->{ct}->{quirks} = 1;
4284     $self->{state} = BOGUS_DOCTYPE_STATE;
4285     } else {
4286    
4287     $self->{state} = BOGUS_MD_STATE;
4288     }
4289    
4290 wakaba 1.1
4291     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4292     $self->{line_prev} = $self->{line};
4293     $self->{column_prev} = $self->{column};
4294     $self->{column}++;
4295     $self->{nc}
4296     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4297     } else {
4298     $self->{set_nc}->($self);
4299     }
4300    
4301     redo A;
4302     }
4303 wakaba 1.18 } elsif ($self->{state} == BEFORE_NDATA_STATE) {
4304     if ($is_space->{$self->{nc}}) {
4305    
4306     ## Stay in the state.
4307    
4308     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4309     $self->{line_prev} = $self->{line};
4310     $self->{column_prev} = $self->{column};
4311     $self->{column}++;
4312     $self->{nc}
4313     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4314     } else {
4315     $self->{set_nc}->($self);
4316     }
4317    
4318     redo A;
4319     } elsif ($self->{nc} == 0x003E) { # >
4320    
4321     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4322    
4323     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4324     $self->{line_prev} = $self->{line};
4325     $self->{column_prev} = $self->{column};
4326     $self->{column}++;
4327     $self->{nc}
4328     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4329     } else {
4330     $self->{set_nc}->($self);
4331     }
4332    
4333     return ($self->{ct}); # ENTITY
4334     redo A;
4335     } elsif ($self->{nc} == 0x004E or # N
4336     $self->{nc} == 0x006E) { # n
4337    
4338     $self->{state} = NDATA_STATE;
4339     $self->{kwd} = chr $self->{nc};
4340    
4341     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4342     $self->{line_prev} = $self->{line};
4343     $self->{column_prev} = $self->{column};
4344     $self->{column}++;
4345     $self->{nc}
4346     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4347     } else {
4348     $self->{set_nc}->($self);
4349     }
4350    
4351     redo A;
4352     } elsif ($self->{nc} == -1) {
4353    
4354     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
4355     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4356     ## reconsume
4357     return ($self->{ct}); # ENTITY
4358     redo A;
4359     } else {
4360    
4361     $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after SYSTEM literal');
4362     $self->{state} = BOGUS_MD_STATE;
4363    
4364     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4365     $self->{line_prev} = $self->{line};
4366     $self->{column_prev} = $self->{column};
4367     $self->{column}++;
4368     $self->{nc}
4369     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4370     } else {
4371     $self->{set_nc}->($self);
4372     }
4373    
4374     redo A;
4375     }
4376 wakaba 1.1 } elsif ($self->{state} == BOGUS_DOCTYPE_STATE) {
4377     if ($self->{nc} == 0x003E) { # >
4378    
4379     $self->{state} = DATA_STATE;
4380 wakaba 1.5 $self->{s_kwd} = '';
4381 wakaba 1.1
4382     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4383     $self->{line_prev} = $self->{line};
4384     $self->{column_prev} = $self->{column};
4385     $self->{column}++;
4386     $self->{nc}
4387     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4388     } else {
4389     $self->{set_nc}->($self);
4390     }
4391    
4392    
4393     return ($self->{ct}); # DOCTYPE
4394    
4395     redo A;
4396 wakaba 1.12 } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
4397 wakaba 1.13
4398     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4399     $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
4400     $self->{in_subset} = 1;
4401    
4402 wakaba 1.12 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4403     $self->{line_prev} = $self->{line};
4404     $self->{column_prev} = $self->{column};
4405     $self->{column}++;
4406     $self->{nc}
4407     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4408     } else {
4409     $self->{set_nc}->($self);
4410     }
4411    
4412 wakaba 1.13 return ($self->{ct}); # DOCTYPE
4413     redo A;
4414 wakaba 1.1 } elsif ($self->{nc} == -1) {
4415    
4416     $self->{state} = DATA_STATE;
4417 wakaba 1.5 $self->{s_kwd} = '';
4418 wakaba 1.1 ## reconsume
4419    
4420     return ($self->{ct}); # DOCTYPE
4421    
4422     redo A;
4423     } else {
4424    
4425     my $s = '';
4426 wakaba 1.12 $self->{read_until}->($s, q{>[}, 0);
4427 wakaba 1.1
4428     ## Stay in the state
4429    
4430     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4431     $self->{line_prev} = $self->{line};
4432     $self->{column_prev} = $self->{column};
4433     $self->{column}++;
4434     $self->{nc}
4435     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4436     } else {
4437     $self->{set_nc}->($self);
4438     }
4439    
4440     redo A;
4441     }
4442     } elsif ($self->{state} == CDATA_SECTION_STATE) {
4443     ## NOTE: "CDATA section state" in the spec is jointly implemented
4444     ## by three states, |CDATA_SECTION_STATE|, |CDATA_SECTION_MSE1_STATE|,
4445     ## and |CDATA_SECTION_MSE2_STATE|.
4446 wakaba 1.10
4447     ## XML5: "CDATA state".
4448 wakaba 1.1
4449     if ($self->{nc} == 0x005D) { # ]
4450    
4451     $self->{state} = CDATA_SECTION_MSE1_STATE;
4452    
4453     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4454     $self->{line_prev} = $self->{line};
4455     $self->{column_prev} = $self->{column};
4456     $self->{column}++;
4457     $self->{nc}
4458     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4459     } else {
4460     $self->{set_nc}->($self);
4461     }
4462    
4463     redo A;
4464     } elsif ($self->{nc} == -1) {
4465 wakaba 1.6 if ($self->{is_xml}) {
4466 wakaba 1.8
4467 wakaba 1.6 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no mse'); ## TODO: type
4468 wakaba 1.8 } else {
4469    
4470 wakaba 1.6 }
4471    
4472 wakaba 1.1 $self->{state} = DATA_STATE;
4473 wakaba 1.5 $self->{s_kwd} = '';
4474 wakaba 1.10 ## Reconsume.
4475 wakaba 1.1 if (length $self->{ct}->{data}) { # character
4476    
4477     return ($self->{ct}); # character
4478     } else {
4479    
4480     ## No token to emit. $self->{ct} is discarded.
4481     }
4482     redo A;
4483     } else {
4484    
4485     $self->{ct}->{data} .= chr $self->{nc};
4486     $self->{read_until}->($self->{ct}->{data},
4487     q<]>,
4488     length $self->{ct}->{data});
4489    
4490     ## Stay in the state.
4491    
4492     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4493     $self->{line_prev} = $self->{line};
4494     $self->{column_prev} = $self->{column};
4495     $self->{column}++;
4496     $self->{nc}
4497     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4498     } else {
4499     $self->{set_nc}->($self);
4500     }
4501    
4502     redo A;
4503     }
4504    
4505     ## ISSUE: "text tokens" in spec.
4506     } elsif ($self->{state} == CDATA_SECTION_MSE1_STATE) {
4507 wakaba 1.10 ## XML5: "CDATA bracket state".
4508    
4509 wakaba 1.1 if ($self->{nc} == 0x005D) { # ]
4510    
4511     $self->{state} = CDATA_SECTION_MSE2_STATE;
4512    
4513     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4514     $self->{line_prev} = $self->{line};
4515     $self->{column_prev} = $self->{column};
4516     $self->{column}++;
4517     $self->{nc}
4518     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4519     } else {
4520     $self->{set_nc}->($self);
4521     }
4522    
4523     redo A;
4524     } else {
4525    
4526 wakaba 1.10 ## XML5: If EOF, "]" is not appended and changed to the data state.
4527 wakaba 1.1 $self->{ct}->{data} .= ']';
4528 wakaba 1.10 $self->{state} = CDATA_SECTION_STATE; ## XML5: Stay in the state.
4529 wakaba 1.1 ## Reconsume.
4530     redo A;
4531     }
4532     } elsif ($self->{state} == CDATA_SECTION_MSE2_STATE) {
4533 wakaba 1.10 ## XML5: "CDATA end state".
4534    
4535 wakaba 1.1 if ($self->{nc} == 0x003E) { # >
4536     $self->{state} = DATA_STATE;
4537 wakaba 1.5 $self->{s_kwd} = '';
4538 wakaba 1.1
4539     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4540     $self->{line_prev} = $self->{line};
4541     $self->{column_prev} = $self->{column};
4542     $self->{column}++;
4543     $self->{nc}
4544     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4545     } else {
4546     $self->{set_nc}->($self);
4547     }
4548    
4549     if (length $self->{ct}->{data}) { # character
4550    
4551     return ($self->{ct}); # character
4552     } else {
4553    
4554     ## No token to emit. $self->{ct} is discarded.
4555     }
4556     redo A;
4557     } elsif ($self->{nc} == 0x005D) { # ]
4558     # character
4559     $self->{ct}->{data} .= ']'; ## Add first "]" of "]]]".
4560     ## Stay in the state.
4561    
4562     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4563     $self->{line_prev} = $self->{line};
4564     $self->{column_prev} = $self->{column};
4565     $self->{column}++;
4566     $self->{nc}
4567     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4568     } else {
4569     $self->{set_nc}->($self);
4570     }
4571    
4572     redo A;
4573     } else {
4574    
4575     $self->{ct}->{data} .= ']]'; # character
4576     $self->{state} = CDATA_SECTION_STATE;
4577 wakaba 1.10 ## Reconsume. ## XML5: Emit.
4578 wakaba 1.1 redo A;
4579     }
4580     } elsif ($self->{state} == ENTITY_STATE) {
4581     if ($is_space->{$self->{nc}} or
4582     {
4583     0x003C => 1, 0x0026 => 1, -1 => 1, # <, &, EOF
4584     $self->{entity_add} => 1,
4585     }->{$self->{nc}}) {
4586    
4587     ## Don't consume
4588     ## No error
4589     ## Return nothing.
4590     #
4591     } elsif ($self->{nc} == 0x0023) { # #
4592    
4593     $self->{state} = ENTITY_HASH_STATE;
4594 wakaba 1.12 $self->{kwd} = '#';
4595 wakaba 1.1
4596     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4597     $self->{line_prev} = $self->{line};
4598     $self->{column_prev} = $self->{column};
4599     $self->{column}++;
4600     $self->{nc}
4601     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4602     } else {
4603     $self->{set_nc}->($self);
4604     }
4605    
4606     redo A;
4607     } elsif ((0x0041 <= $self->{nc} and
4608     $self->{nc} <= 0x005A) or # A..Z
4609     (0x0061 <= $self->{nc} and
4610     $self->{nc} <= 0x007A)) { # a..z
4611    
4612     require Whatpm::_NamedEntityList;
4613     $self->{state} = ENTITY_NAME_STATE;
4614 wakaba 1.12 $self->{kwd} = chr $self->{nc};
4615     $self->{entity__value} = $self->{kwd};
4616 wakaba 1.1 $self->{entity__match} = 0;
4617    
4618     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4619     $self->{line_prev} = $self->{line};
4620     $self->{column_prev} = $self->{column};
4621     $self->{column}++;
4622     $self->{nc}
4623     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4624     } else {
4625     $self->{set_nc}->($self);
4626     }
4627    
4628     redo A;
4629     } else {
4630    
4631     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare ero');
4632     ## Return nothing.
4633     #
4634     }
4635    
4636     ## NOTE: No character is consumed by the "consume a character
4637     ## reference" algorithm. In other words, there is an "&" character
4638     ## that does not introduce a character reference, which would be
4639     ## appended to the parent element or the attribute value in later
4640     ## process of the tokenizer.
4641    
4642     if ($self->{prev_state} == DATA_STATE) {
4643    
4644     $self->{state} = $self->{prev_state};
4645 wakaba 1.5 $self->{s_kwd} = '';
4646 wakaba 1.1 ## Reconsume.
4647     return ({type => CHARACTER_TOKEN, data => '&',
4648     line => $self->{line_prev},
4649     column => $self->{column_prev},
4650     });
4651     redo A;
4652     } else {
4653    
4654     $self->{ca}->{value} .= '&';
4655     $self->{state} = $self->{prev_state};
4656 wakaba 1.5 $self->{s_kwd} = '';
4657 wakaba 1.1 ## Reconsume.
4658     redo A;
4659     }
4660     } elsif ($self->{state} == ENTITY_HASH_STATE) {
4661     if ($self->{nc} == 0x0078 or # x
4662     $self->{nc} == 0x0058) { # X
4663    
4664     $self->{state} = HEXREF_X_STATE;
4665 wakaba 1.12 $self->{kwd} .= chr $self->{nc};
4666 wakaba 1.1
4667     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4668     $self->{line_prev} = $self->{line};
4669     $self->{column_prev} = $self->{column};
4670     $self->{column}++;
4671     $self->{nc}
4672     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4673     } else {
4674     $self->{set_nc}->($self);
4675     }
4676    
4677     redo A;
4678     } elsif (0x0030 <= $self->{nc} and
4679     $self->{nc} <= 0x0039) { # 0..9
4680    
4681     $self->{state} = NCR_NUM_STATE;
4682 wakaba 1.12 $self->{kwd} = $self->{nc} - 0x0030;
4683 wakaba 1.1
4684     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4685     $self->{line_prev} = $self->{line};
4686     $self->{column_prev} = $self->{column};
4687     $self->{column}++;
4688     $self->{nc}
4689     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4690     } else {
4691     $self->{set_nc}->($self);
4692     }
4693    
4694     redo A;
4695     } else {
4696     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare nero',
4697     line => $self->{line_prev},
4698     column => $self->{column_prev} - 1);
4699    
4700     ## NOTE: According to the spec algorithm, nothing is returned,
4701     ## and then "&#" is appended to the parent element or the attribute
4702     ## value in the later processing.
4703    
4704     if ($self->{prev_state} == DATA_STATE) {
4705    
4706     $self->{state} = $self->{prev_state};
4707 wakaba 1.5 $self->{s_kwd} = '';
4708 wakaba 1.1 ## Reconsume.
4709     return ({type => CHARACTER_TOKEN,
4710     data => '&#',
4711     line => $self->{line_prev},
4712     column => $self->{column_prev} - 1,
4713     });
4714     redo A;
4715     } else {
4716    
4717     $self->{ca}->{value} .= '&#';
4718     $self->{state} = $self->{prev_state};
4719 wakaba 1.5 $self->{s_kwd} = '';
4720 wakaba 1.1 ## Reconsume.
4721     redo A;
4722     }
4723     }
4724     } elsif ($self->{state} == NCR_NUM_STATE) {
4725     if (0x0030 <= $self->{nc} and
4726     $self->{nc} <= 0x0039) { # 0..9
4727    
4728 wakaba 1.12 $self->{kwd} *= 10;
4729     $self->{kwd} += $self->{nc} - 0x0030;
4730 wakaba 1.1
4731     ## Stay in the state.
4732    
4733     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4734     $self->{line_prev} = $self->{line};
4735     $self->{column_prev} = $self->{column};
4736     $self->{column}++;
4737     $self->{nc}
4738     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4739     } else {
4740     $self->{set_nc}->($self);
4741     }
4742    
4743     redo A;
4744     } elsif ($self->{nc} == 0x003B) { # ;
4745    
4746    
4747     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4748     $self->{line_prev} = $self->{line};
4749     $self->{column_prev} = $self->{column};
4750     $self->{column}++;
4751     $self->{nc}
4752     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4753     } else {
4754     $self->{set_nc}->($self);
4755     }
4756    
4757     #
4758     } else {
4759    
4760     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no refc');
4761     ## Reconsume.
4762     #
4763     }
4764    
4765 wakaba 1.12 my $code = $self->{kwd};
4766 wakaba 1.1 my $l = $self->{line_prev};
4767     my $c = $self->{column_prev};
4768     if ($charref_map->{$code}) {
4769    
4770     $self->{parse_error}->(level => $self->{level}->{must}, type => 'invalid character reference',
4771     text => (sprintf 'U+%04X', $code),
4772     line => $l, column => $c);
4773     $code = $charref_map->{$code};
4774     } elsif ($code > 0x10FFFF) {
4775    
4776     $self->{parse_error}->(level => $self->{level}->{must}, type => 'invalid character reference',
4777     text => (sprintf 'U-%08X', $code),
4778     line => $l, column => $c);
4779     $code = 0xFFFD;
4780     }
4781    
4782     if ($self->{prev_state} == DATA_STATE) {
4783    
4784     $self->{state} = $self->{prev_state};
4785 wakaba 1.5 $self->{s_kwd} = '';
4786 wakaba 1.1 ## Reconsume.
4787     return ({type => CHARACTER_TOKEN, data => chr $code,
4788 wakaba 1.7 has_reference => 1,
4789 wakaba 1.1 line => $l, column => $c,
4790     });
4791     redo A;
4792     } else {
4793    
4794     $self->{ca}->{value} .= chr $code;
4795     $self->{ca}->{has_reference} = 1;
4796     $self->{state} = $self->{prev_state};
4797 wakaba 1.5 $self->{s_kwd} = '';
4798 wakaba 1.1 ## Reconsume.
4799     redo A;
4800     }
4801     } elsif ($self->{state} == HEXREF_X_STATE) {
4802     if ((0x0030 <= $self->{nc} and $self->{nc} <= 0x0039) or
4803     (0x0041 <= $self->{nc} and $self->{nc} <= 0x0046) or
4804     (0x0061 <= $self->{nc} and $self->{nc} <= 0x0066)) {
4805     # 0..9, A..F, a..f
4806    
4807     $self->{state} = HEXREF_HEX_STATE;
4808 wakaba 1.12 $self->{kwd} = 0;
4809 wakaba 1.1 ## Reconsume.
4810     redo A;
4811     } else {
4812     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare hcro',
4813     line => $self->{line_prev},
4814     column => $self->{column_prev} - 2);
4815    
4816     ## NOTE: According to the spec algorithm, nothing is returned,
4817     ## and then "&#" followed by "X" or "x" is appended to the parent
4818     ## element or the attribute value in the later processing.
4819    
4820     if ($self->{prev_state} == DATA_STATE) {
4821    
4822     $self->{state} = $self->{prev_state};
4823 wakaba 1.5 $self->{s_kwd} = '';
4824 wakaba 1.1 ## Reconsume.
4825     return ({type => CHARACTER_TOKEN,
4826 wakaba 1.12 data => '&' . $self->{kwd},
4827 wakaba 1.1 line => $self->{line_prev},
4828 wakaba 1.12 column => $self->{column_prev} - length $self->{kwd},
4829 wakaba 1.1 });
4830     redo A;
4831     } else {
4832    
4833 wakaba 1.12 $self->{ca}->{value} .= '&' . $self->{kwd};
4834 wakaba 1.1 $self->{state} = $self->{prev_state};
4835 wakaba 1.5 $self->{s_kwd} = '';
4836 wakaba 1.1 ## Reconsume.
4837     redo A;
4838     }
4839     }
4840     } elsif ($self->{state} == HEXREF_HEX_STATE) {
4841     if (0x0030 <= $self->{nc} and $self->{nc} <= 0x0039) {
4842     # 0..9
4843    
4844 wakaba 1.12 $self->{kwd} *= 0x10;
4845     $self->{kwd} += $self->{nc} - 0x0030;
4846 wakaba 1.1 ## Stay in the state.
4847    
4848     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4849     $self->{line_prev} = $self->{line};
4850     $self->{column_prev} = $self->{column};
4851     $self->{column}++;
4852     $self->{nc}
4853     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4854     } else {
4855     $self->{set_nc}->($self);
4856     }
4857    
4858     redo A;
4859     } elsif (0x0061 <= $self->{nc} and
4860     $self->{nc} <= 0x0066) { # a..f
4861    
4862 wakaba 1.12 $self->{kwd} *= 0x10;
4863     $self->{kwd} += $self->{nc} - 0x0060 + 9;
4864 wakaba 1.1 ## Stay in the state.
4865    
4866     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4867     $self->{line_prev} = $self->{line};
4868     $self->{column_prev} = $self->{column};
4869     $self->{column}++;
4870     $self->{nc}
4871     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4872     } else {
4873     $self->{set_nc}->($self);
4874     }
4875    
4876     redo A;
4877     } elsif (0x0041 <= $self->{nc} and
4878     $self->{nc} <= 0x0046) { # A..F
4879    
4880 wakaba 1.12 $self->{kwd} *= 0x10;
4881     $self->{kwd} += $self->{nc} - 0x0040 + 9;
4882 wakaba 1.1 ## Stay in the state.
4883    
4884     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4885     $self->{line_prev} = $self->{line};
4886     $self->{column_prev} = $self->{column};
4887     $self->{column}++;
4888     $self->{nc}
4889     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4890     } else {
4891     $self->{set_nc}->($self);
4892     }
4893    
4894     redo A;
4895     } elsif ($self->{nc} == 0x003B) { # ;
4896    
4897    
4898     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4899     $self->{line_prev} = $self->{line};
4900     $self->{column_prev} = $self->{column};
4901     $self->{column}++;
4902     $self->{nc}
4903     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4904     } else {
4905     $self->{set_nc}->($self);
4906     }
4907    
4908     #
4909     } else {
4910    
4911     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no refc',
4912     line => $self->{line},
4913     column => $self->{column});
4914     ## Reconsume.
4915     #
4916     }
4917    
4918 wakaba 1.12 my $code = $self->{kwd};
4919 wakaba 1.1 my $l = $self->{line_prev};
4920     my $c = $self->{column_prev};
4921     if ($charref_map->{$code}) {
4922    
4923     $self->{parse_error}->(level => $self->{level}->{must}, type => 'invalid character reference',
4924     text => (sprintf 'U+%04X', $code),
4925     line => $l, column => $c);
4926     $code = $charref_map->{$code};
4927     } elsif ($code > 0x10FFFF) {
4928    
4929     $self->{parse_error}->(level => $self->{level}->{must}, type => 'invalid character reference',
4930     text => (sprintf 'U-%08X', $code),
4931     line => $l, column => $c);
4932     $code = 0xFFFD;
4933     }
4934    
4935     if ($self->{prev_state} == DATA_STATE) {
4936    
4937     $self->{state} = $self->{prev_state};
4938 wakaba 1.5 $self->{s_kwd} = '';
4939 wakaba 1.1 ## Reconsume.
4940     return ({type => CHARACTER_TOKEN, data => chr $code,
4941 wakaba 1.7 has_reference => 1,
4942 wakaba 1.1 line => $l, column => $c,
4943     });
4944     redo A;
4945     } else {
4946    
4947     $self->{ca}->{value} .= chr $code;
4948     $self->{ca}->{has_reference} = 1;
4949     $self->{state} = $self->{prev_state};
4950 wakaba 1.5 $self->{s_kwd} = '';
4951 wakaba 1.1 ## Reconsume.
4952     redo A;
4953     }
4954     } elsif ($self->{state} == ENTITY_NAME_STATE) {
4955 wakaba 1.12 if (length $self->{kwd} < 30 and
4956 wakaba 1.1 ## NOTE: Some number greater than the maximum length of entity name
4957     ((0x0041 <= $self->{nc} and # A
4958     $self->{nc} <= 0x005A) or # Z
4959     (0x0061 <= $self->{nc} and # a
4960     $self->{nc} <= 0x007A) or # z
4961     (0x0030 <= $self->{nc} and # 0
4962     $self->{nc} <= 0x0039) or # 9
4963     $self->{nc} == 0x003B)) { # ;
4964     our $EntityChar;
4965 wakaba 1.12 $self->{kwd} .= chr $self->{nc};
4966     if (defined $EntityChar->{$self->{kwd}}) {
4967 wakaba 1.1 if ($self->{nc} == 0x003B) { # ;
4968    
4969 wakaba 1.12 $self->{entity__value} = $EntityChar->{$self->{kwd}};
4970 wakaba 1.1 $self->{entity__match} = 1;
4971    
4972     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4973     $self->{line_prev} = $self->{line};
4974     $self->{column_prev} = $self->{column};
4975     $self->{column}++;
4976     $self->{nc}
4977     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4978     } else {
4979     $self->{set_nc}->($self);
4980     }
4981    
4982     #
4983     } else {
4984    
4985 wakaba 1.12 $self->{entity__value} = $EntityChar->{$self->{kwd}};
4986 wakaba 1.1 $self->{entity__match} = -1;
4987     ## Stay in the state.
4988    
4989     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4990     $self->{line_prev} = $self->{line};
4991     $self->{column_prev} = $self->{column};
4992     $self->{column}++;
4993     $self->{nc}
4994     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4995     } else {
4996     $self->{set_nc}->($self);
4997     }
4998    
4999     redo A;
5000     }
5001     } else {
5002    
5003     $self->{entity__value} .= chr $self->{nc};
5004     $self->{entity__match} *= 2;
5005     ## Stay in the state.
5006    
5007     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5008     $self->{line_prev} = $self->{line};
5009     $self->{column_prev} = $self->{column};
5010     $self->{column}++;
5011     $self->{nc}
5012     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5013     } else {
5014     $self->{set_nc}->($self);
5015     }
5016    
5017     redo A;
5018     }
5019     }
5020    
5021     my $data;
5022     my $has_ref;
5023     if ($self->{entity__match} > 0) {
5024    
5025     $data = $self->{entity__value};
5026     $has_ref = 1;
5027     #
5028     } elsif ($self->{entity__match} < 0) {
5029     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no refc');
5030     if ($self->{prev_state} != DATA_STATE and # in attribute
5031     $self->{entity__match} < -1) {
5032    
5033 wakaba 1.12 $data = '&' . $self->{kwd};
5034 wakaba 1.1 #
5035     } else {
5036    
5037     $data = $self->{entity__value};
5038     $has_ref = 1;
5039     #
5040     }
5041     } else {
5042    
5043     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare ero',
5044     line => $self->{line_prev},
5045 wakaba 1.12 column => $self->{column_prev} - length $self->{kwd});
5046     $data = '&' . $self->{kwd};
5047 wakaba 1.1 #
5048     }
5049    
5050     ## NOTE: In these cases, when a character reference is found,
5051     ## it is consumed and a character token is returned, or, otherwise,
5052     ## nothing is consumed and returned, according to the spec algorithm.
5053     ## In this implementation, anything that has been examined by the
5054     ## tokenizer is appended to the parent element or the attribute value
5055     ## as string, either literal string when no character reference or
5056     ## entity-replaced string otherwise, in this stage, since any characters
5057     ## that would not be consumed are appended in the data state or in an
5058     ## appropriate attribute value state anyway.
5059    
5060     if ($self->{prev_state} == DATA_STATE) {
5061    
5062     $self->{state} = $self->{prev_state};
5063 wakaba 1.5 $self->{s_kwd} = '';
5064 wakaba 1.1 ## Reconsume.
5065     return ({type => CHARACTER_TOKEN,
5066     data => $data,
5067 wakaba 1.7 has_reference => $has_ref,
5068 wakaba 1.1 line => $self->{line_prev},
5069 wakaba 1.12 column => $self->{column_prev} + 1 - length $self->{kwd},
5070 wakaba 1.1 });
5071     redo A;
5072     } else {
5073    
5074     $self->{ca}->{value} .= $data;
5075     $self->{ca}->{has_reference} = 1 if $has_ref;
5076     $self->{state} = $self->{prev_state};
5077 wakaba 1.5 $self->{s_kwd} = '';
5078 wakaba 1.1 ## Reconsume.
5079     redo A;
5080     }
5081 wakaba 1.8
5082     ## XML-only states
5083    
5084     } elsif ($self->{state} == PI_STATE) {
5085 wakaba 1.14 ## XML5: "Pi state" and "DOCTYPE pi state".
5086    
5087 wakaba 1.8 if ($is_space->{$self->{nc}} or
5088 wakaba 1.14 $self->{nc} == 0x003F or # ?
5089 wakaba 1.8 $self->{nc} == -1) {
5090 wakaba 1.14 ## XML5: U+003F: "pi state": Same as "Anything else"; "DOCTYPE
5091     ## pi state": Switch to the "DOCTYPE pi after state". EOF:
5092     ## "DOCTYPE pi state": Parse error, switch to the "data
5093     ## state".
5094 wakaba 1.8 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare pio', ## TODO: type
5095     line => $self->{line_prev},
5096     column => $self->{column_prev}
5097     - 1 * ($self->{nc} != -1));
5098     $self->{state} = BOGUS_COMMENT_STATE;
5099     ## Reconsume.
5100     $self->{ct} = {type => COMMENT_TOKEN,
5101     data => '?',
5102     line => $self->{line_prev},
5103     column => $self->{column_prev}
5104     - 1 * ($self->{nc} != -1),
5105     };
5106     redo A;
5107     } else {
5108 wakaba 1.14 ## XML5: "DOCTYPE pi state": Stay in the state.
5109 wakaba 1.8 $self->{ct} = {type => PI_TOKEN,
5110     target => chr $self->{nc},
5111     data => '',
5112     line => $self->{line_prev},
5113     column => $self->{column_prev} - 1,
5114     };
5115     $self->{state} = PI_TARGET_STATE;
5116    
5117     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5118     $self->{line_prev} = $self->{line};
5119     $self->{column_prev} = $self->{column};
5120     $self->{column}++;
5121     $self->{nc}
5122     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5123     } else {
5124     $self->{set_nc}->($self);
5125     }
5126    
5127     redo A;
5128     }
5129     } elsif ($self->{state} == PI_TARGET_STATE) {
5130     if ($is_space->{$self->{nc}}) {
5131     $self->{state} = PI_TARGET_AFTER_STATE;
5132    
5133     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5134     $self->{line_prev} = $self->{line};
5135     $self->{column_prev} = $self->{column};
5136     $self->{column}++;
5137     $self->{nc}
5138     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5139     } else {
5140     $self->{set_nc}->($self);
5141     }
5142    
5143     redo A;
5144     } elsif ($self->{nc} == -1) {
5145     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no pic'); ## TODO: type
5146 wakaba 1.13 if ($self->{in_subset}) {
5147     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5148     } else {
5149     $self->{state} = DATA_STATE;
5150     $self->{s_kwd} = '';
5151     }
5152 wakaba 1.8 ## Reconsume.
5153     return ($self->{ct}); # pi
5154     redo A;
5155     } elsif ($self->{nc} == 0x003F) { # ?
5156     $self->{state} = PI_AFTER_STATE;
5157    
5158     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5159     $self->{line_prev} = $self->{line};
5160     $self->{column_prev} = $self->{column};
5161     $self->{column}++;
5162     $self->{nc}
5163     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5164     } else {
5165     $self->{set_nc}->($self);
5166     }
5167    
5168     redo A;
5169     } else {
5170     ## XML5: typo ("tag name" -> "target")
5171     $self->{ct}->{target} .= chr $self->{nc}; # pi
5172    
5173     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5174     $self->{line_prev} = $self->{line};
5175     $self->{column_prev} = $self->{column};
5176     $self->{column}++;
5177     $self->{nc}
5178     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5179     } else {
5180     $self->{set_nc}->($self);
5181     }
5182    
5183     redo A;
5184     }
5185     } elsif ($self->{state} == PI_TARGET_AFTER_STATE) {
5186     if ($is_space->{$self->{nc}}) {
5187     ## Stay in the state.
5188    
5189     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5190     $self->{line_prev} = $self->{line};
5191     $self->{column_prev} = $self->{column};
5192     $self->{column}++;
5193     $self->{nc}
5194     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5195     } else {
5196     $self->{set_nc}->($self);
5197     }
5198    
5199     redo A;
5200     } else {
5201     $self->{state} = PI_DATA_STATE;
5202     ## Reprocess.
5203     redo A;
5204     }
5205     } elsif ($self->{state} == PI_DATA_STATE) {
5206     if ($self->{nc} == 0x003F) { # ?
5207     $self->{state} = PI_DATA_AFTER_STATE;
5208    
5209     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5210     $self->{line_prev} = $self->{line};
5211     $self->{column_prev} = $self->{column};
5212     $self->{column}++;
5213     $self->{nc}
5214     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5215     } else {
5216     $self->{set_nc}->($self);
5217     }
5218    
5219     redo A;
5220     } elsif ($self->{nc} == -1) {
5221     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no pic'); ## TODO: type
5222 wakaba 1.13 if ($self->{in_subset}) {
5223 wakaba 1.14 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state"
5224 wakaba 1.13 } else {
5225     $self->{state} = DATA_STATE;
5226     $self->{s_kwd} = '';
5227     }
5228 wakaba 1.8 ## Reprocess.
5229     return ($self->{ct}); # pi
5230     redo A;
5231     } else {
5232     $self->{ct}->{data} .= chr $self->{nc}; # pi
5233     $self->{read_until}->($self->{ct}->{data}, q[?],
5234     length $self->{ct}->{data});
5235     ## Stay in the state.
5236    
5237     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5238     $self->{line_prev} = $self->{line};
5239     $self->{column_prev} = $self->{column};
5240     $self->{column}++;
5241     $self->{nc}
5242     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5243     } else {
5244     $self->{set_nc}->($self);
5245     }
5246    
5247     ## Reprocess.
5248     redo A;
5249     }
5250     } elsif ($self->{state} == PI_AFTER_STATE) {
5251 wakaba 1.14 ## XML5: Part of "Pi after state".
5252    
5253 wakaba 1.8 if ($self->{nc} == 0x003E) { # >
5254 wakaba 1.13 if ($self->{in_subset}) {
5255     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5256     } else {
5257     $self->{state} = DATA_STATE;
5258     $self->{s_kwd} = '';
5259     }
5260 wakaba 1.8
5261     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5262     $self->{line_prev} = $self->{line};
5263     $self->{column_prev} = $self->{column};
5264     $self->{column}++;
5265     $self->{nc}
5266     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5267     } else {
5268     $self->{set_nc}->($self);
5269     }
5270    
5271     return ($self->{ct}); # pi
5272     redo A;
5273     } elsif ($self->{nc} == 0x003F) { # ?
5274     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no s after target', ## TODO: type
5275     line => $self->{line_prev},
5276     column => $self->{column_prev}); ## XML5: no error
5277     $self->{ct}->{data} .= '?';
5278     $self->{state} = PI_DATA_AFTER_STATE;
5279    
5280     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5281     $self->{line_prev} = $self->{line};
5282     $self->{column_prev} = $self->{column};
5283     $self->{column}++;
5284     $self->{nc}
5285     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5286     } else {
5287     $self->{set_nc}->($self);
5288     }
5289    
5290     redo A;
5291     } else {
5292     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no s after target', ## TODO: type
5293     line => $self->{line_prev},
5294     column => $self->{column_prev}
5295     + 1 * ($self->{nc} == -1)); ## XML5: no error
5296     $self->{ct}->{data} .= '?'; ## XML5: not appended
5297     $self->{state} = PI_DATA_STATE;
5298     ## Reprocess.
5299     redo A;
5300     }
5301     } elsif ($self->{state} == PI_DATA_AFTER_STATE) {
5302 wakaba 1.14 ## XML5: Same as "pi after state" and "DOCTYPE pi after state".
5303    
5304 wakaba 1.8 if ($self->{nc} == 0x003E) { # >
5305 wakaba 1.13 if ($self->{in_subset}) {
5306     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5307     } else {
5308     $self->{state} = DATA_STATE;
5309     $self->{s_kwd} = '';
5310     }
5311 wakaba 1.8
5312     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5313     $self->{line_prev} = $self->{line};
5314     $self->{column_prev} = $self->{column};
5315     $self->{column}++;
5316     $self->{nc}
5317     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5318     } else {
5319     $self->{set_nc}->($self);
5320     }
5321    
5322     return ($self->{ct}); # pi
5323     redo A;
5324     } elsif ($self->{nc} == 0x003F) { # ?
5325     $self->{ct}->{data} .= '?';
5326     ## Stay in the state.
5327    
5328     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5329     $self->{line_prev} = $self->{line};
5330     $self->{column_prev} = $self->{column};
5331     $self->{column}++;
5332     $self->{nc}
5333     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5334     } else {
5335     $self->{set_nc}->($self);
5336     }
5337    
5338     redo A;
5339     } else {
5340     $self->{ct}->{data} .= '?'; ## XML5: not appended
5341     $self->{state} = PI_DATA_STATE;
5342     ## Reprocess.
5343     redo A;
5344     }
5345 wakaba 1.12
5346     } elsif ($self->{state} == DOCTYPE_INTERNAL_SUBSET_STATE) {
5347     if ($self->{nc} == 0x003C) { # <
5348 wakaba 1.13 $self->{state} = DOCTYPE_TAG_STATE;
5349 wakaba 1.12
5350     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5351     $self->{line_prev} = $self->{line};
5352     $self->{column_prev} = $self->{column};
5353     $self->{column}++;
5354     $self->{nc}
5355     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5356     } else {
5357     $self->{set_nc}->($self);
5358     }
5359    
5360     redo A;
5361     } elsif ($self->{nc} == 0x0025) { # %
5362     ## XML5: Not defined yet.
5363    
5364     ## TODO:
5365    
5366     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5367     $self->{line_prev} = $self->{line};
5368     $self->{column_prev} = $self->{column};
5369     $self->{column}++;
5370     $self->{nc}
5371     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5372     } else {
5373     $self->{set_nc}->($self);
5374     }
5375    
5376     redo A;
5377     } elsif ($self->{nc} == 0x005D) { # ]
5378 wakaba 1.13 delete $self->{in_subset};
5379 wakaba 1.12 $self->{state} = DOCTYPE_INTERNAL_SUBSET_AFTER_STATE;
5380    
5381     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5382     $self->{line_prev} = $self->{line};
5383     $self->{column_prev} = $self->{column};
5384     $self->{column}++;
5385     $self->{nc}
5386     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5387     } else {
5388     $self->{set_nc}->($self);
5389     }
5390    
5391     redo A;
5392     } elsif ($is_space->{$self->{nc}}) {
5393     ## Stay in the state.
5394    
5395     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5396     $self->{line_prev} = $self->{line};
5397     $self->{column_prev} = $self->{column};
5398     $self->{column}++;
5399     $self->{nc}
5400     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5401     } else {
5402     $self->{set_nc}->($self);
5403     }
5404    
5405     redo A;
5406     } elsif ($self->{nc} == -1) {
5407     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed internal subset'); ## TODO: type
5408 wakaba 1.13 delete $self->{in_subset};
5409 wakaba 1.12 $self->{state} = DATA_STATE;
5410     $self->{s_kwd} = '';
5411     ## Reconsume.
5412 wakaba 1.13 return ({type => END_OF_DOCTYPE_TOKEN});
5413 wakaba 1.12 redo A;
5414     } else {
5415     unless ($self->{internal_subset_tainted}) {
5416     ## XML5: No parse error.
5417     $self->{parse_error}->(level => $self->{level}->{must}, type => 'string in internal subset');
5418     $self->{internal_subset_tainted} = 1;
5419     }
5420     ## Stay in the state.
5421    
5422     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5423     $self->{line_prev} = $self->{line};
5424     $self->{column_prev} = $self->{column};
5425     $self->{column}++;
5426     $self->{nc}
5427     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5428     } else {
5429     $self->{set_nc}->($self);
5430     }
5431    
5432     redo A;
5433     }
5434     } elsif ($self->{state} == DOCTYPE_INTERNAL_SUBSET_AFTER_STATE) {
5435     if ($self->{nc} == 0x003E) { # >
5436     $self->{state} = DATA_STATE;
5437     $self->{s_kwd} = '';
5438    
5439     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5440     $self->{line_prev} = $self->{line};
5441     $self->{column_prev} = $self->{column};
5442     $self->{column}++;
5443     $self->{nc}
5444     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5445     } else {
5446     $self->{set_nc}->($self);
5447     }
5448    
5449 wakaba 1.13 return ({type => END_OF_DOCTYPE_TOKEN});
5450 wakaba 1.12 redo A;
5451     } elsif ($self->{nc} == -1) {
5452     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
5453     $self->{state} = DATA_STATE;
5454     $self->{s_kwd} = '';
5455     ## Reconsume.
5456 wakaba 1.13 return ({type => END_OF_DOCTYPE_TOKEN});
5457 wakaba 1.12 redo A;
5458     } else {
5459     ## XML5: No parse error and stay in the state.
5460     $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after internal subset'); ## TODO: type
5461    
5462 wakaba 1.13 $self->{state} = BOGUS_DOCTYPE_INTERNAL_SUBSET_AFTER_STATE;
5463    
5464     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5465     $self->{line_prev} = $self->{line};
5466     $self->{column_prev} = $self->{column};
5467     $self->{column}++;
5468     $self->{nc}
5469     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5470     } else {
5471     $self->{set_nc}->($self);
5472     }
5473    
5474     redo A;
5475     }
5476     } elsif ($self->{state} == BOGUS_DOCTYPE_INTERNAL_SUBSET_AFTER_STATE) {
5477     if ($self->{nc} == 0x003E) { # >
5478     $self->{state} = DATA_STATE;
5479     $self->{s_kwd} = '';
5480    
5481     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5482     $self->{line_prev} = $self->{line};
5483     $self->{column_prev} = $self->{column};
5484     $self->{column}++;
5485     $self->{nc}
5486     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5487     } else {
5488     $self->{set_nc}->($self);
5489     }
5490    
5491     return ({type => END_OF_DOCTYPE_TOKEN});
5492     redo A;
5493     } elsif ($self->{nc} == -1) {
5494     $self->{state} = DATA_STATE;
5495     $self->{s_kwd} = '';
5496     ## Reconsume.
5497     return ({type => END_OF_DOCTYPE_TOKEN});
5498     redo A;
5499     } else {
5500     ## Stay in the state.
5501    
5502     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5503     $self->{line_prev} = $self->{line};
5504     $self->{column_prev} = $self->{column};
5505     $self->{column}++;
5506     $self->{nc}
5507     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5508     } else {
5509     $self->{set_nc}->($self);
5510     }
5511    
5512     redo A;
5513     }
5514     } elsif ($self->{state} == DOCTYPE_TAG_STATE) {
5515     if ($self->{nc} == 0x0021) { # !
5516 wakaba 1.14 $self->{state} = DOCTYPE_MARKUP_DECLARATION_OPEN_STATE;
5517 wakaba 1.13
5518     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5519     $self->{line_prev} = $self->{line};
5520     $self->{column_prev} = $self->{column};
5521     $self->{column}++;
5522     $self->{nc}
5523     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5524     } else {
5525     $self->{set_nc}->($self);
5526     }
5527    
5528     redo A;
5529     } elsif ($self->{nc} == 0x003F) { # ?
5530     $self->{state} = PI_STATE;
5531    
5532     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5533     $self->{line_prev} = $self->{line};
5534     $self->{column_prev} = $self->{column};
5535     $self->{column}++;
5536     $self->{nc}
5537     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5538     } else {
5539     $self->{set_nc}->($self);
5540     }
5541    
5542     redo A;
5543     } elsif ($self->{nc} == -1) {
5544     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare stago');
5545     $self->{state} = DATA_STATE;
5546     $self->{s_kwd} = '';
5547     ## Reconsume.
5548     redo A;
5549     } else {
5550     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare stago', ## XML5: Not a parse error.
5551     line => $self->{line_prev},
5552     column => $self->{column_prev});
5553     $self->{state} = BOGUS_COMMENT_STATE;
5554     $self->{ct} = {type => COMMENT_TOKEN,
5555     data => '',
5556     }; ## NOTE: Will be discarded.
5557 wakaba 1.12
5558     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5559     $self->{line_prev} = $self->{line};
5560     $self->{column_prev} = $self->{column};
5561     $self->{column}++;
5562     $self->{nc}
5563     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5564     } else {
5565     $self->{set_nc}->($self);
5566     }
5567    
5568     redo A;
5569     }
5570 wakaba 1.14 } elsif ($self->{state} == DOCTYPE_MARKUP_DECLARATION_OPEN_STATE) {
5571     ## XML5: "DOCTYPE markup declaration state".
5572    
5573     if ($self->{nc} == 0x002D) { # -
5574     $self->{state} = MD_HYPHEN_STATE;
5575    
5576     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5577     $self->{line_prev} = $self->{line};
5578     $self->{column_prev} = $self->{column};
5579     $self->{column}++;
5580     $self->{nc}
5581     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5582     } else {
5583     $self->{set_nc}->($self);
5584     }
5585    
5586     redo A;
5587 wakaba 1.17 } elsif ($self->{nc} == 0x0045 or # E
5588     $self->{nc} == 0x0065) { # e
5589 wakaba 1.14 $self->{state} = MD_E_STATE;
5590     $self->{kwd} = chr $self->{nc};
5591    
5592     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5593     $self->{line_prev} = $self->{line};
5594     $self->{column_prev} = $self->{column};
5595     $self->{column}++;
5596     $self->{nc}
5597     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5598     } else {
5599     $self->{set_nc}->($self);
5600     }
5601    
5602     redo A;
5603 wakaba 1.17 } elsif ($self->{nc} == 0x0041 or # A
5604     $self->{nc} == 0x0061) { # a
5605 wakaba 1.14 $self->{state} = MD_ATTLIST_STATE;
5606     $self->{kwd} = chr $self->{nc};
5607    
5608     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5609     $self->{line_prev} = $self->{line};
5610     $self->{column_prev} = $self->{column};
5611     $self->{column}++;
5612     $self->{nc}
5613     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5614     } else {
5615     $self->{set_nc}->($self);
5616     }
5617    
5618     redo A;
5619 wakaba 1.17 } elsif ($self->{nc} == 0x004E or # N
5620     $self->{nc} == 0x006E) { # n
5621 wakaba 1.14 $self->{state} = MD_NOTATION_STATE;
5622     $self->{kwd} = chr $self->{nc};
5623    
5624     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5625     $self->{line_prev} = $self->{line};
5626     $self->{column_prev} = $self->{column};
5627     $self->{column}++;
5628     $self->{nc}
5629     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5630     } else {
5631     $self->{set_nc}->($self);
5632     }
5633    
5634     redo A;
5635     } else {
5636     #
5637     }
5638    
5639     ## XML5: No parse error.
5640     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
5641     line => $self->{line_prev},
5642     column => $self->{column_prev} - 1);
5643     ## Reconsume.
5644     $self->{state} = BOGUS_COMMENT_STATE;
5645     $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded.
5646     redo A;
5647     } elsif ($self->{state} == MD_E_STATE) {
5648 wakaba 1.17 if ($self->{nc} == 0x004E or # N
5649     $self->{nc} == 0x006E) { # n
5650 wakaba 1.14 $self->{state} = MD_ENTITY_STATE;
5651     $self->{kwd} .= chr $self->{nc};
5652    
5653     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5654     $self->{line_prev} = $self->{line};
5655     $self->{column_prev} = $self->{column};
5656     $self->{column}++;
5657     $self->{nc}
5658     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5659     } else {
5660     $self->{set_nc}->($self);
5661     }
5662    
5663     redo A;
5664 wakaba 1.17 } elsif ($self->{nc} == 0x004C or # L
5665     $self->{nc} == 0x006C) { # l
5666 wakaba 1.14 ## XML5: <!ELEMENT> not supported.
5667     $self->{state} = MD_ELEMENT_STATE;
5668     $self->{kwd} .= chr $self->{nc};
5669    
5670     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5671     $self->{line_prev} = $self->{line};
5672     $self->{column_prev} = $self->{column};
5673     $self->{column}++;
5674     $self->{nc}
5675     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5676     } else {
5677     $self->{set_nc}->($self);
5678     }
5679    
5680     redo A;
5681     } else {
5682     ## XML5: No parse error.
5683     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
5684     line => $self->{line_prev},
5685     column => $self->{column_prev} - 2
5686     + 1 * ($self->{nc} == -1));
5687     ## Reconsume.
5688     $self->{state} = BOGUS_COMMENT_STATE;
5689     $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
5690     redo A;
5691     }
5692     } elsif ($self->{state} == MD_ENTITY_STATE) {
5693 wakaba 1.17 if ($self->{nc} == [
5694     undef,
5695     undef,
5696     0x0054, # T
5697     0x0049, # I
5698     0x0054, # T
5699     ]->[length $self->{kwd}] or
5700     $self->{nc} == [
5701     undef,
5702     undef,
5703     0x0074, # t
5704     0x0069, # i
5705     0x0074, # t
5706     ]->[length $self->{kwd}]) {
5707 wakaba 1.14 ## Stay in the state.
5708     $self->{kwd} .= chr $self->{nc};
5709    
5710     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5711     $self->{line_prev} = $self->{line};
5712     $self->{column_prev} = $self->{column};
5713     $self->{column}++;
5714     $self->{nc}
5715     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5716     } else {
5717     $self->{set_nc}->($self);
5718     }
5719    
5720     redo A;
5721 wakaba 1.17 } elsif ((length $self->{kwd}) == 5 and
5722     ($self->{nc} == 0x0059 or # Y
5723     $self->{nc} == 0x0079)) { # y
5724     if ($self->{kwd} ne 'ENTIT' or $self->{nc} == 0x0079) {
5725     $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
5726     text => 'ENTITY',
5727     line => $self->{line_prev},
5728     column => $self->{column_prev} - 4);
5729     }
5730     $self->{ct} = {type => GENERAL_ENTITY_TOKEN, name => '',
5731 wakaba 1.14 line => $self->{line_prev},
5732     column => $self->{column_prev} - 6};
5733     $self->{state} = DOCTYPE_MD_STATE;
5734    
5735     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5736     $self->{line_prev} = $self->{line};
5737     $self->{column_prev} = $self->{column};
5738     $self->{column}++;
5739     $self->{nc}
5740     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5741     } else {
5742     $self->{set_nc}->($self);
5743     }
5744    
5745     redo A;
5746     } else {
5747     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
5748     line => $self->{line_prev},
5749     column => $self->{column_prev} - 1
5750     - (length $self->{kwd})
5751     + 1 * ($self->{nc} == -1));
5752     $self->{state} = BOGUS_COMMENT_STATE;
5753     ## Reconsume.
5754     $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
5755     redo A;
5756     }
5757     } elsif ($self->{state} == MD_ELEMENT_STATE) {
5758 wakaba 1.17 if ($self->{nc} == [
5759     undef,
5760     undef,
5761     0x0045, # E
5762     0x004D, # M
5763     0x0045, # E
5764     0x004E, # N
5765     ]->[length $self->{kwd}] or
5766     $self->{nc} == [
5767     undef,
5768     undef,
5769     0x0065, # e
5770     0x006D, # m
5771     0x0065, # e
5772     0x006E, # n
5773     ]->[length $self->{kwd}]) {
5774 wakaba 1.14 ## Stay in the state.
5775     $self->{kwd} .= chr $self->{nc};
5776    
5777     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5778     $self->{line_prev} = $self->{line};
5779     $self->{column_prev} = $self->{column};
5780     $self->{column}++;
5781     $self->{nc}
5782     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5783     } else {
5784     $self->{set_nc}->($self);
5785     }
5786    
5787     redo A;
5788 wakaba 1.17 } elsif ((length $self->{kwd}) == 6 and
5789     ($self->{nc} == 0x0054 or # T
5790     $self->{nc} == 0x0074)) { # t
5791     if ($self->{kwd} ne 'ELEMEN' or $self->{nc} == 0x0074) {
5792     $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
5793     text => 'ELEMENT',
5794     line => $self->{line_prev},
5795     column => $self->{column_prev} - 5);
5796     }
5797 wakaba 1.14 $self->{ct} = {type => ELEMENT_TOKEN, name => '',
5798     line => $self->{line_prev},
5799     column => $self->{column_prev} - 6};
5800     $self->{state} = DOCTYPE_MD_STATE;
5801    
5802     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5803     $self->{line_prev} = $self->{line};
5804     $self->{column_prev} = $self->{column};
5805     $self->{column}++;
5806     $self->{nc}
5807     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5808     } else {
5809     $self->{set_nc}->($self);
5810     }
5811    
5812     redo A;
5813     } else {
5814     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
5815     line => $self->{line_prev},
5816     column => $self->{column_prev} - 1
5817     - (length $self->{kwd})
5818     + 1 * ($self->{nc} == -1));
5819     $self->{state} = BOGUS_COMMENT_STATE;
5820     ## Reconsume.
5821     $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
5822     redo A;
5823     }
5824     } elsif ($self->{state} == MD_ATTLIST_STATE) {
5825 wakaba 1.17 if ($self->{nc} == [
5826     undef,
5827     0x0054, # T
5828     0x0054, # T
5829     0x004C, # L
5830     0x0049, # I
5831     0x0053, # S
5832     ]->[length $self->{kwd}] or
5833     $self->{nc} == [
5834     undef,
5835     0x0074, # t
5836     0x0074, # t
5837     0x006C, # l
5838     0x0069, # i
5839     0x0073, # s
5840     ]->[length $self->{kwd}]) {
5841 wakaba 1.14 ## Stay in the state.
5842     $self->{kwd} .= chr $self->{nc};
5843    
5844     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5845     $self->{line_prev} = $self->{line};
5846     $self->{column_prev} = $self->{column};
5847     $self->{column}++;
5848     $self->{nc}
5849     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5850     } else {
5851     $self->{set_nc}->($self);
5852     }
5853    
5854     redo A;
5855 wakaba 1.17 } elsif ((length $self->{kwd}) == 6 and
5856     ($self->{nc} == 0x0054 or # T
5857     $self->{nc} == 0x0074)) { # t
5858     if ($self->{kwd} ne 'ATTLIS' or $self->{nc} == 0x0074) {
5859     $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
5860     text => 'ATTLIST',
5861     line => $self->{line_prev},
5862     column => $self->{column_prev} - 5);
5863     }
5864 wakaba 1.14 $self->{ct} = {type => ATTLIST_TOKEN, name => '',
5865 wakaba 1.15 attrdefs => [],
5866 wakaba 1.14 line => $self->{line_prev},
5867     column => $self->{column_prev} - 6};
5868     $self->{state} = DOCTYPE_MD_STATE;
5869    
5870     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5871     $self->{line_prev} = $self->{line};
5872     $self->{column_prev} = $self->{column};
5873     $self->{column}++;
5874     $self->{nc}
5875     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5876     } else {
5877     $self->{set_nc}->($self);
5878     }
5879    
5880     redo A;
5881     } else {
5882     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
5883     line => $self->{line_prev},
5884     column => $self->{column_prev} - 1
5885     - (length $self->{kwd})
5886     + 1 * ($self->{nc} == -1));
5887     $self->{state} = BOGUS_COMMENT_STATE;
5888     ## Reconsume.
5889     $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
5890     redo A;
5891     }
5892     } elsif ($self->{state} == MD_NOTATION_STATE) {
5893 wakaba 1.17 if ($self->{nc} == [
5894     undef,
5895     0x004F, # O
5896     0x0054, # T
5897     0x0041, # A
5898     0x0054, # T
5899     0x0049, # I
5900     0x004F, # O
5901     ]->[length $self->{kwd}] or
5902     $self->{nc} == [
5903     undef,
5904     0x006F, # o
5905     0x0074, # t
5906     0x0061, # a
5907     0x0074, # t
5908     0x0069, # i
5909     0x006F, # o
5910     ]->[length $self->{kwd}]) {
5911 wakaba 1.14 ## Stay in the state.
5912     $self->{kwd} .= chr $self->{nc};
5913    
5914     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5915     $self->{line_prev} = $self->{line};
5916     $self->{column_prev} = $self->{column};
5917     $self->{column}++;
5918     $self->{nc}
5919     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5920     } else {
5921     $self->{set_nc}->($self);
5922     }
5923    
5924     redo A;
5925 wakaba 1.17 } elsif ((length $self->{kwd}) == 7 and
5926     ($self->{nc} == 0x004E or # N
5927     $self->{nc} == 0x006E)) { # n
5928     if ($self->{kwd} ne 'NOTATIO' or $self->{nc} == 0x006E) {
5929     $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
5930     text => 'NOTATION',
5931     line => $self->{line_prev},
5932     column => $self->{column_prev} - 6);
5933     }
5934 wakaba 1.14 $self->{ct} = {type => NOTATION_TOKEN, name => '',
5935     line => $self->{line_prev},
5936     column => $self->{column_prev} - 6};
5937     $self->{state} = DOCTYPE_MD_STATE;
5938    
5939     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5940     $self->{line_prev} = $self->{line};
5941     $self->{column_prev} = $self->{column};
5942     $self->{column}++;
5943     $self->{nc}
5944     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5945     } else {
5946     $self->{set_nc}->($self);
5947     }
5948    
5949     redo A;
5950     } else {
5951     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
5952     line => $self->{line_prev},
5953     column => $self->{column_prev} - 1
5954     - (length $self->{kwd})
5955     + 1 * ($self->{nc} == -1));
5956     $self->{state} = BOGUS_COMMENT_STATE;
5957     ## Reconsume.
5958     $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
5959     redo A;
5960     }
5961     } elsif ($self->{state} == DOCTYPE_MD_STATE) {
5962     ## XML5: "DOCTYPE ENTITY state", "DOCTYPE ATTLIST state", and
5963     ## "DOCTYPE NOTATION state".
5964    
5965     if ($is_space->{$self->{nc}}) {
5966     ## XML5: [NOTATION] Switch to the "DOCTYPE NOTATION identifier state".
5967     $self->{state} = BEFORE_MD_NAME_STATE;
5968    
5969     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5970     $self->{line_prev} = $self->{line};
5971     $self->{column_prev} = $self->{column};
5972     $self->{column}++;
5973     $self->{nc}
5974     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5975     } else {
5976     $self->{set_nc}->($self);
5977     }
5978    
5979     redo A;
5980     } elsif ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN and
5981     $self->{nc} == 0x0025) { # %
5982     ## XML5: Switch to the "DOCTYPE bogus comment state".
5983     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before md name'); ## TODO: type
5984     $self->{state} = DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE;
5985    
5986     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5987     $self->{line_prev} = $self->{line};
5988     $self->{column_prev} = $self->{column};
5989     $self->{column}++;
5990     $self->{nc}
5991     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5992     } else {
5993     $self->{set_nc}->($self);
5994     }
5995    
5996     redo A;
5997     } elsif ($self->{nc} == -1) {
5998     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
5999     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6000     ## Reconsume.
6001     redo A;
6002     } elsif ($self->{nc} == 0x003E) { # >
6003     ## XML5: Switch to the "DOCTYPE bogus comment state".
6004     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no md name'); ## TODO: type
6005     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6006    
6007     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6008     $self->{line_prev} = $self->{line};
6009     $self->{column_prev} = $self->{column};
6010     $self->{column}++;
6011     $self->{nc}
6012     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6013     } else {
6014     $self->{set_nc}->($self);
6015     }
6016    
6017     redo A;
6018     } else {
6019     ## XML5: Switch to the "DOCTYPE bogus comment state".
6020     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before md name'); ## TODO: type
6021     $self->{state} = BEFORE_MD_NAME_STATE;
6022     redo A;
6023     }
6024     } elsif ($self->{state} == BEFORE_MD_NAME_STATE) {
6025     ## XML5: "DOCTYPE ENTITY parameter state", "DOCTYPE ENTITY type
6026     ## before state", "DOCTYPE ATTLIST name before state".
6027    
6028     if ($is_space->{$self->{nc}}) {
6029     ## Stay in the state.
6030    
6031     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6032     $self->{line_prev} = $self->{line};
6033     $self->{column_prev} = $self->{column};
6034     $self->{column}++;
6035     $self->{nc}
6036     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6037     } else {
6038     $self->{set_nc}->($self);
6039     }
6040    
6041     redo A;
6042     } elsif ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN and
6043     $self->{nc} == 0x0025) { # %
6044     $self->{state} = DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE;
6045    
6046     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6047     $self->{line_prev} = $self->{line};
6048     $self->{column_prev} = $self->{column};
6049     $self->{column}++;
6050     $self->{nc}
6051     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6052     } else {
6053     $self->{set_nc}->($self);
6054     }
6055    
6056     redo A;
6057     } elsif ($self->{nc} == 0x003E) { # >
6058     ## XML5: Same as "Anything else".
6059     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no md name'); ## TODO: type
6060     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6061    
6062     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6063     $self->{line_prev} = $self->{line};
6064     $self->{column_prev} = $self->{column};
6065     $self->{column}++;
6066     $self->{nc}
6067     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6068     } else {
6069     $self->{set_nc}->($self);
6070     }
6071    
6072     redo A;
6073     } elsif ($self->{nc} == -1) {
6074     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6075     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6076     ## Reconsume.
6077     redo A;
6078     } else {
6079     ## XML5: [ATTLIST] Not defined yet.
6080     $self->{ct}->{name} .= chr $self->{nc};
6081     $self->{state} = MD_NAME_STATE;
6082    
6083     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6084     $self->{line_prev} = $self->{line};
6085     $self->{column_prev} = $self->{column};
6086     $self->{column}++;
6087     $self->{nc}
6088     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6089     } else {
6090     $self->{set_nc}->($self);
6091     }
6092    
6093     redo A;
6094     }
6095     } elsif ($self->{state} == DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE) {
6096     if ($is_space->{$self->{nc}}) {
6097     ## XML5: Switch to the "DOCTYPE ENTITY parameter state".
6098     $self->{ct}->{type} = PARAMETER_ENTITY_TOKEN;
6099     $self->{state} = BEFORE_MD_NAME_STATE;
6100 wakaba 1.8
6101 wakaba 1.14 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6102     $self->{line_prev} = $self->{line};
6103     $self->{column_prev} = $self->{column};
6104     $self->{column}++;
6105     $self->{nc}
6106     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6107     } else {
6108     $self->{set_nc}->($self);
6109     }
6110    
6111     redo A;
6112     } elsif ($self->{nc} == 0x003E) { # >
6113     ## XML5: Same as "Anything else".
6114     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no md name'); ## TODO: type
6115     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6116    
6117     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6118     $self->{line_prev} = $self->{line};
6119     $self->{column_prev} = $self->{column};
6120     $self->{column}++;
6121     $self->{nc}
6122     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6123     } else {
6124     $self->{set_nc}->($self);
6125     }
6126    
6127     redo A;
6128     } elsif ($self->{nc} == -1) {
6129     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md');
6130     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6131     ## Reconsume.
6132     redo A;
6133     } else {
6134     ## XML5: No parse error.
6135     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space after ENTITY percent'); ## TODO: type
6136     $self->{state} = BOGUS_COMMENT_STATE;
6137     $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
6138     ## Reconsume.
6139     redo A;
6140     }
6141     } elsif ($self->{state} == MD_NAME_STATE) {
6142     ## XML5: "DOCTYPE ENTITY name state" and "DOCTYPE ATTLIST name state".
6143    
6144     if ($is_space->{$self->{nc}}) {
6145 wakaba 1.16 if ($self->{ct}->{type} == ATTLIST_TOKEN) {
6146     $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
6147     } elsif ($self->{ct}->{type} == ELEMENT_TOKEN) {
6148     ## TODO: ...
6149     $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
6150     } else { # ENTITY/NOTATION
6151     $self->{state} = AFTER_DOCTYPE_NAME_STATE;
6152     }
6153 wakaba 1.14
6154     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6155     $self->{line_prev} = $self->{line};
6156     $self->{column_prev} = $self->{column};
6157     $self->{column}++;
6158     $self->{nc}
6159     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6160     } else {
6161     $self->{set_nc}->($self);
6162     }
6163    
6164     redo A;
6165     } elsif ($self->{nc} == 0x003E) { # >
6166     if ($self->{ct}->{type} == ATTLIST_TOKEN) {
6167     #
6168     } else {
6169 wakaba 1.16 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no md def'); ## TODO: type
6170 wakaba 1.14 }
6171     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6172    
6173     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6174     $self->{line_prev} = $self->{line};
6175     $self->{column_prev} = $self->{column};
6176     $self->{column}++;
6177     $self->{nc}
6178     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6179     } else {
6180     $self->{set_nc}->($self);
6181     }
6182    
6183     return ($self->{ct}); # ELEMENT/ENTITY/ATTLIST/NOTATION
6184     redo A;
6185     } elsif ($self->{nc} == -1) {
6186     ## XML5: [ATTLIST] No parse error.
6187     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md');
6188     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6189     ## Reconsume.
6190     return ($self->{ct}); # ELEMENT/ENTITY/ATTLIST/NOTATION
6191     redo A;
6192     } else {
6193     ## XML5: [ATTLIST] Not defined yet.
6194     $self->{ct}->{name} .= chr $self->{nc};
6195     ## Stay in the state.
6196    
6197     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6198     $self->{line_prev} = $self->{line};
6199     $self->{column_prev} = $self->{column};
6200     $self->{column}++;
6201     $self->{nc}
6202     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6203     } else {
6204     $self->{set_nc}->($self);
6205     }
6206    
6207     redo A;
6208     }
6209     } elsif ($self->{state} == DOCTYPE_ATTLIST_NAME_AFTER_STATE) {
6210     if ($is_space->{$self->{nc}}) {
6211     ## Stay in the state.
6212    
6213     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6214     $self->{line_prev} = $self->{line};
6215     $self->{column_prev} = $self->{column};
6216     $self->{column}++;
6217     $self->{nc}
6218     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6219     } else {
6220     $self->{set_nc}->($self);
6221     }
6222    
6223     redo A;
6224     } elsif ($self->{nc} == 0x003E) { # >
6225     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6226    
6227     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6228     $self->{line_prev} = $self->{line};
6229     $self->{column_prev} = $self->{column};
6230     $self->{column}++;
6231     $self->{nc}
6232     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6233     } else {
6234     $self->{set_nc}->($self);
6235     }
6236    
6237     return ($self->{ct}); # ATTLIST
6238     redo A;
6239     } elsif ($self->{nc} == -1) {
6240     ## XML5: No parse error.
6241     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6242     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6243 wakaba 1.15 return ($self->{ct});
6244 wakaba 1.14 redo A;
6245     } else {
6246     ## XML5: Not defined yet.
6247 wakaba 1.15 $self->{ca} = {name => chr ($self->{nc}), # attrdef
6248     tokens => [],
6249     line => $self->{line}, column => $self->{column}};
6250     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_NAME_STATE;
6251    
6252     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6253     $self->{line_prev} = $self->{line};
6254     $self->{column_prev} = $self->{column};
6255     $self->{column}++;
6256     $self->{nc}
6257     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6258     } else {
6259     $self->{set_nc}->($self);
6260     }
6261    
6262     redo A;
6263     }
6264     } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_NAME_STATE) {
6265     if ($is_space->{$self->{nc}}) {
6266     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_NAME_AFTER_STATE;
6267    
6268     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6269     $self->{line_prev} = $self->{line};
6270     $self->{column_prev} = $self->{column};
6271     $self->{column}++;
6272     $self->{nc}
6273     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6274     } else {
6275     $self->{set_nc}->($self);
6276     }
6277    
6278     redo A;
6279     } elsif ($self->{nc} == 0x003E) { # >
6280     ## XML5: Same as "anything else".
6281     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr type'); ## TODO: type
6282     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6283    
6284     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6285     $self->{line_prev} = $self->{line};
6286     $self->{column_prev} = $self->{column};
6287     $self->{column}++;
6288     $self->{nc}
6289     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6290     } else {
6291     $self->{set_nc}->($self);
6292     }
6293    
6294     return ($self->{ct}); # ATTLIST
6295     redo A;
6296     } elsif ($self->{nc} == 0x0028) { # (
6297     ## XML5: Same as "anything else".
6298     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before paren'); ## TODO: type
6299     $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
6300    
6301     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6302     $self->{line_prev} = $self->{line};
6303     $self->{column_prev} = $self->{column};
6304     $self->{column}++;
6305     $self->{nc}
6306     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6307     } else {
6308     $self->{set_nc}->($self);
6309     }
6310    
6311     redo A;
6312     } elsif ($self->{nc} == -1) {
6313     ## XML5: No parse error.
6314     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6315     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6316    
6317     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6318     $self->{line_prev} = $self->{line};
6319     $self->{column_prev} = $self->{column};
6320     $self->{column}++;
6321     $self->{nc}
6322     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6323     } else {
6324     $self->{set_nc}->($self);
6325     }
6326    
6327     return ($self->{ct}); # ATTLIST
6328     redo A;
6329     } else {
6330     ## XML5: Not defined yet.
6331     $self->{ca}->{name} .= chr $self->{nc};
6332     ## Stay in the state.
6333    
6334     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6335     $self->{line_prev} = $self->{line};
6336     $self->{column_prev} = $self->{column};
6337     $self->{column}++;
6338     $self->{nc}
6339     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6340     } else {
6341     $self->{set_nc}->($self);
6342     }
6343    
6344 wakaba 1.14 redo A;
6345     }
6346 wakaba 1.15 } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_NAME_AFTER_STATE) {
          ## State handler: input that follows an ATTLIST attribute name (the
          ## preceding handler switches here when whitespace ends the name).
          ## Dispatches on the current input character $self->{nc}:
          ##   - $is_space character: ignored, the state is kept.
          ##   - ">" (U+003E): 'no attr type' error, switch to
          ##     DOCTYPE_INTERNAL_SUBSET_STATE and return $self->{ct}.
          ##   - "(" (U+0028): switch to BEFORE_ALLOWED_TOKEN_STATE.
          ##   - -1 (presumably the end-of-input marker -- confirm against
          ##     set_nc): 'unclosed md' error, switch to
          ##     DOCTYPE_INTERNAL_SUBSET_STATE and return $self->{ct}.
          ##   - anything else: becomes the first character of
          ##     $self->{ca}->{type}; switch to
          ##     DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_STATE.
          ## Every branch advances the input before leaving: the next character
          ## is taken from char_buffer (saving line/column into line_prev/
          ## column_prev and incrementing column), or the set_nc callback is
          ## invoked once the buffer is exhausted.  A "redo A" that directly
          ## follows a "return" is never executed.
6347     if ($is_space->{$self->{nc}}) {
6348     ## Stay in the state.
6349    
6350     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6351     $self->{line_prev} = $self->{line};
6352     $self->{column_prev} = $self->{column};
6353     $self->{column}++;
6354     $self->{nc}
6355     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6356     } else {
6357     $self->{set_nc}->($self);
6358     }
6359    
6360     redo A;
6361     } elsif ($self->{nc} == 0x003E) { # >
6362     ## XML5: Same as "anything else".
6363     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr type'); ## TODO: type
6364     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6365    
6366     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6367     $self->{line_prev} = $self->{line};
6368     $self->{column_prev} = $self->{column};
6369     $self->{column}++;
6370     $self->{nc}
6371     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6372     } else {
6373     $self->{set_nc}->($self);
6374     }
6375    
6376     return ($self->{ct}); # ATTLIST
6377     redo A;
6378     } elsif ($self->{nc} == 0x0028) { # (
6379     ## XML5: Same as "anything else".
6380     $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
6381    
6382     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6383     $self->{line_prev} = $self->{line};
6384     $self->{column_prev} = $self->{column};
6385     $self->{column}++;
6386     $self->{nc}
6387     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6388     } else {
6389     $self->{set_nc}->($self);
6390     }
6391    
6392     redo A;
6393     } elsif ($self->{nc} == -1) {
6394     ## XML5: No parse error.
6395     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6396     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6397    
6398     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6399     $self->{line_prev} = $self->{line};
6400     $self->{column_prev} = $self->{column};
6401     $self->{column}++;
6402     $self->{nc}
6403     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6404     } else {
6405     $self->{set_nc}->($self);
6406     }
6407    
6408     return ($self->{ct});
6409     redo A;
6410     } else {
6411     ## XML5: Not defined yet.
6412     $self->{ca}->{type} = chr $self->{nc};
6413     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_STATE;
6414    
6415     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6416     $self->{line_prev} = $self->{line};
6417     $self->{column_prev} = $self->{column};
6418     $self->{column}++;
6419     $self->{nc}
6420     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6421     } else {
6422     $self->{set_nc}->($self);
6423     }
6424    
6425     redo A;
6426     }
6427     } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_STATE) {
          ## State handler: accumulates the attribute type text in
          ## $self->{ca}->{type}.  Dispatches on the current input character
          ## $self->{nc}:
          ##   - $is_space character: switch to
          ##     DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_AFTER_STATE.
          ##   - "#" (U+0023): 'no space before default value' error, switch to
          ##     DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE.
          ##   - '"' / "'" (U+0022 / U+0027): same error, reset
          ##     $self->{ca}->{value} to '' and switch to the double- or
          ##     single-quoted attribute value state respectively.
          ##   - ">" (U+003E): 'no attr default' error, switch to
          ##     DOCTYPE_INTERNAL_SUBSET_STATE and return $self->{ct}.
          ##   - "(" (U+0028): 'no space before paren' error, switch to
          ##     BEFORE_ALLOWED_TOKEN_STATE.
          ##   - -1 (presumably the end-of-input marker -- confirm against
          ##     set_nc): 'unclosed md' error, switch to
          ##     DOCTYPE_INTERNAL_SUBSET_STATE and return $self->{ct}.
          ##   - anything else: appended to $self->{ca}->{type}; state is kept.
          ## Every branch advances the input before leaving: the next character
          ## is taken from char_buffer (saving line/column into line_prev/
          ## column_prev and incrementing column), or the set_nc callback is
          ## invoked once the buffer is exhausted.  A "redo A" that directly
          ## follows a "return" is never executed.
6428     if ($is_space->{$self->{nc}}) {
6429     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_AFTER_STATE;
6430    
6431     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6432     $self->{line_prev} = $self->{line};
6433     $self->{column_prev} = $self->{column};
6434     $self->{column}++;
6435     $self->{nc}
6436     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6437     } else {
6438     $self->{set_nc}->($self);
6439     }
6440    
6441     redo A;
6442     } elsif ($self->{nc} == 0x0023) { # #
6443     ## XML5: Same as "anything else".
6444     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
6445     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE;
6446    
6447     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6448     $self->{line_prev} = $self->{line};
6449     $self->{column_prev} = $self->{column};
6450     $self->{column}++;
6451     $self->{nc}
6452     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6453     } else {
6454     $self->{set_nc}->($self);
6455     }
6456    
6457     redo A;
6458     } elsif ($self->{nc} == 0x0022) { # "
6459     ## XML5: Same as "anything else".
6460     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
6461     $self->{ca}->{value} = '';
6462     $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
6463    
6464     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6465     $self->{line_prev} = $self->{line};
6466     $self->{column_prev} = $self->{column};
6467     $self->{column}++;
6468     $self->{nc}
6469     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6470     } else {
6471     $self->{set_nc}->($self);
6472     }
6473    
6474     redo A;
6475     } elsif ($self->{nc} == 0x0027) { # '
6476     ## XML5: Same as "anything else".
6477     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
6478     $self->{ca}->{value} = '';
6479     $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
6480    
6481     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6482     $self->{line_prev} = $self->{line};
6483     $self->{column_prev} = $self->{column};
6484     $self->{column}++;
6485     $self->{nc}
6486     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6487     } else {
6488     $self->{set_nc}->($self);
6489     }
6490    
6491     redo A;
6492     } elsif ($self->{nc} == 0x003E) { # >
6493     ## XML5: Same as "anything else".
6494     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr default'); ## TODO: type
6495     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6496    
6497     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6498     $self->{line_prev} = $self->{line};
6499     $self->{column_prev} = $self->{column};
6500     $self->{column}++;
6501     $self->{nc}
6502     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6503     } else {
6504     $self->{set_nc}->($self);
6505     }
6506    
6507     return ($self->{ct}); # ATTLIST
6508     redo A;
6509     } elsif ($self->{nc} == 0x0028) { # (
6510     ## XML5: Same as "anything else".
6511     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before paren'); ## TODO: type
6512     $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
6513    
6514     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6515     $self->{line_prev} = $self->{line};
6516     $self->{column_prev} = $self->{column};
6517     $self->{column}++;
6518     $self->{nc}
6519     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6520     } else {
6521     $self->{set_nc}->($self);
6522     }
6523    
6524     redo A;
6525     } elsif ($self->{nc} == -1) {
6526     ## XML5: No parse error.
6527     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6528     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6529    
6530     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6531     $self->{line_prev} = $self->{line};
6532     $self->{column_prev} = $self->{column};
6533     $self->{column}++;
6534     $self->{nc}
6535     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6536     } else {
6537     $self->{set_nc}->($self);
6538     }
6539    
6540     return ($self->{ct});
6541     redo A;
6542     } else {
6543     ## XML5: Not defined yet.
6544     $self->{ca}->{type} .= chr $self->{nc};
6545     ## Stay in the state.
6546    
6547     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6548     $self->{line_prev} = $self->{line};
6549     $self->{column_prev} = $self->{column};
6550     $self->{column}++;
6551     $self->{nc}
6552     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6553     } else {
6554     $self->{set_nc}->($self);
6555     }
6556    
6557     redo A;
6558     }
6559     } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_AFTER_STATE) {
          ## State handler: input that follows the attribute type.  Dispatches
          ## on the current input character $self->{nc}:
          ##   - $is_space character: ignored, the state is kept.
          ##   - "(" (U+0028): switch to BEFORE_ALLOWED_TOKEN_STATE.
          ##   - "#" (U+0023): switch to
          ##     DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE.
          ##   - '"' / "'" (U+0022 / U+0027): reset $self->{ca}->{value} to ''
          ##     and switch to the double- or single-quoted attribute value
          ##     state respectively.
          ##   - ">" (U+003E): 'no attr default' error, switch to
          ##     DOCTYPE_INTERNAL_SUBSET_STATE and return $self->{ct}.
          ##   - -1 (presumably the end-of-input marker -- confirm against
          ##     set_nc): 'unclosed md' error, switch to
          ##     DOCTYPE_INTERNAL_SUBSET_STATE and return $self->{ct}.
          ##   - anything else: 'unquoted attr value' error, reset
          ##     $self->{ca}->{value} to '' and switch to
          ##     ATTRIBUTE_VALUE_UNQUOTED_STATE WITHOUT advancing the input, so
          ##     the same character is examined again in the new state.
          ## All other branches advance the input before leaving: the next
          ## character is taken from char_buffer (saving line/column into
          ## line_prev/column_prev and incrementing column), or the set_nc
          ## callback is invoked once the buffer is exhausted.  A "redo A" that
          ## directly follows a "return" is never executed.
6560     if ($is_space->{$self->{nc}}) {
6561     ## Stay in the state.
6562    
6563     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6564     $self->{line_prev} = $self->{line};
6565     $self->{column_prev} = $self->{column};
6566     $self->{column}++;
6567     $self->{nc}
6568     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6569     } else {
6570     $self->{set_nc}->($self);
6571     }
6572    
6573     redo A;
6574     } elsif ($self->{nc} == 0x0028) { # (
6575     ## XML5: Same as "anything else".
6576     $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
6577    
6578     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6579     $self->{line_prev} = $self->{line};
6580     $self->{column_prev} = $self->{column};
6581     $self->{column}++;
6582     $self->{nc}
6583     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6584     } else {
6585     $self->{set_nc}->($self);
6586     }
6587    
6588     redo A;
6589     } elsif ($self->{nc} == 0x0023) { # #
6590     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE;
6591    
6592     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6593     $self->{line_prev} = $self->{line};
6594     $self->{column_prev} = $self->{column};
6595     $self->{column}++;
6596     $self->{nc}
6597     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6598     } else {
6599     $self->{set_nc}->($self);
6600     }
6601    
6602     redo A;
6603     } elsif ($self->{nc} == 0x0022) { # "
6604     ## XML5: Same as "anything else".
6605     $self->{ca}->{value} = '';
6606     $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
6607    
6608     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6609     $self->{line_prev} = $self->{line};
6610     $self->{column_prev} = $self->{column};
6611     $self->{column}++;
6612     $self->{nc}
6613     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6614     } else {
6615     $self->{set_nc}->($self);
6616     }
6617    
6618     redo A;
6619     } elsif ($self->{nc} == 0x0027) { # '
6620     ## XML5: Same as "anything else".
6621     $self->{ca}->{value} = '';
6622     $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
6623    
6624     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6625     $self->{line_prev} = $self->{line};
6626     $self->{column_prev} = $self->{column};
6627     $self->{column}++;
6628     $self->{nc}
6629     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6630     } else {
6631     $self->{set_nc}->($self);
6632     }
6633    
6634     redo A;
6635     } elsif ($self->{nc} == 0x003E) { # >
6636     ## XML5: Same as "anything else".
6637     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr default'); ## TODO: type
6638     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6639    
6640     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6641     $self->{line_prev} = $self->{line};
6642     $self->{column_prev} = $self->{column};
6643     $self->{column}++;
6644     $self->{nc}
6645     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6646     } else {
6647     $self->{set_nc}->($self);
6648     }
6649    
6650     return ($self->{ct}); # ATTLIST
6651     redo A;
6652     } elsif ($self->{nc} == -1) {
6653     ## XML5: No parse error.
6654     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6655     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6656    
6657     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6658     $self->{line_prev} = $self->{line};
6659     $self->{column_prev} = $self->{column};
6660     $self->{column}++;
6661     $self->{nc}
6662     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6663     } else {
6664     $self->{set_nc}->($self);
6665     }
6666    
6667     return ($self->{ct});
6668     redo A;
6669     } else {
6670     ## XML5: Switch to the "DOCTYPE bogus comment state".
6671     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unquoted attr value'); ## TODO: type
6672     $self->{ca}->{value} = '';
6673     $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
6674     ## Reconsume.
6675     redo A;
6676     }
6677     } elsif ($self->{state} == BEFORE_ALLOWED_TOKEN_STATE) {
          ## State handler: expects the start of an entry in the parenthesised
          ## allowed-token list kept in @{$self->{ca}->{tokens}}.  Dispatches on
          ## the current input character $self->{nc}:
          ##   - $is_space character: ignored, the state is kept.
          ##   - "|" (U+007C): 'empty allowed token' error, the state is kept.
          ##   - ")" (U+0029): 'empty allowed token' error, switch to
          ##     AFTER_ALLOWED_TOKENS_STATE.
          ##   - ">" (U+003E): 'unclosed allowed tokens' error, switch to
          ##     DOCTYPE_INTERNAL_SUBSET_STATE and return $self->{ct}.
          ##   - -1 (presumably the end-of-input marker -- confirm against
          ##     set_nc): 'unclosed md' error, switch to
          ##     DOCTYPE_INTERNAL_SUBSET_STATE and return $self->{ct}.
          ##   - anything else: pushed onto the token list as the first
          ##     character of a new token; switch to ALLOWED_TOKEN_STATE.
          ## Every branch advances the input before leaving: the next character
          ## is taken from char_buffer (saving line/column into line_prev/
          ## column_prev and incrementing column), or the set_nc callback is
          ## invoked once the buffer is exhausted.  A "redo A" that directly
          ## follows a "return" is never executed.
6678     if ($is_space->{$self->{nc}}) {
6679     ## Stay in the state.
6680    
6681     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6682     $self->{line_prev} = $self->{line};
6683     $self->{column_prev} = $self->{column};
6684     $self->{column}++;
6685     $self->{nc}
6686     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6687     } else {
6688     $self->{set_nc}->($self);
6689     }
6690    
6691     redo A;
6692     } elsif ($self->{nc} == 0x007C) { # |
6693     $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty allowed token'); ## TODO: type
6694     ## Stay in the state.
6695    
6696     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6697     $self->{line_prev} = $self->{line};
6698     $self->{column_prev} = $self->{column};
6699     $self->{column}++;
6700     $self->{nc}
6701     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6702     } else {
6703     $self->{set_nc}->($self);
6704     }
6705    
6706     redo A;
6707     } elsif ($self->{nc} == 0x0029) { # )
6708     $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty allowed token'); ## TODO: type
6709     $self->{state} = AFTER_ALLOWED_TOKENS_STATE;
6710    
6711     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6712     $self->{line_prev} = $self->{line};
6713     $self->{column_prev} = $self->{column};
6714     $self->{column}++;
6715     $self->{nc}
6716     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6717     } else {
6718     $self->{set_nc}->($self);
6719     }
6720    
6721     redo A;
6722     } elsif ($self->{nc} == 0x003E) { # >
6723     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed allowed tokens'); ## TODO: type
6724     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6725    
6726     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6727     $self->{line_prev} = $self->{line};
6728     $self->{column_prev} = $self->{column};
6729     $self->{column}++;
6730     $self->{nc}
6731     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6732     } else {
6733     $self->{set_nc}->($self);
6734     }
6735    
6736     return ($self->{ct}); # ATTLIST
6737     redo A;
6738     } elsif ($self->{nc} == -1) {
6739     ## XML5: No parse error.
6740     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6741     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6742    
6743     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6744     $self->{line_prev} = $self->{line};
6745     $self->{column_prev} = $self->{column};
6746     $self->{column}++;
6747     $self->{nc}
6748     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6749     } else {
6750     $self->{set_nc}->($self);
6751     }
6752    
6753     return ($self->{ct});
6754     redo A;
6755     } else {
6756     push @{$self->{ca}->{tokens}}, chr $self->{nc};
6757     $self->{state} = ALLOWED_TOKEN_STATE;
6758    
6759     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6760     $self->{line_prev} = $self->{line};
6761     $self->{column_prev} = $self->{column};
6762     $self->{column}++;
6763     $self->{nc}
6764     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6765     } else {
6766     $self->{set_nc}->($self);
6767     }
6768    
6769     redo A;
6770     }
6771     } elsif ($self->{state} == ALLOWED_TOKEN_STATE) {
          ## State handler: inside one entry of the allowed-token list; the
          ## entry being built is the last element of @{$self->{ca}->{tokens}}.
          ## Dispatches on the current input character $self->{nc}:
          ##   - $is_space character: switch to AFTER_ALLOWED_TOKEN_STATE.
          ##   - "|" (U+007C): switch to BEFORE_ALLOWED_TOKEN_STATE.
          ##   - ")" (U+0029): switch to AFTER_ALLOWED_TOKENS_STATE.
          ##   - ">" (U+003E): 'unclosed allowed tokens' error, switch to
          ##     DOCTYPE_INTERNAL_SUBSET_STATE and return $self->{ct}.
          ##   - -1 (presumably the end-of-input marker -- confirm against
          ##     set_nc): 'unclosed md' error, switch to
          ##     DOCTYPE_INTERNAL_SUBSET_STATE and return $self->{ct}.
          ##   - anything else: appended to the last token; state is kept.
          ## Every branch advances the input before leaving: the next character
          ## is taken from char_buffer (saving line/column into line_prev/
          ## column_prev and incrementing column), or the set_nc callback is
          ## invoked once the buffer is exhausted.  A "redo A" that directly
          ## follows a "return" is never executed.
6772     if ($is_space->{$self->{nc}}) {
6773     $self->{state} = AFTER_ALLOWED_TOKEN_STATE;
6774    
6775     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6776     $self->{line_prev} = $self->{line};
6777     $self->{column_prev} = $self->{column};
6778     $self->{column}++;
6779     $self->{nc}
6780     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6781     } else {
6782     $self->{set_nc}->($self);
6783     }
6784    
6785     redo A;
6786     } elsif ($self->{nc} == 0x007C) { # |
6787     $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
6788    
6789     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6790     $self->{line_prev} = $self->{line};
6791     $self->{column_prev} = $self->{column};
6792     $self->{column}++;
6793     $self->{nc}
6794     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6795     } else {
6796     $self->{set_nc}->($self);
6797     }
6798    
6799     redo A;
6800     } elsif ($self->{nc} == 0x0029) { # )
6801     $self->{state} = AFTER_ALLOWED_TOKENS_STATE;
6802    
6803     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6804     $self->{line_prev} = $self->{line};
6805     $self->{column_prev} = $self->{column};
6806     $self->{column}++;
6807     $self->{nc}
6808     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6809     } else {
6810     $self->{set_nc}->($self);
6811     }
6812    
6813     redo A;
6814     } elsif ($self->{nc} == 0x003E) { # >
6815     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed allowed tokens'); ## TODO: type
6816     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6817    
6818     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6819     $self->{line_prev} = $self->{line};
6820     $self->{column_prev} = $self->{column};
6821     $self->{column}++;
6822     $self->{nc}
6823     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6824     } else {
6825     $self->{set_nc}->($self);
6826     }
6827    
6828     return ($self->{ct}); # ATTLIST
6829     redo A;
6830     } elsif ($self->{nc} == -1) {
6831     ## XML5: No parse error.
6832     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6833     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6834    
6835     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6836     $self->{line_prev} = $self->{line};
6837     $self->{column_prev} = $self->{column};
6838     $self->{column}++;
6839     $self->{nc}
6840     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6841     } else {
6842     $self->{set_nc}->($self);
6843     }
6844    
6845     return ($self->{ct});
6846     redo A;
6847     } else {
6848     $self->{ca}->{tokens}->[-1] .= chr $self->{nc};
6849     ## Stay in the state.
6850    
6851     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6852     $self->{line_prev} = $self->{line};
6853     $self->{column_prev} = $self->{column};
6854     $self->{column}++;
6855     $self->{nc}
6856     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6857     } else {
6858     $self->{set_nc}->($self);
6859     }
6860    
6861     redo A;
6862     }
6863     } elsif ($self->{state} == AFTER_ALLOWED_TOKEN_STATE) {
          ## State handler: whitespace has followed an allowed-token entry
          ## (ALLOWED_TOKEN_STATE switches here on an $is_space character).
          ## Dispatches on the current input character $self->{nc}:
          ##   - $is_space character: ignored, the state is kept.
          ##   - "|" (U+007C): switch to BEFORE_ALLOWED_TOKEN_STATE.
          ##   - ")" (U+0029): switch to AFTER_ALLOWED_TOKENS_STATE.
          ##   - ">" (U+003E): 'unclosed allowed tokens' error, switch to
          ##     DOCTYPE_INTERNAL_SUBSET_STATE and return $self->{ct}.
          ##   - -1 (presumably the end-of-input marker -- confirm against
          ##     set_nc): 'unclosed md' error, switch to
          ##     DOCTYPE_INTERNAL_SUBSET_STATE and return $self->{ct}.
          ##   - anything else: 'space in allowed token' error reported at the
          ##     previous position (line_prev/column_prev); a single space plus
          ##     the character are appended to the last token and the state
          ##     returns to ALLOWED_TOKEN_STATE.
          ## Every branch advances the input before leaving: the next character
          ## is taken from char_buffer (saving line/column into line_prev/
          ## column_prev and incrementing column), or the set_nc callback is
          ## invoked once the buffer is exhausted.  A "redo A" that directly
          ## follows a "return" is never executed.
6864     if ($is_space->{$self->{nc}}) {
6865     ## Stay in the state.
6866    
6867     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6868     $self->{line_prev} = $self->{line};
6869     $self->{column_prev} = $self->{column};
6870     $self->{column}++;
6871     $self->{nc}
6872     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6873     } else {
6874     $self->{set_nc}->($self);
6875     }
6876    
6877     redo A;
6878     } elsif ($self->{nc} == 0x007C) { # |
6879     $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
6880    
6881     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6882     $self->{line_prev} = $self->{line};
6883     $self->{column_prev} = $self->{column};
6884     $self->{column}++;
6885     $self->{nc}
6886     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6887     } else {
6888     $self->{set_nc}->($self);
6889     }
6890    
6891     redo A;
6892     } elsif ($self->{nc} == 0x0029) { # )
6893     $self->{state} = AFTER_ALLOWED_TOKENS_STATE;
6894    
6895     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6896     $self->{line_prev} = $self->{line};
6897     $self->{column_prev} = $self->{column};
6898     $self->{column}++;
6899     $self->{nc}
6900     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6901     } else {
6902     $self->{set_nc}->($self);
6903     }
6904    
6905     redo A;
6906     } elsif ($self->{nc} == 0x003E) { # >
6907     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed allowed tokens'); ## TODO: type
6908     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6909    
6910     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6911     $self->{line_prev} = $self->{line};
6912     $self->{column_prev} = $self->{column};
6913     $self->{column}++;
6914     $self->{nc}
6915     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6916     } else {
6917     $self->{set_nc}->($self);
6918     }
6919    
6920     return ($self->{ct}); # ATTLIST
6921     redo A;
6922     } elsif ($self->{nc} == -1) {
6923     ## XML5: No parse error.
6924     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6925     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6926    
6927     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6928     $self->{line_prev} = $self->{line};
6929     $self->{column_prev} = $self->{column};
6930     $self->{column}++;
6931     $self->{nc}
6932     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6933     } else {
6934     $self->{set_nc}->($self);
6935     }
6936    
6937     return ($self->{ct});
6938     redo A;
6939     } else {
6940     $self->{parse_error}->(level => $self->{level}->{must}, type => 'space in allowed token', ## TODO: type
6941     line => $self->{line_prev},
6942     column => $self->{column_prev});
6943     $self->{ca}->{tokens}->[-1] .= ' ' . chr $self->{nc};
6944     $self->{state} = ALLOWED_TOKEN_STATE;
6945    
6946     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6947     $self->{line_prev} = $self->{line};
6948     $self->{column_prev} = $self->{column};
6949     $self->{column}++;
6950     $self->{nc}
6951     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6952     } else {
6953     $self->{set_nc}->($self);
6954     }
6955    
6956     redo A;
6957     }
6958     } elsif ($self->{state} == AFTER_ALLOWED_TOKENS_STATE) {
          ## State handler: input right after the ")" that closes the
          ## allowed-token list.  Dispatches on the current input character
          ## $self->{nc}:
          ##   - $is_space character: switch to BEFORE_ATTR_DEFAULT_STATE.
          ##   - "#" (U+0023): 'no space before default value' error, switch to
          ##     DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE.
          ##   - '"' / "'" (U+0022 / U+0027): same error, reset
          ##     $self->{ca}->{value} to '' and switch to the double- or
          ##     single-quoted attribute value state respectively.
          ##   - ">" (U+003E): 'no attr default' error, switch to
          ##     DOCTYPE_INTERNAL_SUBSET_STATE and return $self->{ct}.
          ##   - -1 (presumably the end-of-input marker -- confirm against
          ##     set_nc): 'unclosed md' error, switch to
          ##     DOCTYPE_INTERNAL_SUBSET_STATE and return $self->{ct}.
          ##   - anything else: 'unquoted attr value' error and switch to
          ##     ATTRIBUTE_VALUE_UNQUOTED_STATE WITHOUT advancing the input, so
          ##     the same character is examined again in the new state.
          ##     NOTE(review): $self->{ca}->{value} is not initialised on this
          ##     path -- confirm the unquoted-value state copes with that.
          ## All other branches advance the input before leaving: the next
          ## character is taken from char_buffer (saving line/column into
          ## line_prev/column_prev and incrementing column), or the set_nc
          ## callback is invoked once the buffer is exhausted.  A "redo A" that
          ## directly follows a "return" is never executed.
6959     if ($is_space->{$self->{nc}}) {
6960     $self->{state} = BEFORE_ATTR_DEFAULT_STATE;
6961    
6962     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6963     $self->{line_prev} = $self->{line};
6964     $self->{column_prev} = $self->{column};
6965     $self->{column}++;
6966     $self->{nc}
6967     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6968     } else {
6969     $self->{set_nc}->($self);
6970     }
6971    
6972     redo A;
6973     } elsif ($self->{nc} == 0x0023) { # #
6974     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
6975     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE;
6976    
6977     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6978     $self->{line_prev} = $self->{line};
6979     $self->{column_prev} = $self->{column};
6980     $self->{column}++;
6981     $self->{nc}
6982     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6983     } else {
6984     $self->{set_nc}->($self);
6985     }
6986    
6987     redo A;
6988     } elsif ($self->{nc} == 0x0022) { # "
6989     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
6990     $self->{ca}->{value} = '';
6991     $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
6992    
6993     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6994     $self->{line_prev} = $self->{line};
6995     $self->{column_prev} = $self->{column};
6996     $self->{column}++;
6997     $self->{nc}
6998     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6999     } else {
7000     $self->{set_nc}->($self);
7001     }
7002    
7003     redo A;
7004     } elsif ($self->{nc} == 0x0027) { # '
7005     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
7006     $self->{ca}->{value} = '';
7007     $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
7008    
7009     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7010     $self->{line_prev} = $self->{line};
7011     $self->{column_prev} = $self->{column};
7012     $self->{column}++;
7013     $self->{nc}
7014     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7015     } else {
7016     $self->{set_nc}->($self);
7017     }
7018    
7019     redo A;
7020     } elsif ($self->{nc} == 0x003E) { # >
7021     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr default'); ## TODO: type
7022     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7023    
7024     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7025     $self->{line_prev} = $self->{line};
7026     $self->{column_prev} = $self->{column};
7027     $self->{column}++;
7028     $self->{nc}
7029     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7030     } else {
7031     $self->{set_nc}->($self);
7032     }
7033    
7034     return ($self->{ct}); # ATTLIST
7035     redo A;
7036     } elsif ($self->{nc} == -1) {
7037     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7038     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7039    
7040     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7041     $self->{line_prev} = $self->{line};
7042     $self->{column_prev} = $self->{column};
7043     $self->{column}++;
7044     $self->{nc}
7045     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7046     } else {
7047     $self->{set_nc}->($self);
7048     }
7049    
7050     return ($self->{ct});
7051     redo A;
7052     } else {
7053     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unquoted attr value'); ## TODO: type
7054     $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
7055     ## Reconsume.
7056     redo A;
7057     }
7058     } elsif ($self->{state} == BEFORE_ATTR_DEFAULT_STATE) {
          ## State handler: whitespace has followed the closed allowed-token
          ## list (AFTER_ALLOWED_TOKENS_STATE switches here on an $is_space
          ## character); the attribute default is expected next.  Dispatches on
          ## the current input character $self->{nc}:
          ##   - $is_space character: ignored, the state is kept.
          ##   - "#" (U+0023): switch to
          ##     DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE.
          ##   - '"' / "'" (U+0022 / U+0027): reset $self->{ca}->{value} to ''
          ##     and switch to the double- or single-quoted attribute value
          ##     state respectively.
          ##   - ">" (U+003E): 'no attr default' error, switch to
          ##     DOCTYPE_INTERNAL_SUBSET_STATE and return $self->{ct}.
          ##   - -1 (presumably the end-of-input marker -- confirm against
          ##     set_nc): 'unclosed md' error, switch to
          ##     DOCTYPE_INTERNAL_SUBSET_STATE and return $self->{ct}.
          ##   - anything else: 'unquoted attr value' error and switch to
          ##     ATTRIBUTE_VALUE_UNQUOTED_STATE WITHOUT advancing the input, so
          ##     the same character is examined again in the new state.
          ##     NOTE(review): $self->{ca}->{value} is not initialised on this
          ##     path -- confirm the unquoted-value state copes with that.
          ## All other branches advance the input before leaving: the next
          ## character is taken from char_buffer (saving line/column into
          ## line_prev/column_prev and incrementing column), or the set_nc
          ## callback is invoked once the buffer is exhausted.  A "redo A" that
          ## directly follows a "return" is never executed.
7059     if ($is_space->{$self->{nc}}) {
7060     ## Stay in the state.
7061    
7062     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7063     $self->{line_prev} = $self->{line};
7064     $self->{column_prev} = $self->{column};
7065     $self->{column}++;
7066     $self->{nc}
7067     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7068     } else {
7069     $self->{set_nc}->($self);
7070     }
7071    
7072     redo A;
7073     } elsif ($self->{nc} == 0x0023) { # #
7074     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE;
7075    
7076     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7077     $self->{line_prev} = $self->{line};
7078     $self->{column_prev} = $self->{column};
7079     $self->{column}++;
7080     $self->{nc}
7081     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7082     } else {
7083     $self->{set_nc}->($self);
7084     }
7085    
7086     redo A;
7087     } elsif ($self->{nc} == 0x0022) { # "
7088     $self->{ca}->{value} = '';
7089     $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
7090    
7091     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7092     $self->{line_prev} = $self->{line};
7093     $self->{column_prev} = $self->{column};
7094     $self->{column}++;
7095     $self->{nc}
7096     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7097     } else {
7098     $self->{set_nc}->($self);
7099     }
7100    
7101     redo A;
7102     } elsif ($self->{nc} == 0x0027) { # '
7103     $self->{ca}->{value} = '';
7104     $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
7105    
7106     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7107     $self->{line_prev} = $self->{line};
7108     $self->{column_prev} = $self->{column};
7109     $self->{column}++;
7110     $self->{nc}
7111     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7112     } else {
7113     $self->{set_nc}->($self);
7114     }
7115    
7116     redo A;
7117     } elsif ($self->{nc} == 0x003E) { # >
7118     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr default'); ## TODO: type
7119     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7120    
7121     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7122     $self->{line_prev} = $self->{line};
7123     $self->{column_prev} = $self->{column};
7124     $self->{column}++;
7125     $self->{nc}
7126     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7127     } else {
7128     $self->{set_nc}->($self);
7129     }
7130    
7131     return ($self->{ct}); # ATTLIST
7132     redo A;
7133     } elsif ($self->{nc} == -1) {
7134     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7135     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7136    
7137     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7138     $self->{line_prev} = $self->{line};
7139     $self->{column_prev} = $self->{column};
7140     $self->{column}++;
7141     $self->{nc}
7142     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7143     } else {
7144     $self->{set_nc}->($self);
7145     }
7146    
7147     return ($self->{ct});
7148     redo A;
7149     } else {
7150     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unquoted attr value'); ## TODO: type
7151     $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
7152     ## Reconsume.
7153     redo A;
7154     }
7155     } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE) {
7156     if ($is_space->{$self->{nc}}) {
7157     ## XML5: No parse error.
7158     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no default type'); ## TODO: type
7159 wakaba 1.16 $self->{state} = BOGUS_MD_STATE;
7160 wakaba 1.15 ## Reconsume.
7161     redo A;
7162     } elsif ($self->{nc} == 0x0022) { # "
7163     ## XML5: Same as "anything else".
7164     $self->{ca}->{value} = '';
7165     $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
7166    
7167     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7168     $self->{line_prev} = $self->{line};
7169     $self->{column_prev} = $self->{column};
7170     $self->{column}++;
7171     $self->{nc}
7172     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7173     } else {
7174     $self->{set_nc}->($self);
7175     }
7176    
7177     redo A;
7178     } elsif ($self->{nc} == 0x0027) { # '
7179     ## XML5: Same as "anything else".
7180     $self->{ca}->{value} = '';
7181     $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
7182    
7183     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7184     $self->{line_prev} = $self->{line};
7185     $self->{column_prev} = $self->{column};
7186     $self->{column}++;
7187     $self->{nc}
7188     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7189     } else {
7190     $self->{set_nc}->($self);
7191     }
7192    
7193     redo A;
7194     } elsif ($self->{nc} == 0x003E) { # >
7195     ## XML5: Same as "anything else".
7196     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr default'); ## TODO: type
7197     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7198    
7199     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7200     $self->{line_prev} = $self->{line};
7201     $self->{column_prev} = $self->{column};
7202     $self->{column}++;
7203     $self->{nc}
7204     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7205     } else {
7206     $self->{set_nc}->($self);
7207     }
7208    
7209     return ($self->{ct}); # ATTLIST
7210     redo A;
7211     } elsif ($self->{nc} == -1) {
7212     ## XML5: No parse error.
7213     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7214     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
7215    
7216     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7217     $self->{line_prev} = $self->{line};
7218     $self->{column_prev} = $self->{column};
7219     $self->{column}++;
7220     $self->{nc}
7221     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7222     } else {
7223     $self->{set_nc}->($self);
7224     }
7225    
7226     return ($self->{ct});
7227     redo A;
7228     } else {
7229     $self->{ca}->{default} = chr $self->{nc};
7230     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_STATE;
7231    
7232     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7233     $self->{line_prev} = $self->{line};
7234     $self->{column_prev} = $self->{column};
7235     $self->{column}++;
7236     $self->{nc}
7237     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7238     } else {
7239     $self->{set_nc}->($self);
7240     }
7241    
7242     redo A;
7243     }
7244     } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_STATE) {
7245     if ($is_space->{$self->{nc}}) {
7246     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_AFTER_STATE;
7247    
7248     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7249     $self->{line_prev} = $self->{line};
7250     $self->{column_prev} = $self->{column};
7251     $self->{column}++;
7252     $self->{nc}
7253     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7254     } else {
7255     $self->{set_nc}->($self);
7256     }
7257    
7258     redo A;
7259     } elsif ($self->{nc} == 0x0022) { # "
7260     ## XML5: Same as "anything else".
7261     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
7262     $self->{ca}->{value} = '';
7263     $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
7264    
7265     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7266     $self->{line_prev} = $self->{line};
7267     $self->{column_prev} = $self->{column};
7268     $self->{column}++;
7269     $self->{nc}
7270     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7271     } else {
7272     $self->{set_nc}->($self);
7273     }
7274    
7275     redo A;
7276     } elsif ($self->{nc} == 0x0027) { # '
7277     ## XML5: Same as "anything else".
7278     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
7279     $self->{ca}->{value} = '';
7280     $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
7281    
7282     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7283     $self->{line_prev} = $self->{line};
7284     $self->{column_prev} = $self->{column};
7285     $self->{column}++;
7286     $self->{nc}
7287     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7288     } else {
7289     $self->{set_nc}->($self);
7290     }
7291    
7292     redo A;
7293     } elsif ($self->{nc} == 0x003E) { # >
7294     ## XML5: Same as "anything else".
7295     push @{$self->{ct}->{attrdefs}}, $self->{ca};
7296     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7297    
7298     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7299     $self->{line_prev} = $self->{line};
7300     $self->{column_prev} = $self->{column};
7301     $self->{column}++;
7302     $self->{nc}
7303     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7304     } else {
7305     $self->{set_nc}->($self);
7306     }
7307    
7308     return ($self->{ct}); # ATTLIST
7309     redo A;
7310     } elsif ($self->{nc} == -1) {
7311     ## XML5: No parse error.
7312     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7313     push @{$self->{ct}->{attrdefs}}, $self->{ca};
7314     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
7315    
7316     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7317     $self->{line_prev} = $self->{line};
7318     $self->{column_prev} = $self->{column};
7319     $self->{column}++;
7320     $self->{nc}
7321     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7322     } else {
7323     $self->{set_nc}->($self);
7324     }
7325    
7326     return ($self->{ct});
7327     redo A;
7328     } else {
7329     $self->{ca}->{default} .= chr $self->{nc};
7330     ## Stay in the state.
7331    
7332     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7333     $self->{line_prev} = $self->{line};
7334     $self->{column_prev} = $self->{column};
7335     $self->{column}++;
7336     $self->{nc}
7337     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7338     } else {
7339     $self->{set_nc}->($self);
7340     }
7341    
7342     redo A;
7343     }
7344     } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_AFTER_STATE) {
7345     if ($is_space->{$self->{nc}}) {
7346     ## Stay in the state.
7347    
7348     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7349     $self->{line_prev} = $self->{line};
7350     $self->{column_prev} = $self->{column};
7351     $self->{column}++;
7352     $self->{nc}
7353     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7354     } else {
7355     $self->{set_nc}->($self);
7356     }
7357    
7358     redo A;
7359     } elsif ($self->{nc} == 0x0022) { # "
7360     $self->{ca}->{value} = '';
7361     $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
7362    
7363     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7364     $self->{line_prev} = $self->{line};
7365     $self->{column_prev} = $self->{column};
7366     $self->{column}++;
7367     $self->{nc}
7368     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7369     } else {
7370     $self->{set_nc}->($self);
7371     }
7372    
7373     redo A;
7374     } elsif ($self->{nc} == 0x0027) { # '
7375     $self->{ca}->{value} = '';
7376     $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
7377    
7378     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7379     $self->{line_prev} = $self->{line};
7380     $self->{column_prev} = $self->{column};
7381     $self->{column}++;
7382     $self->{nc}
7383     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7384     } else {
7385     $self->{set_nc}->($self);
7386     }
7387    
7388     redo A;
7389     } elsif ($self->{nc} == 0x003E) { # >
7390     push @{$self->{ct}->{attrdefs}}, $self->{ca};
7391     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7392    
7393     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7394     $self->{line_prev} = $self->{line};
7395     $self->{column_prev} = $self->{column};
7396     $self->{column}++;
7397     $self->{nc}
7398     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7399     } else {
7400     $self->{set_nc}->($self);
7401     }
7402    
7403     return ($self->{ct}); # ATTLIST
7404     redo A;
7405     } elsif ($self->{nc} == -1) {
7406     ## XML5: No parse error.
7407     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7408     push @{$self->{ct}->{attrdefs}}, $self->{ca};
7409     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
7410    
7411     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7412     $self->{line_prev} = $self->{line};
7413     $self->{column_prev} = $self->{column};
7414     $self->{column}++;
7415     $self->{nc}
7416     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7417     } else {
7418     $self->{set_nc}->($self);
7419     }
7420    
7421     return ($self->{ct});
7422     redo A;
7423     } else {
7424     ## XML5: Not defined yet.
7425     if ($self->{ca}->{default} eq 'FIXED') {
7426     $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
7427     } else {
7428     push @{$self->{ct}->{attrdefs}}, $self->{ca};
7429     $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
7430     }
7431     ## Reconsume.
7432     redo A;
7433     }
7434     } elsif ($self->{state} == AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE) {
7435     if ($is_space->{$self->{nc}} or
7436     $self->{nc} == -1 or
7437     $self->{nc} == 0x003E) { # >
7438     $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
7439     ## Reconsume.
7440     redo A;
7441     } else {
7442     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before attr name'); ## TODO: type
7443     $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
7444     ## Reconsume.
7445     redo A;
7446 wakaba 1.16 }
7447 wakaba 1.18 } elsif ($self->{state} == NDATA_STATE) {
7448     ## ASCII case-insensitive
7449     if ($self->{nc} == [
7450     undef,
7451     0x0044, # D
7452     0x0041, # A
7453     0x0054, # T
7454     ]->[length $self->{kwd}] or
7455     $self->{nc} == [
7456     undef,
7457     0x0064, # d
7458     0x0061, # a
7459     0x0074, # t
7460     ]->[length $self->{kwd}]) {
7461    
7462     ## Stay in the state.
7463     $self->{kwd} .= chr $self->{nc};
7464    
7465     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7466     $self->{line_prev} = $self->{line};
7467     $self->{column_prev} = $self->{column};
7468     $self->{column}++;
7469     $self->{nc}
7470     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7471     } else {
7472     $self->{set_nc}->($self);
7473     }
7474    
7475     redo A;
7476     } elsif ((length $self->{kwd}) == 4 and
7477     ($self->{nc} == 0x0041 or # A
7478     $self->{nc} == 0x0061)) { # a
7479     if ($self->{kwd} ne 'NDAT' or $self->{nc} == 0x0061) { # a
7480    
7481     $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
7482     text => 'NDATA',
7483     line => $self->{line_prev},
7484     column => $self->{column_prev} - 4);
7485     } else {
7486    
7487     }
7488     $self->{state} = AFTER_NDATA_STATE;
7489    
7490     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7491     $self->{line_prev} = $self->{line};
7492     $self->{column_prev} = $self->{column};
7493     $self->{column}++;
7494     $self->{nc}
7495     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7496     } else {
7497     $self->{set_nc}->($self);
7498     }
7499    
7500     redo A;
7501     } else {
7502     $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after literal', ## TODO: type
7503     line => $self->{line_prev},
7504     column => $self->{column_prev} + 1
7505     - length $self->{kwd});
7506    
7507     $self->{state} = BOGUS_MD_STATE;
7508     ## Reconsume.
7509     redo A;
7510     }
7511     } elsif ($self->{state} == AFTER_NDATA_STATE) {
7512     if ($is_space->{$self->{nc}}) {
7513     $self->{state} = BEFORE_NOTATION_NAME_STATE;
7514    
7515     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7516     $self->{line_prev} = $self->{line};
7517     $self->{column_prev} = $self->{column};
7518     $self->{column}++;
7519     $self->{nc}
7520     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7521     } else {
7522     $self->{set_nc}->($self);
7523     }
7524    
7525     redo A;
7526     } elsif ($self->{nc} == 0x003E) { # >
7527     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no notation name'); ## TODO: type
7528     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7529    
7530     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7531     $self->{line_prev} = $self->{line};
7532     $self->{column_prev} = $self->{column};
7533     $self->{column}++;
7534     $self->{nc}
7535     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7536     } else {
7537     $self->{set_nc}->($self);
7538     }
7539    
7540     return ($self->{ct}); # ENTITY
7541     redo A;
7542     } elsif ($self->{nc} == -1) {
7543     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7544     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7545    
7546     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7547     $self->{line_prev} = $self->{line};
7548     $self->{column_prev} = $self->{column};
7549     $self->{column}++;
7550     $self->{nc}
7551     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7552     } else {
7553     $self->{set_nc}->($self);
7554     }
7555    
7556     return ($self->{ct}); # ENTITY
7557     redo A;
7558     } else {
7559     $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after literal', ## TODO: type
7560     line => $self->{line_prev},
7561     column => $self->{column_prev} + 1
7562     - length $self->{kwd});
7563     $self->{state} = BOGUS_MD_STATE;
7564     ## Reconsume.
7565     redo A;
7566     }
7567     } elsif ($self->{state} == BEFORE_NOTATION_NAME_STATE) {
7568     if ($is_space->{$self->{nc}}) {
7569     ## Stay in the state.
7570    
7571     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7572     $self->{line_prev} = $self->{line};
7573     $self->{column_prev} = $self->{column};
7574     $self->{column}++;
7575     $self->{nc}
7576     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7577     } else {
7578     $self->{set_nc}->($self);
7579     }
7580    
7581     redo A;
7582     } elsif ($self->{nc} == 0x003E) { # >
7583     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no notation name'); ## TODO: type
7584     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7585    
7586     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7587     $self->{line_prev} = $self->{line};
7588     $self->{column_prev} = $self->{column};
7589     $self->{column}++;
7590     $self->{nc}
7591     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7592     } else {
7593     $self->{set_nc}->($self);
7594     }
7595    
7596     return ($self->{ct}); # ENTITY
7597     redo A;
7598     } elsif ($self->{nc} == -1) {
7599     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7600     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7601    
7602     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7603     $self->{line_prev} = $self->{line};
7604     $self->{column_prev} = $self->{column};
7605     $self->{column}++;
7606     $self->{nc}
7607     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7608     } else {
7609     $self->{set_nc}->($self);
7610     }
7611    
7612     return ($self->{ct}); # ENTITY
7613     redo A;
7614     } else {
7615     $self->{ct}->{notation} = chr $self->{nc}; # ENTITY
7616     $self->{state} = NOTATION_NAME_STATE;
7617    
7618     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7619     $self->{line_prev} = $self->{line};
7620     $self->{column_prev} = $self->{column};
7621     $self->{column}++;
7622     $self->{nc}
7623     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7624     } else {
7625     $self->{set_nc}->($self);
7626     }
7627    
7628     redo A;
7629     }
7630     } elsif ($self->{state} == NOTATION_NAME_STATE) {
7631     if ($is_space->{$self->{nc}}) {
7632     $self->{state} = AFTER_NOTATION_NAME_STATE;
7633    
7634     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7635     $self->{line_prev} = $self->{line};
7636     $self->{column_prev} = $self->{column};
7637     $self->{column}++;
7638     $self->{nc}
7639     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7640     } else {
7641     $self->{set_nc}->($self);
7642     }
7643    
7644     redo A;
7645     } elsif ($self->{nc} == 0x003E) { # >
7646     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7647    
7648     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7649     $self->{line_prev} = $self->{line};
7650     $self->{column_prev} = $self->{column};
7651     $self->{column}++;
7652     $self->{nc}
7653     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7654     } else {
7655     $self->{set_nc}->($self);
7656     }
7657    
7658     return ($self->{ct}); # ENTITY
7659     redo A;
7660     } elsif ($self->{nc} == -1) {
7661     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7662     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7663    
7664     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7665     $self->{line_prev} = $self->{line};
7666     $self->{column_prev} = $self->{column};
7667     $self->{column}++;
7668     $self->{nc}
7669     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7670     } else {
7671     $self->{set_nc}->($self);
7672     }
7673    
7674     return ($self->{ct}); # ENTITY
7675     redo A;
7676     } else {
7677     $self->{ct}->{notation} .= chr $self->{nc}; # ENTITY
7678     ## Stay in the state.
7679    
7680     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7681     $self->{line_prev} = $self->{line};
7682     $self->{column_prev} = $self->{column};
7683     $self->{column}++;
7684     $self->{nc}
7685     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7686     } else {
7687     $self->{set_nc}->($self);
7688     }
7689    
7690     redo A;
7691     }
7692     } elsif ($self->{state} == AFTER_NOTATION_NAME_STATE) {
7693     if ($is_space->{$self->{nc}}) {
7694     ## Stay in the state.
7695    
7696     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7697     $self->{line_prev} = $self->{line};
7698     $self->{column_prev} = $self->{column};
7699     $self->{column}++;
7700     $self->{nc}
7701     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7702     } else {
7703     $self->{set_nc}->($self);
7704     }
7705    
7706     redo A;
7707     } elsif ($self->{nc} == 0x003E) { # >
7708     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7709    
7710     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7711     $self->{line_prev} = $self->{line};
7712     $self->{column_prev} = $self->{column};
7713     $self->{column}++;
7714     $self->{nc}
7715     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7716     } else {
7717     $self->{set_nc}->($self);
7718     }
7719    
7720     return ($self->{ct}); # ENTITY
7721     redo A;
7722     } elsif ($self->{nc} == -1) {
7723     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7724     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7725    
7726     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7727     $self->{line_prev} = $self->{line};
7728     $self->{column_prev} = $self->{column};
7729     $self->{column}++;
7730     $self->{nc}
7731     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7732     } else {
7733     $self->{set_nc}->($self);
7734     }
7735    
7736     return ($self->{ct}); # ENTITY
7737     redo A;
7738     } else {
7739     $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after notation name'); ## TODO: type
7740     $self->{state} = BOGUS_MD_STATE;
7741     ## Reconsume.
7742     redo A;
7743     }
7744    
7745 wakaba 1.16
7746     } elsif ($self->{state} == BOGUS_MD_STATE) {
7747     if ($self->{nc} == 0x003E) { # >
7748     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7749    
7750     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7751     $self->{line_prev} = $self->{line};
7752     $self->{column_prev} = $self->{column};
7753     $self->{column}++;
7754     $self->{nc}
7755     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7756     } else {
7757     $self->{set_nc}->($self);
7758     }
7759    
7760     return ($self->{ct}); # ATTLIST/ENTITY/NOTATION
7761     redo A;
7762     } elsif ($self->{nc} == -1) {
7763     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7764     ## Reconsume.
7765     return ($self->{ct}); # ATTLIST/ENTITY/NOTATION
7766     redo A;
7767     } else {
7768     ## Stay in the state.
7769    
7770     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7771     $self->{line_prev} = $self->{line};
7772     $self->{column_prev} = $self->{column};
7773     $self->{column}++;
7774     $self->{nc}
7775     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7776     } else {
7777     $self->{set_nc}->($self);
7778     }
7779    
7780     redo A;
7781     }
7782 wakaba 1.1 } else {
7783     die "$0: $self->{state}: Unknown state";
7784     }
7785     } # A
7786    
7787     die "$0: _get_next_token: unexpected case";
7788     } # _get_next_token
7789    
7790     1;
7791 wakaba 1.18 ## $Date: 2008/10/19 04:39:25 $
7792 wakaba 1.15

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24