/[suikacvs]/markup/html/whatpm/Whatpm/HTML/Tokenizer.pm
Suika

Contents of /markup/html/whatpm/Whatpm/HTML/Tokenizer.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.19 - (hide annotations) (download)
Sun Oct 19 07:19:00 2008 UTC (16 years, 8 months ago) by wakaba
Branch: MAIN
Changes since 1.18: +191 -6 lines
++ whatpm/t/ChangeLog	19 Oct 2008 07:18:24 -0000
	* XML-Parser.t: Typo fixed.

2008-10-19  Wakaba  <wakaba@suika.fam.cx>

++ whatpm/t/xml/ChangeLog	19 Oct 2008 07:18:52 -0000
	* entities-1.dat, entities-2.dat: EntityValue tests added.

2008-10-19  Wakaba  <wakaba@suika.fam.cx>

++ whatpm/Whatpm/ChangeLog	19 Oct 2008 07:17:36 -0000
	* NanoDOM.pm (Entity->new): Initialize ->child_nodes as an empty
	array.

2008-10-19  Wakaba  <wakaba@suika.fam.cx>

++ whatpm/Whatpm/HTML/ChangeLog	19 Oct 2008 07:18:01 -0000
	* Tokenizer.pm.src: Support for EntityValue.

2008-10-19  Wakaba  <wakaba@suika.fam.cx>

1 wakaba 1.1 package Whatpm::HTML::Tokenizer;
2     use strict;
## $VERSION is derived from the CVS "Revision" keyword string below: its
## digit groups are extracted and formatted as "%d.%02d" (e.g. "1.18").
3 wakaba 1.19 our $VERSION=do{my @r=(q$Revision: 1.18 $=~/\d+/g);sprintf "%d."."%02d" x $#r,@r};
4 wakaba 1.2
## Export setup.  This is done in a BEGIN block so that @ISA, @EXPORT_OK
## and %EXPORT_TAGS are already populated when the later
## |BEGIN { Whatpm::HTML::Tokenizer->import (':token') }| in this file
## runs at compile time.
5     BEGIN {
6     require Exporter;
7     push our @ISA, 'Exporter';
8    
## Names that may be imported individually on request.
9     our @EXPORT_OK = qw(
10     DOCTYPE_TOKEN
11     COMMENT_TOKEN
12     START_TAG_TOKEN
13     END_TAG_TOKEN
14     END_OF_FILE_TOKEN
15     CHARACTER_TOKEN
16     PI_TOKEN
17     ABORT_TOKEN
18 wakaba 1.13 END_OF_DOCTYPE_TOKEN
19 wakaba 1.14 ATTLIST_TOKEN
20     ELEMENT_TOKEN
21     GENERAL_ENTITY_TOKEN
22     PARAMETER_ENTITY_TOKEN
23     NOTATION_TOKEN
24 wakaba 1.2 );
25    
## The |:token| tag exports the same set of names as @EXPORT_OK above;
## the two lists are written out separately and must be kept in sync.
26     our %EXPORT_TAGS = (
27     token => [qw(
28     DOCTYPE_TOKEN
29     COMMENT_TOKEN
30     START_TAG_TOKEN
31     END_TAG_TOKEN
32     END_OF_FILE_TOKEN
33     CHARACTER_TOKEN
34     PI_TOKEN
35     ABORT_TOKEN
36 wakaba 1.13 END_OF_DOCTYPE_TOKEN
37 wakaba 1.14 ATTLIST_TOKEN
38     ELEMENT_TOKEN
39     GENERAL_ENTITY_TOKEN
40     PARAMETER_ENTITY_TOKEN
41     NOTATION_TOKEN
42 wakaba 1.2 )],
43     );
44     }
45    
46 wakaba 1.12 ## NOTE: Differences from the XML5 draft are marked as "XML5:".
47    
48 wakaba 1.2 ## Token types
49    
## Values for a token's |->{type}| field.  Each one is a constant sub with
## an empty prototype (so Perl can inline it) returning a distinct integer.
50 wakaba 1.12 sub DOCTYPE_TOKEN () { 1 } ## XML5: No DOCTYPE token.
51 wakaba 1.2 sub COMMENT_TOKEN () { 2 }
52     sub START_TAG_TOKEN () { 3 }
53     sub END_TAG_TOKEN () { 4 }
54     sub END_OF_FILE_TOKEN () { 5 }
55     sub CHARACTER_TOKEN () { 6 }
56 wakaba 1.12 sub PI_TOKEN () { 7 } ## NOTE: XML only.
57     sub ABORT_TOKEN () { 8 } ## NOTE: For internal processing.
58 wakaba 1.14 sub END_OF_DOCTYPE_TOKEN () { 9 } ## NOTE: XML only.
59     sub ATTLIST_TOKEN () { 10 } ## NOTE: XML only.
60     sub ELEMENT_TOKEN () { 11 } ## NOTE: XML only.
61     sub GENERAL_ENTITY_TOKEN () { 12 } ## NOTE: XML only.
62     sub PARAMETER_ENTITY_TOKEN () { 13 } ## NOTE: XML only.
63     sub NOTATION_TOKEN () { 14 } ## NOTE: XML only.
64 wakaba 1.12
65     ## XML5: XML5 has "empty tag token". In this implementation, it is
66     ## represented as a start tag token with $self->{self_closing} flag
67     ## set to true.
68    
69     ## XML5: XML5 has "short end tag token". In this implementation, it
70     ## is represented as an end tag token with $token->{tag_name} set to
71     ## an empty string.
72 wakaba 1.1
## From here on, subs are defined in the Whatpm::HTML package rather than
## in Whatpm::HTML::Tokenizer.
73     package Whatpm::HTML;
74    
## Import the token type constants at compile time so that the code below
## can refer to them by their bare names.
75 wakaba 1.2 BEGIN { Whatpm::HTML::Tokenizer->import (':token') }
76    
77 wakaba 1.1 ## Content model flags
78    
## The CM_* constants are single-bit flags.  The *_CONTENT_MODEL constants
## are combinations of those flags and are what |$self->{content_model}|
## holds; the tokenizer tests individual capabilities with |&|.
79     sub CM_ENTITY () { 0b001 } # & markup in data
80     sub CM_LIMITED_MARKUP () { 0b010 } # < markup in data (limited)
81     sub CM_FULL_MARKUP () { 0b100 } # < markup in data (any)
82    
83     sub PLAINTEXT_CONTENT_MODEL () { 0 }
84     sub CDATA_CONTENT_MODEL () { CM_LIMITED_MARKUP }
85     sub RCDATA_CONTENT_MODEL () { CM_ENTITY | CM_LIMITED_MARKUP }
86     sub PCDATA_CONTENT_MODEL () { CM_ENTITY | CM_FULL_MARKUP }
87    
88     ## Tokenizer states
89    
## Each state is an integer constant that is stored in |$self->{state}|
## and compared with |==| by |_get_next_token|.  The values 1 and 12 are
## unused: the states they belonged to are commented out below and are
## covered by the ENTITY_STATE family instead (see the NOTE further down).
90     sub DATA_STATE () { 0 }
91     #sub ENTITY_DATA_STATE () { 1 }
92     sub TAG_OPEN_STATE () { 2 }
93     sub CLOSE_TAG_OPEN_STATE () { 3 }
94     sub TAG_NAME_STATE () { 4 }
95     sub BEFORE_ATTRIBUTE_NAME_STATE () { 5 }
96     sub ATTRIBUTE_NAME_STATE () { 6 }
97     sub AFTER_ATTRIBUTE_NAME_STATE () { 7 }
98     sub BEFORE_ATTRIBUTE_VALUE_STATE () { 8 }
99     sub ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE () { 9 }
100     sub ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE () { 10 }
101     sub ATTRIBUTE_VALUE_UNQUOTED_STATE () { 11 }
102     #sub ENTITY_IN_ATTRIBUTE_VALUE_STATE () { 12 }
103     sub MARKUP_DECLARATION_OPEN_STATE () { 13 }
104     sub COMMENT_START_STATE () { 14 }
105     sub COMMENT_START_DASH_STATE () { 15 }
106     sub COMMENT_STATE () { 16 }
107     sub COMMENT_END_STATE () { 17 }
108     sub COMMENT_END_DASH_STATE () { 18 }
109     sub BOGUS_COMMENT_STATE () { 19 }
110     sub DOCTYPE_STATE () { 20 }
111     sub BEFORE_DOCTYPE_NAME_STATE () { 21 }
112     sub DOCTYPE_NAME_STATE () { 22 }
113     sub AFTER_DOCTYPE_NAME_STATE () { 23 }
114     sub BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE () { 24 }
115     sub DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE () { 25 }
116     sub DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE () { 26 }
117     sub AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE () { 27 }
118     sub BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE () { 28 }
119     sub DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE () { 29 }
120     sub DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE () { 30 }
121     sub AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE () { 31 }
122     sub BOGUS_DOCTYPE_STATE () { 32 }
123     sub AFTER_ATTRIBUTE_VALUE_QUOTED_STATE () { 33 }
124     sub SELF_CLOSING_START_TAG_STATE () { 34 }
125     sub CDATA_SECTION_STATE () { 35 }
126     sub MD_HYPHEN_STATE () { 36 } # "markup declaration open state" in the spec
127     sub MD_DOCTYPE_STATE () { 37 } # "markup declaration open state" in the spec
128     sub MD_CDATA_STATE () { 38 } # "markup declaration open state" in the spec
129     sub CDATA_RCDATA_CLOSE_TAG_STATE () { 39 } # "close tag open state" in the spec
130     sub CDATA_SECTION_MSE1_STATE () { 40 } # "CDATA section state" in the spec
131     sub CDATA_SECTION_MSE2_STATE () { 41 } # "CDATA section state" in the spec
132     sub PUBLIC_STATE () { 42 } # "after DOCTYPE name state" in the spec
133     sub SYSTEM_STATE () { 43 } # "after DOCTYPE name state" in the spec
134     ## NOTE: "Entity data state", "entity in attribute value state", and
135     ## "consume a character reference" algorithm are jointly implemented
136     ## using the following six states:
137     sub ENTITY_STATE () { 44 }
138     sub ENTITY_HASH_STATE () { 45 }
139     sub NCR_NUM_STATE () { 46 }
140     sub HEXREF_X_STATE () { 47 }
141     sub HEXREF_HEX_STATE () { 48 }
142     sub ENTITY_NAME_STATE () { 49 }
143     sub PCDATA_STATE () { 50 } # "data state" in the spec
144    
145 wakaba 1.12 ## XML-only states
146 wakaba 1.8 sub PI_STATE () { 51 }
147     sub PI_TARGET_STATE () { 52 }
148     sub PI_TARGET_AFTER_STATE () { 53 }
149     sub PI_DATA_STATE () { 54 }
150     sub PI_AFTER_STATE () { 55 }
151     sub PI_DATA_AFTER_STATE () { 56 }
152 wakaba 1.12 sub DOCTYPE_INTERNAL_SUBSET_STATE () { 57 }
153     sub DOCTYPE_INTERNAL_SUBSET_AFTER_STATE () { 58 }
154 wakaba 1.14 sub BOGUS_DOCTYPE_INTERNAL_SUBSET_AFTER_STATE () { 59 }
155     sub DOCTYPE_TAG_STATE () { 60 }
156     sub DOCTYPE_MARKUP_DECLARATION_OPEN_STATE () { 61 }
157     sub MD_ATTLIST_STATE () { 62 }
158     sub MD_E_STATE () { 63 }
159     sub MD_ELEMENT_STATE () { 64 }
160     sub MD_ENTITY_STATE () { 65 }
161     sub MD_NOTATION_STATE () { 66 }
162     sub DOCTYPE_MD_STATE () { 67 }
163     sub BEFORE_MD_NAME_STATE () { 68 }
164     sub MD_NAME_STATE () { 69 }
165     sub DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE () { 70 }
166     sub DOCTYPE_ATTLIST_NAME_AFTER_STATE () { 71 }
167 wakaba 1.15 sub DOCTYPE_ATTLIST_ATTRIBUTE_NAME_STATE () { 72 }
168     sub DOCTYPE_ATTLIST_ATTRIBUTE_NAME_AFTER_STATE () { 73 }
169     sub DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_STATE () { 74 }
170     sub DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_AFTER_STATE () { 75 }
171     sub BEFORE_ALLOWED_TOKEN_STATE () { 76 }
172     sub ALLOWED_TOKEN_STATE () { 77 }
173     sub AFTER_ALLOWED_TOKEN_STATE () { 78 }
174     sub AFTER_ALLOWED_TOKENS_STATE () { 79 }
175     sub BEFORE_ATTR_DEFAULT_STATE () { 80 }
176     sub DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE () { 81 }
177     sub DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_STATE () { 82 }
178     sub DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_AFTER_STATE () { 83 }
179     sub AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE () { 84 }
180 wakaba 1.18 sub BEFORE_NDATA_STATE () { 85 }
181     sub NDATA_STATE () { 86 }
182     sub AFTER_NDATA_STATE () { 87 }
183     sub BEFORE_NOTATION_NAME_STATE () { 88 }
184     sub NOTATION_NAME_STATE () { 89 }
185     sub AFTER_NOTATION_NAME_STATE () { 90 }
186 wakaba 1.19 sub DOCTYPE_ENTITY_VALUE_DOUBLE_QUOTED_STATE () { 91 }
187     sub DOCTYPE_ENTITY_VALUE_SINGLE_QUOTED_STATE () { 92 }
188     sub ENTITY_VALUE_ENTITY_STATE () { 93 }
189     sub BOGUS_MD_STATE () { 94 }
190 wakaba 1.8
191 wakaba 1.1 ## Tree constructor state constants (see Whatpm::HTML for the full
192     ## list and descriptions)
193    
## NOTE: Both constants have the same numeric value (0b100000000000); the
## second literal only differs by using a |_| digit separator.
## NOTE(review): presumably these mirror the definitions in Whatpm::HTML
## and must be kept equal to them -- confirm against that module.
194     sub IN_FOREIGN_CONTENT_IM () { 0b100000000000 }
195     sub FOREIGN_EL () { 0b1_00000000000 }
196    
197     ## Character reference mappings
198    
## Code point replacement table: key = original code point, value = the
## code point to use instead.  CR (0x0D) maps to LF, and 0x80..0x9F map to
## the characters Windows-1252 assigns to those bytes (U+FFFD where
## Windows-1252 leaves the byte undefined).
## NOTE(review): presumably consulted when resolving numeric character
## references; the code that uses it is not in this part of the file.
199     my $charref_map = {
200     0x0D => 0x000A,
201     0x80 => 0x20AC,
202     0x81 => 0xFFFD,
203     0x82 => 0x201A,
204     0x83 => 0x0192,
205     0x84 => 0x201E,
206     0x85 => 0x2026,
207     0x86 => 0x2020,
208     0x87 => 0x2021,
209     0x88 => 0x02C6,
210     0x89 => 0x2030,
211     0x8A => 0x0160,
212     0x8B => 0x2039,
213     0x8C => 0x0152,
214     0x8D => 0xFFFD,
215     0x8E => 0x017D,
216     0x8F => 0xFFFD,
217     0x90 => 0xFFFD,
218     0x91 => 0x2018,
219     0x92 => 0x2019,
220     0x93 => 0x201C,
221     0x94 => 0x201D,
222     0x95 => 0x2022,
223     0x96 => 0x2013,
224     0x97 => 0x2014,
225     0x98 => 0x02DC,
226     0x99 => 0x2122,
227     0x9A => 0x0161,
228     0x9B => 0x203A,
229     0x9C => 0x0153,
230     0x9D => 0xFFFD,
231     0x9E => 0x017E,
232     0x9F => 0x0178,
233     }; # $charref_map
## Additionally map the listed control characters, DEL, the surrogate
## range and the listed noncharacters to U+FFFD.
## NOTE(review): Unicode's noncharacter block is U+FDD0..U+FDEF, but the
## range below stops at U+FDDF; that is what the ISSUE marker refers to.
## Confirm against the spec revision being implemented before changing it.
234     $charref_map->{$_} = 0xFFFD
235     for 0x0000..0x0008, 0x000B, 0x000E..0x001F, 0x007F,
236     0xD800..0xDFFF, 0xFDD0..0xFDDF, ## ISSUE: 0xFDEF
237     0xFFFE, 0xFFFF, 0x1FFFE, 0x1FFFF, 0x2FFFE, 0x2FFFF, 0x3FFFE, 0x3FFFF,
238     0x4FFFE, 0x4FFFF, 0x5FFFE, 0x5FFFF, 0x6FFFE, 0x6FFFF, 0x7FFFE,
239     0x7FFFF, 0x8FFFE, 0x8FFFF, 0x9FFFE, 0x9FFFF, 0xAFFFE, 0xAFFFF,
240     0xBFFFE, 0xBFFFF, 0xCFFFE, 0xCFFFF, 0xDFFFE, 0xDFFFF, 0xEFFFE,
241     0xEFFFF, 0xFFFFE, 0xFFFFF, 0x10FFFE, 0x10FFFF;
242    
243     ## Implementations MUST act as if they used the state machine defined in the spec.
244    
## Reset the tokenizer fields of the parser object to their initial
## values and read the first input character.
##
## Argument: $self, the parser object (a hash reference).  The fields
## listed in the NOTE below are not touched here and are expected to be
## already set.  There is no explicit return value.
245     sub _initialize_tokenizer ($) {
246     my $self = shift;
247    
248     ## NOTE: Fields set by |new| constructor:
249     #$self->{level}
250     #$self->{set_nc}
251     #$self->{parse_error}
252 wakaba 1.3 #$self->{is_xml} (if XML)
253 wakaba 1.1
254     $self->{state} = DATA_STATE; # MUST
255 wakaba 1.12 $self->{s_kwd} = ''; # Data state keyword
256     #$self->{kwd} = ''; # State-dependent keyword; initialized when used
257 wakaba 1.1 #$self->{entity__value}; # initialized when used
258     #$self->{entity__match}; # initialized when used
259     $self->{content_model} = PCDATA_CONTENT_MODEL; # be
260     undef $self->{ct}; # current token
261     undef $self->{ca}; # current attribute
262     undef $self->{last_stag_name}; # last emitted start tag name
263     #$self->{prev_state}; # initialized when used
264     delete $self->{self_closing};
265     $self->{char_buffer} = '';
266     $self->{char_buffer_pos} = 0;
## -1 is the value |_get_next_token| treats as end of input.
267     $self->{nc} = -1; # next input character
268     #$self->{next_nc}
269    
## Read the first input character.  This is the same "read next input
## character" sequence that is repeated throughout |_get_next_token|.
## Because |char_buffer| was emptied just above, the condition is false
## here and the |set_nc| callback is what actually runs.
## NOTE(review): |set_nc| presumably stores the character in
## |$self->{nc}| -- confirm in the constructor.
270     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
271     $self->{line_prev} = $self->{line};
272     $self->{column_prev} = $self->{column};
273     $self->{column}++;
274     $self->{nc}
275     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
276     } else {
277     $self->{set_nc}->($self);
278     }
279    
## Queue of pending tokens; |_get_next_token| returns from this queue
## first whenever it is non-empty.
280     $self->{token} = [];
281     # $self->{escape}
282     } # _initialize_tokenizer
283    
284     ## A token has:
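285     ## ->{type} == DOCTYPE_TOKEN, START_TAG_TOKEN, END_TAG_TOKEN, COMMENT_TOKEN,
286 wakaba 1.11 ## CHARACTER_TOKEN, END_OF_FILE_TOKEN, PI_TOKEN, ABORT_TOKEN, or one of
## the XML-only token types (END_OF_DOCTYPE_TOKEN, ATTLIST_TOKEN,
## ELEMENT_TOKEN, GENERAL_ENTITY_TOKEN, PARAMETER_ENTITY_TOKEN,
## NOTATION_TOKEN)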
287 wakaba 1.1 ## ->{name} (DOCTYPE_TOKEN)
288     ## ->{tag_name} (START_TAG_TOKEN, END_TAG_TOKEN)
289 wakaba 1.11 ## ->{target} (PI_TOKEN)
290 wakaba 1.1 ## ->{pubid} (DOCTYPE_TOKEN)
291     ## ->{sysid} (DOCTYPE_TOKEN)
292     ## ->{quirks} == 1 or 0 (DOCTYPE_TOKEN): "force-quirks" flag
293     ## ->{attributes} isa HASH (START_TAG_TOKEN, END_TAG_TOKEN)
294     ## ->{name}
295     ## ->{value}
296     ## ->{has_reference} == 1 or 0
297 wakaba 1.11 ## ->{index}: Index of the attribute in a tag.
298     ## ->{data} (COMMENT_TOKEN, CHARACTER_TOKEN, PI_TOKEN)
299 wakaba 1.7 ## ->{has_reference} == 1 or 0 (CHARACTER_TOKEN)
300 wakaba 1.11 ## ->{last_index} (ELEMENT_TOKEN): Next attribute's index - 1.
301 wakaba 1.12 ## ->{has_internal_subset} = 1 or 0 (DOCTYPE_TOKEN)
302    
303 wakaba 1.1 ## NOTE: The "self-closing flag" is held as |$self->{self_closing}|.
304     ## A token's |->{self_closing}| field is used to save the value of
305     ## |$self->{self_closing}| while the token is pushed back to the stack.
306    
307     ## Emitted token MUST immediately be handled by the tree construction state.
308    
309     ## Before each step, UA MAY check to see if either one of the scripts in
310     ## "list of scripts that will execute as soon as possible" or the first
311     ## script in the "list of scripts that will execute asynchronously",
312     ## has completed loading. If one has, then it MUST be executed
313     ## and removed from the list.
314    
315     ## TODO: Polytheistic slash SHOULD NOT be used. (Applied only to atheists.)
316     ## (This requirement was dropped from HTML5 spec, unfortunately.)
317    
## Lookup table of the code points the tokenizer treats as space
## characters; it is consulted as |$is_space->{$self->{nc}}|.  VT and CR
## are not members (their entries are commented out).
318     my $is_space = {
319     0x0009 => 1, # CHARACTER TABULATION (HT)
320     0x000A => 1, # LINE FEED (LF)
321     #0x000B => 0, # LINE TABULATION (VT)
322 wakaba 1.12 0x000C => 1, # FORM FEED (FF) ## XML5: Not a space character.
323 wakaba 1.1 #0x000D => 1, # CARRIAGE RETURN (CR)
324     0x0020 => 1, # SPACE (SP)
325     };
326    
327     sub _get_next_token ($) {
328     my $self = shift;
329    
330     if ($self->{self_closing}) {
331     $self->{parse_error}->(level => $self->{level}->{must}, type => 'nestc', token => $self->{ct});
332     ## NOTE: The |self_closing| flag is only set by start tag token.
333     ## In addition, when a start tag token is emitted, it is always set to
334     ## |ct|.
335     delete $self->{self_closing};
336     }
337    
338     if (@{$self->{token}}) {
339     $self->{self_closing} = $self->{token}->[0]->{self_closing};
340     return shift @{$self->{token}};
341     }
342    
343     A: {
344     if ($self->{state} == PCDATA_STATE) {
345     ## NOTE: Same as |DATA_STATE|, but only for |PCDATA| content model.
346    
347     if ($self->{nc} == 0x0026) { # &
348    
349     ## NOTE: In the spec, the tokenizer is switched to the
350     ## "entity data state". In this implementation, the tokenizer
351     ## is switched to the |ENTITY_STATE|, which is an implementation
352     ## of the "consume a character reference" algorithm.
353     $self->{entity_add} = -1;
354     $self->{prev_state} = DATA_STATE;
355     $self->{state} = ENTITY_STATE;
356    
357     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
358     $self->{line_prev} = $self->{line};
359     $self->{column_prev} = $self->{column};
360     $self->{column}++;
361     $self->{nc}
362     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
363     } else {
364     $self->{set_nc}->($self);
365     }
366    
367     redo A;
368     } elsif ($self->{nc} == 0x003C) { # <
369    
370     $self->{state} = TAG_OPEN_STATE;
371    
372     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
373     $self->{line_prev} = $self->{line};
374     $self->{column_prev} = $self->{column};
375     $self->{column}++;
376     $self->{nc}
377     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
378     } else {
379     $self->{set_nc}->($self);
380     }
381    
382     redo A;
383     } elsif ($self->{nc} == -1) {
384    
385     return ({type => END_OF_FILE_TOKEN,
386     line => $self->{line}, column => $self->{column}});
387     last A; ## TODO: ok?
388     } else {
389    
390     #
391     }
392    
393     # Anything else
394     my $token = {type => CHARACTER_TOKEN,
395     data => chr $self->{nc},
396     line => $self->{line}, column => $self->{column},
397     };
398     $self->{read_until}->($token->{data}, q[<&], length $token->{data});
399    
400     ## Stay in the state.
401    
402     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
403     $self->{line_prev} = $self->{line};
404     $self->{column_prev} = $self->{column};
405     $self->{column}++;
406     $self->{nc}
407     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
408     } else {
409     $self->{set_nc}->($self);
410     }
411    
412     return ($token);
413     redo A;
414     } elsif ($self->{state} == DATA_STATE) {
415     $self->{s_kwd} = '' unless defined $self->{s_kwd};
416     if ($self->{nc} == 0x0026) { # &
417     $self->{s_kwd} = '';
418     if ($self->{content_model} & CM_ENTITY and # PCDATA | RCDATA
419     not $self->{escape}) {
420    
421     ## NOTE: In the spec, the tokenizer is switched to the
422     ## "entity data state". In this implementation, the tokenizer
423     ## is switched to the |ENTITY_STATE|, which is an implementation
424     ## of the "consume a character reference" algorithm.
425     $self->{entity_add} = -1;
426     $self->{prev_state} = DATA_STATE;
427     $self->{state} = ENTITY_STATE;
428    
429     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
430     $self->{line_prev} = $self->{line};
431     $self->{column_prev} = $self->{column};
432     $self->{column}++;
433     $self->{nc}
434     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
435     } else {
436     $self->{set_nc}->($self);
437     }
438    
439     redo A;
440     } else {
441    
442     #
443     }
444     } elsif ($self->{nc} == 0x002D) { # -
445     if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
446 wakaba 1.5 if ($self->{s_kwd} eq '<!-') {
447 wakaba 1.1
448     $self->{escape} = 1; # unless $self->{escape};
449     $self->{s_kwd} = '--';
450     #
451 wakaba 1.5 } elsif ($self->{s_kwd} eq '-') {
452 wakaba 1.1
453     $self->{s_kwd} = '--';
454     #
455 wakaba 1.5 } elsif ($self->{s_kwd} eq '<!' or $self->{s_kwd} eq '-') {
456    
457     $self->{s_kwd} .= '-';
458     #
459 wakaba 1.1 } else {
460    
461 wakaba 1.5 $self->{s_kwd} = '-';
462 wakaba 1.1 #
463     }
464     }
465    
466     #
467     } elsif ($self->{nc} == 0x0021) { # !
468     if (length $self->{s_kwd}) {
469    
470     $self->{s_kwd} .= '!';
471     #
472     } else {
473    
474     #$self->{s_kwd} = '';
475     #
476     }
477     #
478     } elsif ($self->{nc} == 0x003C) { # <
479     if ($self->{content_model} & CM_FULL_MARKUP or # PCDATA
480     (($self->{content_model} & CM_LIMITED_MARKUP) and # CDATA | RCDATA
481     not $self->{escape})) {
482    
483     $self->{state} = TAG_OPEN_STATE;
484    
485     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
486     $self->{line_prev} = $self->{line};
487     $self->{column_prev} = $self->{column};
488     $self->{column}++;
489     $self->{nc}
490     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
491     } else {
492     $self->{set_nc}->($self);
493     }
494    
495     redo A;
496     } else {
497    
498     $self->{s_kwd} = '';
499     #
500     }
501     } elsif ($self->{nc} == 0x003E) { # >
502     if ($self->{escape} and
503     ($self->{content_model} & CM_LIMITED_MARKUP)) { # RCDATA | CDATA
504     if ($self->{s_kwd} eq '--') {
505    
506     delete $self->{escape};
507 wakaba 1.5 #
508 wakaba 1.1 } else {
509    
510 wakaba 1.5 #
511 wakaba 1.1 }
512 wakaba 1.5 } elsif ($self->{is_xml} and $self->{s_kwd} eq ']]') {
513    
514     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unmatched mse', ## TODO: type
515     line => $self->{line_prev},
516     column => $self->{column_prev} - 1);
517     #
518 wakaba 1.1 } else {
519    
520 wakaba 1.5 #
521 wakaba 1.1 }
522    
523     $self->{s_kwd} = '';
524     #
525 wakaba 1.5 } elsif ($self->{nc} == 0x005D) { # ]
526     if ($self->{s_kwd} eq ']' or $self->{s_kwd} eq '') {
527    
528     $self->{s_kwd} .= ']';
529     } elsif ($self->{s_kwd} eq ']]') {
530    
531     #
532     } else {
533    
534     $self->{s_kwd} = '';
535     }
536     #
537 wakaba 1.1 } elsif ($self->{nc} == -1) {
538    
539     $self->{s_kwd} = '';
540     return ({type => END_OF_FILE_TOKEN,
541     line => $self->{line}, column => $self->{column}});
542     last A; ## TODO: ok?
543     } else {
544    
545     $self->{s_kwd} = '';
546     #
547     }
548    
549     # Anything else
550     my $token = {type => CHARACTER_TOKEN,
551     data => chr $self->{nc},
552     line => $self->{line}, column => $self->{column},
553     };
554 wakaba 1.5 if ($self->{read_until}->($token->{data}, q{-!<>&\]},
555 wakaba 1.1 length $token->{data})) {
556     $self->{s_kwd} = '';
557     }
558    
559     ## Stay in the data state.
560 wakaba 1.5 if (not $self->{is_xml} and
561     $self->{content_model} == PCDATA_CONTENT_MODEL) {
562 wakaba 1.1
563     $self->{state} = PCDATA_STATE;
564     } else {
565    
566     ## Stay in the state.
567     }
568    
569     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
570     $self->{line_prev} = $self->{line};
571     $self->{column_prev} = $self->{column};
572     $self->{column}++;
573     $self->{nc}
574     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
575     } else {
576     $self->{set_nc}->($self);
577     }
578    
579     return ($token);
580     redo A;
581     } elsif ($self->{state} == TAG_OPEN_STATE) {
582 wakaba 1.10 ## XML5: "tag state".
583    
584 wakaba 1.1 if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
585     if ($self->{nc} == 0x002F) { # /
586    
587    
588     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
589     $self->{line_prev} = $self->{line};
590     $self->{column_prev} = $self->{column};
591     $self->{column}++;
592     $self->{nc}
593     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
594     } else {
595     $self->{set_nc}->($self);
596     }
597    
598     $self->{state} = CLOSE_TAG_OPEN_STATE;
599     redo A;
600     } elsif ($self->{nc} == 0x0021) { # !
601    
602 wakaba 1.12 $self->{s_kwd} = $self->{escaped} ? '' : '<';
603 wakaba 1.1 #
604     } else {
605    
606 wakaba 1.12 $self->{s_kwd} = '';
607 wakaba 1.1 #
608     }
609    
610     ## reconsume
611     $self->{state} = DATA_STATE;
612     return ({type => CHARACTER_TOKEN, data => '<',
613     line => $self->{line_prev},
614     column => $self->{column_prev},
615     });
616     redo A;
617     } elsif ($self->{content_model} & CM_FULL_MARKUP) { # PCDATA
618     if ($self->{nc} == 0x0021) { # !
619    
620     $self->{state} = MARKUP_DECLARATION_OPEN_STATE;
621    
622     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
623     $self->{line_prev} = $self->{line};
624     $self->{column_prev} = $self->{column};
625     $self->{column}++;
626     $self->{nc}
627     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
628     } else {
629     $self->{set_nc}->($self);
630     }
631    
632     redo A;
633     } elsif ($self->{nc} == 0x002F) { # /
634    
635     $self->{state} = CLOSE_TAG_OPEN_STATE;
636    
637     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
638     $self->{line_prev} = $self->{line};
639     $self->{column_prev} = $self->{column};
640     $self->{column}++;
641     $self->{nc}
642     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
643     } else {
644     $self->{set_nc}->($self);
645     }
646    
647     redo A;
648     } elsif (0x0041 <= $self->{nc} and
649     $self->{nc} <= 0x005A) { # A..Z
650    
651     $self->{ct}
652     = {type => START_TAG_TOKEN,
653 wakaba 1.4 tag_name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
654 wakaba 1.1 line => $self->{line_prev},
655     column => $self->{column_prev}};
656     $self->{state} = TAG_NAME_STATE;
657    
658     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
659     $self->{line_prev} = $self->{line};
660     $self->{column_prev} = $self->{column};
661     $self->{column}++;
662     $self->{nc}
663     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
664     } else {
665     $self->{set_nc}->($self);
666     }
667    
668     redo A;
669     } elsif (0x0061 <= $self->{nc} and
670     $self->{nc} <= 0x007A) { # a..z
671    
672     $self->{ct} = {type => START_TAG_TOKEN,
673     tag_name => chr ($self->{nc}),
674     line => $self->{line_prev},
675     column => $self->{column_prev}};
676     $self->{state} = TAG_NAME_STATE;
677    
678     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
679     $self->{line_prev} = $self->{line};
680     $self->{column_prev} = $self->{column};
681     $self->{column}++;
682     $self->{nc}
683     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
684     } else {
685     $self->{set_nc}->($self);
686     }
687    
688     redo A;
689     } elsif ($self->{nc} == 0x003E) { # >
690    
691     $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty start tag',
692     line => $self->{line_prev},
693     column => $self->{column_prev});
694     $self->{state} = DATA_STATE;
695 wakaba 1.5 $self->{s_kwd} = '';
696 wakaba 1.1
697     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
698     $self->{line_prev} = $self->{line};
699     $self->{column_prev} = $self->{column};
700     $self->{column}++;
701     $self->{nc}
702     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
703     } else {
704     $self->{set_nc}->($self);
705     }
706    
707    
708     return ({type => CHARACTER_TOKEN, data => '<>',
709     line => $self->{line_prev},
710     column => $self->{column_prev},
711     });
712    
713     redo A;
714     } elsif ($self->{nc} == 0x003F) { # ?
715 wakaba 1.8 if ($self->{is_xml}) {
716    
717     $self->{state} = PI_STATE;
718    
719     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
720     $self->{line_prev} = $self->{line};
721     $self->{column_prev} = $self->{column};
722     $self->{column}++;
723     $self->{nc}
724     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
725     } else {
726     $self->{set_nc}->($self);
727     }
728    
729     redo A;
730     } else {
731    
732     $self->{parse_error}->(level => $self->{level}->{must}, type => 'pio',
733     line => $self->{line_prev},
734     column => $self->{column_prev});
735     $self->{state} = BOGUS_COMMENT_STATE;
736     $self->{ct} = {type => COMMENT_TOKEN, data => '',
737     line => $self->{line_prev},
738     column => $self->{column_prev},
739     };
740     ## $self->{nc} is intentionally left as is
741     redo A;
742     }
743 wakaba 1.9 } elsif (not $self->{is_xml} or $is_space->{$self->{nc}}) {
744 wakaba 1.1
745     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare stago',
746     line => $self->{line_prev},
747     column => $self->{column_prev});
748     $self->{state} = DATA_STATE;
749 wakaba 1.5 $self->{s_kwd} = '';
750 wakaba 1.1 ## reconsume
751    
752     return ({type => CHARACTER_TOKEN, data => '<',
753     line => $self->{line_prev},
754     column => $self->{column_prev},
755     });
756    
757     redo A;
758 wakaba 1.9 } else {
759     ## XML5: "<:" is a parse error.
760    
761     $self->{ct} = {type => START_TAG_TOKEN,
762     tag_name => chr ($self->{nc}),
763     line => $self->{line_prev},
764     column => $self->{column_prev}};
765     $self->{state} = TAG_NAME_STATE;
766    
767     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
768     $self->{line_prev} = $self->{line};
769     $self->{column_prev} = $self->{column};
770     $self->{column}++;
771     $self->{nc}
772     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
773     } else {
774     $self->{set_nc}->($self);
775     }
776    
777     redo A;
778 wakaba 1.1 }
779     } else {
780     die "$0: $self->{content_model} in tag open";
781     }
782     } elsif ($self->{state} == CLOSE_TAG_OPEN_STATE) {
783     ## NOTE: The "close tag open state" in the spec is implemented as
784     ## |CLOSE_TAG_OPEN_STATE| and |CDATA_RCDATA_CLOSE_TAG_STATE|.
785    
786 wakaba 1.10 ## XML5: "end tag state".
787    
788 wakaba 1.1 my ($l, $c) = ($self->{line_prev}, $self->{column_prev} - 1); # "<"of"</"
789     if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
790     if (defined $self->{last_stag_name}) {
791     $self->{state} = CDATA_RCDATA_CLOSE_TAG_STATE;
792 wakaba 1.12 $self->{kwd} = '';
793 wakaba 1.1 ## Reconsume.
794     redo A;
795     } else {
796     ## No start tag token has ever been emitted
797     ## NOTE: See <http://krijnhoetmer.nl/irc-logs/whatwg/20070626#l-564>.
798    
799     $self->{state} = DATA_STATE;
800 wakaba 1.5 $self->{s_kwd} = '';
801 wakaba 1.1 ## Reconsume.
802     return ({type => CHARACTER_TOKEN, data => '</',
803     line => $l, column => $c,
804     });
805     redo A;
806     }
807     }
808    
809     if (0x0041 <= $self->{nc} and
810     $self->{nc} <= 0x005A) { # A..Z
811    
812     $self->{ct}
813     = {type => END_TAG_TOKEN,
814 wakaba 1.4 tag_name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
815 wakaba 1.1 line => $l, column => $c};
816     $self->{state} = TAG_NAME_STATE;
817    
818     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
819     $self->{line_prev} = $self->{line};
820     $self->{column_prev} = $self->{column};
821     $self->{column}++;
822     $self->{nc}
823     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
824     } else {
825     $self->{set_nc}->($self);
826     }
827    
828     redo A;
829     } elsif (0x0061 <= $self->{nc} and
830     $self->{nc} <= 0x007A) { # a..z
831    
832     $self->{ct} = {type => END_TAG_TOKEN,
833     tag_name => chr ($self->{nc}),
834     line => $l, column => $c};
835     $self->{state} = TAG_NAME_STATE;
836    
837     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
838     $self->{line_prev} = $self->{line};
839     $self->{column_prev} = $self->{column};
840     $self->{column}++;
841     $self->{nc}
842     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
843     } else {
844     $self->{set_nc}->($self);
845     }
846    
847     redo A;
848     } elsif ($self->{nc} == 0x003E) { # >
849     $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty end tag',
850     line => $self->{line_prev}, ## "<" in "</>"
851     column => $self->{column_prev} - 1);
852     $self->{state} = DATA_STATE;
853 wakaba 1.5 $self->{s_kwd} = '';
854 wakaba 1.10 if ($self->{is_xml}) {
855    
856     ## XML5: No parse error.
857    
858     ## NOTE: This parser raises a parse error, since it supports
859     ## XML1, not XML5.
860    
861     ## NOTE: A short end tag token.
862     my $ct = {type => END_TAG_TOKEN,
863     tag_name => '',
864     line => $self->{line_prev},
865     column => $self->{column_prev} - 1,
866     };
867    
868     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
869     $self->{line_prev} = $self->{line};
870     $self->{column_prev} = $self->{column};
871     $self->{column}++;
872     $self->{nc}
873     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
874     } else {
875     $self->{set_nc}->($self);
876     }
877    
878     return ($ct);
879     } else {
880    
881    
882 wakaba 1.1 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
883     $self->{line_prev} = $self->{line};
884     $self->{column_prev} = $self->{column};
885     $self->{column}++;
886     $self->{nc}
887     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
888     } else {
889     $self->{set_nc}->($self);
890     }
891    
892 wakaba 1.10 }
893 wakaba 1.1 redo A;
894     } elsif ($self->{nc} == -1) {
895    
896     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare etago');
897 wakaba 1.5 $self->{s_kwd} = '';
898 wakaba 1.1 $self->{state} = DATA_STATE;
899     # reconsume
900    
901     return ({type => CHARACTER_TOKEN, data => '</',
902     line => $l, column => $c,
903     });
904    
905     redo A;
906 wakaba 1.10 } elsif (not $self->{is_xml} or
907     $is_space->{$self->{nc}}) {
908 wakaba 1.1
909 wakaba 1.10 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus end tag',
910     line => $self->{line_prev}, # "<" of "</"
911     column => $self->{column_prev} - 1);
912 wakaba 1.1 $self->{state} = BOGUS_COMMENT_STATE;
913     $self->{ct} = {type => COMMENT_TOKEN, data => '',
914     line => $self->{line_prev}, # "<" of "</"
915     column => $self->{column_prev} - 1,
916     };
917     ## NOTE: $self->{nc} is intentionally left as is.
918     ## Although the "anything else" case of the spec does not
919     ## explicitly state that the next input character is to be
920     ## reconsumed, it will be included in the |data| of the comment
921     ## token generated from the bogus end tag, as defined in the
922     ## "bogus comment state" entry.
923     redo A;
924 wakaba 1.10 } else {
925     ## XML5: "</:" is a parse error.
926    
927     $self->{ct} = {type => END_TAG_TOKEN,
928     tag_name => chr ($self->{nc}),
929     line => $l, column => $c};
930     $self->{state} = TAG_NAME_STATE; ## XML5: "end tag name state".
931    
932     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
933     $self->{line_prev} = $self->{line};
934     $self->{column_prev} = $self->{column};
935     $self->{column}++;
936     $self->{nc}
937     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
938     } else {
939     $self->{set_nc}->($self);
940     }
941    
942     redo A;
943 wakaba 1.1 }
944     } elsif ($self->{state} == CDATA_RCDATA_CLOSE_TAG_STATE) {
945 wakaba 1.12 my $ch = substr $self->{last_stag_name}, length $self->{kwd}, 1;
946 wakaba 1.1 if (length $ch) {
947     my $CH = $ch;
948     $ch =~ tr/a-z/A-Z/;
949     my $nch = chr $self->{nc};
950     if ($nch eq $ch or $nch eq $CH) {
951    
952     ## Stay in the state.
953 wakaba 1.12 $self->{kwd} .= $nch;
954 wakaba 1.1
955     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
956     $self->{line_prev} = $self->{line};
957     $self->{column_prev} = $self->{column};
958     $self->{column}++;
959     $self->{nc}
960     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
961     } else {
962     $self->{set_nc}->($self);
963     }
964    
965     redo A;
966     } else {
967    
968     $self->{state} = DATA_STATE;
969 wakaba 1.5 $self->{s_kwd} = '';
970 wakaba 1.1 ## Reconsume.
971     return ({type => CHARACTER_TOKEN,
972 wakaba 1.12 data => '</' . $self->{kwd},
973 wakaba 1.1 line => $self->{line_prev},
974 wakaba 1.12 column => $self->{column_prev} - 1 - length $self->{kwd},
975 wakaba 1.1 });
976     redo A;
977     }
978     } else { # after "<{tag-name}"
979     unless ($is_space->{$self->{nc}} or
980     {
981     0x003E => 1, # >
982     0x002F => 1, # /
983     -1 => 1, # EOF
984     }->{$self->{nc}}) {
985    
986     ## Reconsume.
987     $self->{state} = DATA_STATE;
988 wakaba 1.5 $self->{s_kwd} = '';
989 wakaba 1.1 return ({type => CHARACTER_TOKEN,
990 wakaba 1.12 data => '</' . $self->{kwd},
991 wakaba 1.1 line => $self->{line_prev},
992 wakaba 1.12 column => $self->{column_prev} - 1 - length $self->{kwd},
993 wakaba 1.1 });
994     redo A;
995     } else {
996    
997     $self->{ct}
998     = {type => END_TAG_TOKEN,
999     tag_name => $self->{last_stag_name},
1000     line => $self->{line_prev},
1001 wakaba 1.12 column => $self->{column_prev} - 1 - length $self->{kwd}};
1002 wakaba 1.1 $self->{state} = TAG_NAME_STATE;
1003     ## Reconsume.
1004     redo A;
1005     }
1006     }
1007     } elsif ($self->{state} == TAG_NAME_STATE) {
1008     if ($is_space->{$self->{nc}}) {
1009    
1010     $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1011    
1012     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1013     $self->{line_prev} = $self->{line};
1014     $self->{column_prev} = $self->{column};
1015     $self->{column}++;
1016     $self->{nc}
1017     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1018     } else {
1019     $self->{set_nc}->($self);
1020     }
1021    
1022     redo A;
1023     } elsif ($self->{nc} == 0x003E) { # >
1024     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1025    
1026     $self->{last_stag_name} = $self->{ct}->{tag_name};
1027     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1028     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1029     #if ($self->{ct}->{attributes}) {
1030     # ## NOTE: This should never be reached.
1031     # !!! cp (36);
1032     # !!! parse-error (type => 'end tag attribute');
1033     #} else {
1034    
1035     #}
1036     } else {
1037     die "$0: $self->{ct}->{type}: Unknown token type";
1038     }
1039     $self->{state} = DATA_STATE;
1040 wakaba 1.5 $self->{s_kwd} = '';
1041 wakaba 1.1
1042     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1043     $self->{line_prev} = $self->{line};
1044     $self->{column_prev} = $self->{column};
1045     $self->{column}++;
1046     $self->{nc}
1047     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1048     } else {
1049     $self->{set_nc}->($self);
1050     }
1051    
1052    
1053     return ($self->{ct}); # start tag or end tag
1054    
1055     redo A;
1056     } elsif (0x0041 <= $self->{nc} and
1057     $self->{nc} <= 0x005A) { # A..Z
1058    
1059 wakaba 1.4 $self->{ct}->{tag_name}
1060     .= chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020));
1061 wakaba 1.1 # start tag or end tag
1062     ## Stay in this state
1063    
1064     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1065     $self->{line_prev} = $self->{line};
1066     $self->{column_prev} = $self->{column};
1067     $self->{column}++;
1068     $self->{nc}
1069     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1070     } else {
1071     $self->{set_nc}->($self);
1072     }
1073    
1074     redo A;
1075     } elsif ($self->{nc} == -1) {
1076     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1077     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1078    
1079     $self->{last_stag_name} = $self->{ct}->{tag_name};
1080     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1081     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1082     #if ($self->{ct}->{attributes}) {
1083     # ## NOTE: This state should never be reached.
1084     # !!! cp (40);
1085     # !!! parse-error (type => 'end tag attribute');
1086     #} else {
1087    
1088     #}
1089     } else {
1090     die "$0: $self->{ct}->{type}: Unknown token type";
1091     }
1092     $self->{state} = DATA_STATE;
1093 wakaba 1.5 $self->{s_kwd} = '';
1094 wakaba 1.1 # reconsume
1095    
1096     return ($self->{ct}); # start tag or end tag
1097    
1098     redo A;
1099     } elsif ($self->{nc} == 0x002F) { # /
1100    
1101     $self->{state} = SELF_CLOSING_START_TAG_STATE;
1102    
1103     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1104     $self->{line_prev} = $self->{line};
1105     $self->{column_prev} = $self->{column};
1106     $self->{column}++;
1107     $self->{nc}
1108     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1109     } else {
1110     $self->{set_nc}->($self);
1111     }
1112    
1113     redo A;
1114     } else {
1115    
1116     $self->{ct}->{tag_name} .= chr $self->{nc};
1117     # start tag or end tag
1118     ## Stay in the state
1119    
1120     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1121     $self->{line_prev} = $self->{line};
1122     $self->{column_prev} = $self->{column};
1123     $self->{column}++;
1124     $self->{nc}
1125     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1126     } else {
1127     $self->{set_nc}->($self);
1128     }
1129    
1130     redo A;
1131     }
1132     } elsif ($self->{state} == BEFORE_ATTRIBUTE_NAME_STATE) {
1133 wakaba 1.11 ## XML5: "Tag attribute name before state".
1134    
1135 wakaba 1.1 if ($is_space->{$self->{nc}}) {
1136    
1137     ## Stay in the state
1138    
1139     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1140     $self->{line_prev} = $self->{line};
1141     $self->{column_prev} = $self->{column};
1142     $self->{column}++;
1143     $self->{nc}
1144     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1145     } else {
1146     $self->{set_nc}->($self);
1147     }
1148    
1149     redo A;
1150     } elsif ($self->{nc} == 0x003E) { # >
1151     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1152    
1153     $self->{last_stag_name} = $self->{ct}->{tag_name};
1154     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1155     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1156     if ($self->{ct}->{attributes}) {
1157    
1158     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1159     } else {
1160    
1161     }
1162     } else {
1163     die "$0: $self->{ct}->{type}: Unknown token type";
1164     }
1165     $self->{state} = DATA_STATE;
1166 wakaba 1.5 $self->{s_kwd} = '';
1167 wakaba 1.1
1168     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1169     $self->{line_prev} = $self->{line};
1170     $self->{column_prev} = $self->{column};
1171     $self->{column}++;
1172     $self->{nc}
1173     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1174     } else {
1175     $self->{set_nc}->($self);
1176     }
1177    
1178    
1179     return ($self->{ct}); # start tag or end tag
1180    
1181     redo A;
1182     } elsif (0x0041 <= $self->{nc} and
1183     $self->{nc} <= 0x005A) { # A..Z
1184    
1185     $self->{ca}
1186 wakaba 1.4 = {name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
1187 wakaba 1.1 value => '',
1188     line => $self->{line}, column => $self->{column}};
1189     $self->{state} = ATTRIBUTE_NAME_STATE;
1190    
1191     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1192     $self->{line_prev} = $self->{line};
1193     $self->{column_prev} = $self->{column};
1194     $self->{column}++;
1195     $self->{nc}
1196     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1197     } else {
1198     $self->{set_nc}->($self);
1199     }
1200    
1201     redo A;
1202     } elsif ($self->{nc} == 0x002F) { # /
1203    
1204     $self->{state} = SELF_CLOSING_START_TAG_STATE;
1205    
1206     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1207     $self->{line_prev} = $self->{line};
1208     $self->{column_prev} = $self->{column};
1209     $self->{column}++;
1210     $self->{nc}
1211     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1212     } else {
1213     $self->{set_nc}->($self);
1214     }
1215    
1216     redo A;
1217     } elsif ($self->{nc} == -1) {
1218     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1219     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1220    
1221     $self->{last_stag_name} = $self->{ct}->{tag_name};
1222     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1223     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1224     if ($self->{ct}->{attributes}) {
1225    
1226     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1227     } else {
1228    
1229     }
1230     } else {
1231     die "$0: $self->{ct}->{type}: Unknown token type";
1232     }
1233     $self->{state} = DATA_STATE;
1234 wakaba 1.5 $self->{s_kwd} = '';
1235 wakaba 1.1 # reconsume
1236    
1237     return ($self->{ct}); # start tag or end tag
1238    
1239     redo A;
1240     } else {
1241     if ({
1242     0x0022 => 1, # "
1243     0x0027 => 1, # '
1244     0x003D => 1, # =
1245     }->{$self->{nc}}) {
1246    
1247 wakaba 1.11 ## XML5: Not a parse error.
1248 wakaba 1.1 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name');
1249     } else {
1250    
1251 wakaba 1.11 ## XML5: ":" raises a parse error and is ignored.
1252 wakaba 1.1 }
1253     $self->{ca}
1254     = {name => chr ($self->{nc}),
1255     value => '',
1256     line => $self->{line}, column => $self->{column}};
1257     $self->{state} = ATTRIBUTE_NAME_STATE;
1258    
1259     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1260     $self->{line_prev} = $self->{line};
1261     $self->{column_prev} = $self->{column};
1262     $self->{column}++;
1263     $self->{nc}
1264     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1265     } else {
1266     $self->{set_nc}->($self);
1267     }
1268    
1269     redo A;
1270     }
1271     } elsif ($self->{state} == ATTRIBUTE_NAME_STATE) {
1272 wakaba 1.11 ## XML5: "Tag attribute name state".
1273    
## $before_leave: commit the attribute currently being named
## ($self->{ca}) to the current tag token ($self->{ct}).  It takes no
## arguments and is invoked by the branches of this state that stop
## reading the attribute name.
##   - If the token already has an attribute with the same name, a
##     'duplicate attribute' parse error is reported at the line/column
##     recorded in $self->{ca}, and $self->{ca} is not added to the token.
##   - Otherwise $self->{ca} is stored in $self->{ct}->{attributes} under
##     its name and receives the next |index| value, taken from the
##     token's |last_index| counter.
## NOTE: The |exists| test on the nested key autovivifies
## $self->{ct}->{attributes} as an empty hash when the token has no
## attributes yet.
1274 wakaba 1.1 my $before_leave = sub {
1275     if (exists $self->{ct}->{attributes} # start tag or end tag
1276     ->{$self->{ca}->{name}}) { # MUST
1277    
    ## Same name seen before on this token: report it; the first
    ## occurrence stays in place.
1278     $self->{parse_error}->(level => $self->{level}->{must}, type => 'duplicate attribute', text => $self->{ca}->{name}, line => $self->{ca}->{line}, column => $self->{ca}->{column});
1279     ## Discard $self->{ca} # MUST
1280     } else {
1281    
    ## First occurrence of this name: register the attribute on the token.
1282     $self->{ct}->{attributes}->{$self->{ca}->{name}}
1283     = $self->{ca};
    ## |index| is the pre-incremented |last_index| counter of the token.
1284 wakaba 1.11 $self->{ca}->{index} = ++$self->{ct}->{last_index};
1285 wakaba 1.1 }
1286     }; # $before_leave
1287    
1288     if ($is_space->{$self->{nc}}) {
1289    
1290     $before_leave->();
1291     $self->{state} = AFTER_ATTRIBUTE_NAME_STATE;
1292    
1293     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1294     $self->{line_prev} = $self->{line};
1295     $self->{column_prev} = $self->{column};
1296     $self->{column}++;
1297     $self->{nc}
1298     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1299     } else {
1300     $self->{set_nc}->($self);
1301     }
1302    
1303     redo A;
1304     } elsif ($self->{nc} == 0x003D) { # =
1305    
1306     $before_leave->();
1307     $self->{state} = BEFORE_ATTRIBUTE_VALUE_STATE;
1308    
1309     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1310     $self->{line_prev} = $self->{line};
1311     $self->{column_prev} = $self->{column};
1312     $self->{column}++;
1313     $self->{nc}
1314     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1315     } else {
1316     $self->{set_nc}->($self);
1317     }
1318    
1319     redo A;
1320     } elsif ($self->{nc} == 0x003E) { # >
1321 wakaba 1.11 if ($self->{is_xml}) {
1322    
1323     ## XML5: Not a parse error.
1324     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr value'); ## TODO: type
1325     } else {
1326    
1327     }
1328    
1329 wakaba 1.1 $before_leave->();
1330     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1331    
1332     $self->{last_stag_name} = $self->{ct}->{tag_name};
1333     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1334    
1335     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1336     if ($self->{ct}->{attributes}) {
1337     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1338     }
1339     } else {
1340     die "$0: $self->{ct}->{type}: Unknown token type";
1341     }
1342     $self->{state} = DATA_STATE;
1343 wakaba 1.5 $self->{s_kwd} = '';
1344 wakaba 1.1
1345     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1346     $self->{line_prev} = $self->{line};
1347     $self->{column_prev} = $self->{column};
1348     $self->{column}++;
1349     $self->{nc}
1350     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1351     } else {
1352     $self->{set_nc}->($self);
1353     }
1354    
1355    
1356     return ($self->{ct}); # start tag or end tag
1357    
1358     redo A;
1359     } elsif (0x0041 <= $self->{nc} and
1360     $self->{nc} <= 0x005A) { # A..Z
1361    
1362 wakaba 1.4 $self->{ca}->{name}
1363     .= chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020));
1364 wakaba 1.1 ## Stay in the state
1365    
1366     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1367     $self->{line_prev} = $self->{line};
1368     $self->{column_prev} = $self->{column};
1369     $self->{column}++;
1370     $self->{nc}
1371     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1372     } else {
1373     $self->{set_nc}->($self);
1374     }
1375    
1376     redo A;
1377     } elsif ($self->{nc} == 0x002F) { # /
1378 wakaba 1.11 if ($self->{is_xml}) {
1379    
1380     ## XML5: Not a parse error.
1381     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr value'); ## TODO: type
1382     } else {
1383    
1384     }
1385 wakaba 1.1
1386     $before_leave->();
1387     $self->{state} = SELF_CLOSING_START_TAG_STATE;
1388    
1389     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1390     $self->{line_prev} = $self->{line};
1391     $self->{column_prev} = $self->{column};
1392     $self->{column}++;
1393     $self->{nc}
1394     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1395     } else {
1396     $self->{set_nc}->($self);
1397     }
1398    
1399     redo A;
1400     } elsif ($self->{nc} == -1) {
1401     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1402     $before_leave->();
1403     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1404    
1405     $self->{last_stag_name} = $self->{ct}->{tag_name};
1406     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1407     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1408     if ($self->{ct}->{attributes}) {
1409    
1410     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1411     } else {
1412     ## NOTE: This state should never be reached.
1413    
1414     }
1415     } else {
1416     die "$0: $self->{ct}->{type}: Unknown token type";
1417     }
1418     $self->{state} = DATA_STATE;
1419 wakaba 1.5 $self->{s_kwd} = '';
1420 wakaba 1.1 # reconsume
1421    
1422     return ($self->{ct}); # start tag or end tag
1423    
1424     redo A;
1425     } else {
1426     if ($self->{nc} == 0x0022 or # "
1427     $self->{nc} == 0x0027) { # '
1428    
1429 wakaba 1.11 ## XML5: Not a parse error.
1430 wakaba 1.1 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name');
1431     } else {
1432    
1433     }
1434     $self->{ca}->{name} .= chr ($self->{nc});
1435     ## Stay in the state
1436    
1437     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1438     $self->{line_prev} = $self->{line};
1439     $self->{column_prev} = $self->{column};
1440     $self->{column}++;
1441     $self->{nc}
1442     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1443     } else {
1444     $self->{set_nc}->($self);
1445     }
1446    
1447     redo A;
1448     }
1449     } elsif ($self->{state} == AFTER_ATTRIBUTE_NAME_STATE) {
1450 wakaba 1.11 ## XML5: "Tag attribute name after state".
1451    
1452 wakaba 1.1 if ($is_space->{$self->{nc}}) {
1453    
1454     ## Stay in the state
1455    
1456     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1457     $self->{line_prev} = $self->{line};
1458     $self->{column_prev} = $self->{column};
1459     $self->{column}++;
1460     $self->{nc}
1461     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1462     } else {
1463     $self->{set_nc}->($self);
1464     }
1465    
1466     redo A;
1467     } elsif ($self->{nc} == 0x003D) { # =
1468    
1469     $self->{state} = BEFORE_ATTRIBUTE_VALUE_STATE;
1470    
1471     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1472     $self->{line_prev} = $self->{line};
1473     $self->{column_prev} = $self->{column};
1474     $self->{column}++;
1475     $self->{nc}
1476     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1477     } else {
1478     $self->{set_nc}->($self);
1479     }
1480    
1481     redo A;
1482     } elsif ($self->{nc} == 0x003E) { # >
1483 wakaba 1.11 if ($self->{is_xml}) {
1484    
1485     ## XML5: Not a parse error.
1486     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr value'); ## TODO: type
1487     } else {
1488    
1489     }
1490    
1491 wakaba 1.1 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1492    
1493     $self->{last_stag_name} = $self->{ct}->{tag_name};
1494     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1495     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1496     if ($self->{ct}->{attributes}) {
1497    
1498     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1499     } else {
1500     ## NOTE: This state should never be reached.
1501    
1502     }
1503     } else {
1504     die "$0: $self->{ct}->{type}: Unknown token type";
1505     }
1506     $self->{state} = DATA_STATE;
1507 wakaba 1.5 $self->{s_kwd} = '';
1508 wakaba 1.1
1509     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1510     $self->{line_prev} = $self->{line};
1511     $self->{column_prev} = $self->{column};
1512     $self->{column}++;
1513     $self->{nc}
1514     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1515     } else {
1516     $self->{set_nc}->($self);
1517     }
1518    
1519    
1520     return ($self->{ct}); # start tag or end tag
1521    
1522     redo A;
1523     } elsif (0x0041 <= $self->{nc} and
1524     $self->{nc} <= 0x005A) { # A..Z
1525    
1526     $self->{ca}
1527 wakaba 1.4 = {name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
1528 wakaba 1.1 value => '',
1529     line => $self->{line}, column => $self->{column}};
1530     $self->{state} = ATTRIBUTE_NAME_STATE;
1531    
1532     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1533     $self->{line_prev} = $self->{line};
1534     $self->{column_prev} = $self->{column};
1535     $self->{column}++;
1536     $self->{nc}
1537     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1538     } else {
1539     $self->{set_nc}->($self);
1540     }
1541    
1542     redo A;
1543     } elsif ($self->{nc} == 0x002F) { # /
1544 wakaba 1.11 if ($self->{is_xml}) {
1545    
1546     ## XML5: Not a parse error.
1547     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr value'); ## TODO: type
1548     } else {
1549    
1550     }
1551 wakaba 1.1
1552     $self->{state} = SELF_CLOSING_START_TAG_STATE;
1553    
1554     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1555     $self->{line_prev} = $self->{line};
1556     $self->{column_prev} = $self->{column};
1557     $self->{column}++;
1558     $self->{nc}
1559     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1560     } else {
1561     $self->{set_nc}->($self);
1562     }
1563    
1564     redo A;
1565     } elsif ($self->{nc} == -1) {
1566     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1567     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1568    
1569     $self->{last_stag_name} = $self->{ct}->{tag_name};
1570     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1571     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1572     if ($self->{ct}->{attributes}) {
1573    
1574     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1575     } else {
1576     ## NOTE: This state should never be reached.
1577    
1578     }
1579     } else {
1580     die "$0: $self->{ct}->{type}: Unknown token type";
1581     }
1582 wakaba 1.5 $self->{s_kwd} = '';
1583 wakaba 1.1 $self->{state} = DATA_STATE;
1584     # reconsume
1585    
1586     return ($self->{ct}); # start tag or end tag
1587    
1588     redo A;
1589     } else {
1590 wakaba 1.11 if ($self->{is_xml}) {
1591    
1592     ## XML5: Not a parse error.
1593     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr value'); ## TODO: type
1594     } else {
1595    
1596     }
1597    
1598 wakaba 1.1 if ($self->{nc} == 0x0022 or # "
1599     $self->{nc} == 0x0027) { # '
1600    
1601 wakaba 1.11 ## XML5: Not a parse error.
1602 wakaba 1.1 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name');
1603     } else {
1604    
1605     }
1606     $self->{ca}
1607     = {name => chr ($self->{nc}),
1608     value => '',
1609     line => $self->{line}, column => $self->{column}};
1610     $self->{state} = ATTRIBUTE_NAME_STATE;
1611    
1612     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1613     $self->{line_prev} = $self->{line};
1614     $self->{column_prev} = $self->{column};
1615     $self->{column}++;
1616     $self->{nc}
1617     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1618     } else {
1619     $self->{set_nc}->($self);
1620     }
1621    
1622     redo A;
1623     }
1624     } elsif ($self->{state} == BEFORE_ATTRIBUTE_VALUE_STATE) {
1625 wakaba 1.11 ## XML5: "Tag attribute value before state".
1626    
1627 wakaba 1.1 if ($is_space->{$self->{nc}}) {
1628    
1629     ## Stay in the state
1630    
1631     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1632     $self->{line_prev} = $self->{line};
1633     $self->{column_prev} = $self->{column};
1634     $self->{column}++;
1635     $self->{nc}
1636     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1637     } else {
1638     $self->{set_nc}->($self);
1639     }
1640    
1641     redo A;
1642     } elsif ($self->{nc} == 0x0022) { # "
1643    
1644     $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
1645    
1646     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1647     $self->{line_prev} = $self->{line};
1648     $self->{column_prev} = $self->{column};
1649     $self->{column}++;
1650     $self->{nc}
1651     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1652     } else {
1653     $self->{set_nc}->($self);
1654     }
1655    
1656     redo A;
1657     } elsif ($self->{nc} == 0x0026) { # &
1658    
1659     $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
1660     ## reconsume
1661     redo A;
1662     } elsif ($self->{nc} == 0x0027) { # '
1663    
1664     $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
1665    
1666     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1667     $self->{line_prev} = $self->{line};
1668     $self->{column_prev} = $self->{column};
1669     $self->{column}++;
1670     $self->{nc}
1671     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1672     } else {
1673     $self->{set_nc}->($self);
1674     }
1675    
1676     redo A;
1677     } elsif ($self->{nc} == 0x003E) { # >
1678     $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty unquoted attribute value');
1679     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1680    
1681     $self->{last_stag_name} = $self->{ct}->{tag_name};
1682     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1683     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1684     if ($self->{ct}->{attributes}) {
1685    
1686     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1687     } else {
1688     ## NOTE: This state should never be reached.
1689    
1690     }
1691     } else {
1692     die "$0: $self->{ct}->{type}: Unknown token type";
1693     }
1694     $self->{state} = DATA_STATE;
1695 wakaba 1.5 $self->{s_kwd} = '';
1696 wakaba 1.1
1697     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1698     $self->{line_prev} = $self->{line};
1699     $self->{column_prev} = $self->{column};
1700     $self->{column}++;
1701     $self->{nc}
1702     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1703     } else {
1704     $self->{set_nc}->($self);
1705     }
1706    
1707    
1708     return ($self->{ct}); # start tag or end tag
1709    
1710     redo A;
1711     } elsif ($self->{nc} == -1) {
1712     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1713     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1714    
1715     $self->{last_stag_name} = $self->{ct}->{tag_name};
1716     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1717     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1718     if ($self->{ct}->{attributes}) {
1719    
1720     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1721     } else {
1722     ## NOTE: This state should never be reached.
1723    
1724     }
1725     } else {
1726     die "$0: $self->{ct}->{type}: Unknown token type";
1727     }
1728     $self->{state} = DATA_STATE;
1729 wakaba 1.5 $self->{s_kwd} = '';
1730 wakaba 1.1 ## reconsume
1731    
1732     return ($self->{ct}); # start tag or end tag
1733    
1734     redo A;
1735     } else {
1736     if ($self->{nc} == 0x003D) { # =
1737    
1738 wakaba 1.11 ## XML5: Not a parse error.
1739 wakaba 1.1 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute value');
1740 wakaba 1.11 } elsif ($self->{is_xml}) {
1741    
1742     ## XML5: No parse error.
1743     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unquoted attr value'); ## TODO
1744 wakaba 1.1 } else {
1745    
1746     }
1747     $self->{ca}->{value} .= chr ($self->{nc});
1748     $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
1749    
1750     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1751     $self->{line_prev} = $self->{line};
1752     $self->{column_prev} = $self->{column};
1753     $self->{column}++;
1754     $self->{nc}
1755     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1756     } else {
1757     $self->{set_nc}->($self);
1758     }
1759    
1760     redo A;
1761     }
1762     } elsif ($self->{state} == ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE) {
1763 wakaba 1.15 ## XML5: "Tag attribute value double quoted state" and "DOCTYPE
1764     ## ATTLIST attribute value double quoted state".
1765 wakaba 1.11
1766 wakaba 1.1 if ($self->{nc} == 0x0022) { # "
1767 wakaba 1.15 if ($self->{ct}->{type} == ATTLIST_TOKEN) {
1768    
1769     ## XML5: "DOCTYPE ATTLIST name after state".
1770     push @{$self->{ct}->{attrdefs}}, $self->{ca};
1771     $self->{state} = AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE;
1772     } else {
1773    
1774     ## XML5: "Tag attribute name before state".
1775     $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1776     }
1777 wakaba 1.1
1778     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1779     $self->{line_prev} = $self->{line};
1780     $self->{column_prev} = $self->{column};
1781     $self->{column}++;
1782     $self->{nc}
1783     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1784     } else {
1785     $self->{set_nc}->($self);
1786     }
1787    
1788     redo A;
1789     } elsif ($self->{nc} == 0x0026) { # &
1790    
1791 wakaba 1.11 ## XML5: Not defined yet.
1792    
1793 wakaba 1.1 ## NOTE: In the spec, the tokenizer is switched to the
1794     ## "entity in attribute value state". In this implementation, the
1795     ## tokenizer is switched to the |ENTITY_STATE|, which is an
1796     ## implementation of the "consume a character reference" algorithm.
1797     $self->{prev_state} = $self->{state};
1798     $self->{entity_add} = 0x0022; # "
1799     $self->{state} = ENTITY_STATE;
1800    
1801     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1802     $self->{line_prev} = $self->{line};
1803     $self->{column_prev} = $self->{column};
1804     $self->{column}++;
1805     $self->{nc}
1806     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1807     } else {
1808     $self->{set_nc}->($self);
1809     }
1810    
1811     redo A;
1812     } elsif ($self->{nc} == -1) {
1813     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed attribute value');
1814     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1815    
1816     $self->{last_stag_name} = $self->{ct}->{tag_name};
1817 wakaba 1.15
1818     $self->{state} = DATA_STATE;
1819     $self->{s_kwd} = '';
1820     ## reconsume
1821     return ($self->{ct}); # start tag
1822     redo A;
1823 wakaba 1.1 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1824     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1825     if ($self->{ct}->{attributes}) {
1826    
1827     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1828     } else {
1829     ## NOTE: This state should never be reached.
1830    
1831     }
1832 wakaba 1.15
1833     $self->{state} = DATA_STATE;
1834     $self->{s_kwd} = '';
1835     ## reconsume
1836     return ($self->{ct}); # end tag
1837     redo A;
1838     } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
1839     ## XML5: No parse error above; not defined yet.
1840     push @{$self->{ct}->{attrdefs}}, $self->{ca};
1841     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1842     ## Reconsume.
1843     return ($self->{ct}); # ATTLIST
1844     redo A;
1845 wakaba 1.1 } else {
1846     die "$0: $self->{ct}->{type}: Unknown token type";
1847     }
1848     } else {
1849 wakaba 1.15 ## XML5 [ATTLIST]: Not defined yet.
1850 wakaba 1.11 if ($self->{is_xml} and $self->{nc} == 0x003C) { # <
1851    
1852     ## XML5: Not a parse error.
1853     $self->{parse_error}->(level => $self->{level}->{must}, type => 'lt in attr value'); ## TODO: type
1854     } else {
1855    
1856     }
1857 wakaba 1.1 $self->{ca}->{value} .= chr ($self->{nc});
1858     $self->{read_until}->($self->{ca}->{value},
1859 wakaba 1.11 q["&<],
1860 wakaba 1.1 length $self->{ca}->{value});
1861    
1862     ## Stay in the state
1863    
1864     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1865     $self->{line_prev} = $self->{line};
1866     $self->{column_prev} = $self->{column};
1867     $self->{column}++;
1868     $self->{nc}
1869     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1870     } else {
1871     $self->{set_nc}->($self);
1872     }
1873    
1874     redo A;
1875     }
1876     } elsif ($self->{state} == ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE) {
1877 wakaba 1.15 ## XML5: "Tag attribute value single quoted state" and "DOCTYPE
1878     ## ATTLIST attribute value single quoted state".
1879 wakaba 1.11
1880 wakaba 1.1 if ($self->{nc} == 0x0027) { # '
1881 wakaba 1.15 if ($self->{ct}->{type} == ATTLIST_TOKEN) {
1882    
1883     ## XML5: "DOCTYPE ATTLIST name after state".
1884     push @{$self->{ct}->{attrdefs}}, $self->{ca};
1885     $self->{state} = AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE;
1886     } else {
1887    
1888     ## XML5: "Before attribute name state" (sic).
1889     $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1890     }
1891 wakaba 1.1
1892     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1893     $self->{line_prev} = $self->{line};
1894     $self->{column_prev} = $self->{column};
1895     $self->{column}++;
1896     $self->{nc}
1897     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1898     } else {
1899     $self->{set_nc}->($self);
1900     }
1901    
1902     redo A;
1903     } elsif ($self->{nc} == 0x0026) { # &
1904    
1905 wakaba 1.11 ## XML5: Not defined yet.
1906    
1907 wakaba 1.1 ## NOTE: In the spec, the tokenizer is switched to the
1908     ## "entity in attribute value state". In this implementation, the
1909     ## tokenizer is switched to the |ENTITY_STATE|, which is an
1910     ## implementation of the "consume a character reference" algorithm.
1911     $self->{entity_add} = 0x0027; # '
1912     $self->{prev_state} = $self->{state};
1913     $self->{state} = ENTITY_STATE;
1914    
1915     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1916     $self->{line_prev} = $self->{line};
1917     $self->{column_prev} = $self->{column};
1918     $self->{column}++;
1919     $self->{nc}
1920     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1921     } else {
1922     $self->{set_nc}->($self);
1923     }
1924    
1925     redo A;
1926     } elsif ($self->{nc} == -1) {
1927     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed attribute value');
1928     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1929    
1930     $self->{last_stag_name} = $self->{ct}->{tag_name};
1931 wakaba 1.15
1932     $self->{state} = DATA_STATE;
1933     $self->{s_kwd} = '';
1934     ## reconsume
1935     return ($self->{ct}); # start tag
1936     redo A;
1937 wakaba 1.1 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1938     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1939     if ($self->{ct}->{attributes}) {
1940    
1941     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1942     } else {
1943     ## NOTE: This state should never be reached.
1944    
1945     }
1946 wakaba 1.15
1947     $self->{state} = DATA_STATE;
1948     $self->{s_kwd} = '';
1949     ## reconsume
1950     return ($self->{ct}); # end tag
1951     redo A;
1952     } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
1953     ## XML5: No parse error above; not defined yet.
1954     push @{$self->{ct}->{attrdefs}}, $self->{ca};
1955     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1956     ## Reconsume.
1957     return ($self->{ct}); # ATTLIST
1958     redo A;
1959 wakaba 1.1 } else {
1960     die "$0: $self->{ct}->{type}: Unknown token type";
1961     }
1962     } else {
1963 wakaba 1.15 ## XML5 [ATTLIST]: Not defined yet.
1964 wakaba 1.11 if ($self->{is_xml} and $self->{nc} == 0x003C) { # <
1965    
1966     ## XML5: Not a parse error.
1967     $self->{parse_error}->(level => $self->{level}->{must}, type => 'lt in attr value'); ## TODO: type
1968     } else {
1969    
1970     }
1971 wakaba 1.1 $self->{ca}->{value} .= chr ($self->{nc});
1972     $self->{read_until}->($self->{ca}->{value},
1973 wakaba 1.11 q['&<],
1974 wakaba 1.1 length $self->{ca}->{value});
1975    
1976     ## Stay in the state
1977    
1978     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1979     $self->{line_prev} = $self->{line};
1980     $self->{column_prev} = $self->{column};
1981     $self->{column}++;
1982     $self->{nc}
1983     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1984     } else {
1985     $self->{set_nc}->($self);
1986     }
1987    
1988     redo A;
1989     }
1990     } elsif ($self->{state} == ATTRIBUTE_VALUE_UNQUOTED_STATE) {
1991 wakaba 1.11 ## XML5: "Tag attribute value unquoted state".
1992    
1993 wakaba 1.1 if ($is_space->{$self->{nc}}) {
1994 wakaba 1.15 if ($self->{ct}->{type} == ATTLIST_TOKEN) {
1995    
1996     push @{$self->{ct}->{attrdefs}}, $self->{ca};
1997     $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
1998     } else {
1999    
2000     ## XML5: "Tag attribute name before state".
2001     $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
2002     }
2003 wakaba 1.1
2004     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2005     $self->{line_prev} = $self->{line};
2006     $self->{column_prev} = $self->{column};
2007     $self->{column}++;
2008     $self->{nc}
2009     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2010     } else {
2011     $self->{set_nc}->($self);
2012     }
2013    
2014     redo A;
2015     } elsif ($self->{nc} == 0x0026) { # &
2016    
2017 wakaba 1.11
2018     ## XML5: Not defined yet.
2019    
2020 wakaba 1.1 ## NOTE: In the spec, the tokenizer is switched to the
2021     ## "entity in attribute value state". In this implementation, the
2022     ## tokenizer is switched to the |ENTITY_STATE|, which is an
2023     ## implementation of the "consume a character reference" algorithm.
2024     $self->{entity_add} = -1;
2025     $self->{prev_state} = $self->{state};
2026     $self->{state} = ENTITY_STATE;
2027    
2028     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2029     $self->{line_prev} = $self->{line};
2030     $self->{column_prev} = $self->{column};
2031     $self->{column}++;
2032     $self->{nc}
2033     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2034     } else {
2035     $self->{set_nc}->($self);
2036     }
2037    
2038     redo A;
2039     } elsif ($self->{nc} == 0x003E) { # >
2040     if ($self->{ct}->{type} == START_TAG_TOKEN) {
2041    
2042     $self->{last_stag_name} = $self->{ct}->{tag_name};
2043 wakaba 1.15
2044     $self->{state} = DATA_STATE;
2045     $self->{s_kwd} = '';
2046    
2047     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2048     $self->{line_prev} = $self->{line};
2049     $self->{column_prev} = $self->{column};
2050     $self->{column}++;
2051     $self->{nc}
2052     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2053     } else {
2054     $self->{set_nc}->($self);
2055     }
2056    
2057     return ($self->{ct}); # start tag
2058     redo A;
2059 wakaba 1.1 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
2060     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
2061     if ($self->{ct}->{attributes}) {
2062    
2063     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
2064     } else {
2065     ## NOTE: This state should never be reached.
2066    
2067     }
2068 wakaba 1.15
2069     $self->{state} = DATA_STATE;
2070     $self->{s_kwd} = '';
2071    
2072     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2073     $self->{line_prev} = $self->{line};
2074     $self->{column_prev} = $self->{column};
2075     $self->{column}++;
2076     $self->{nc}
2077     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2078     } else {
2079     $self->{set_nc}->($self);
2080     }
2081    
2082     return ($self->{ct}); # end tag
2083     redo A;
2084     } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
2085     push @{$self->{ct}->{attrdefs}}, $self->{ca};
2086     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2087    
2088 wakaba 1.1 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2089     $self->{line_prev} = $self->{line};
2090     $self->{column_prev} = $self->{column};
2091     $self->{column}++;
2092     $self->{nc}
2093     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2094     } else {
2095     $self->{set_nc}->($self);
2096     }
2097    
2098 wakaba 1.15 return ($self->{ct}); # ATTLIST
2099     redo A;
2100     } else {
2101     die "$0: $self->{ct}->{type}: Unknown token type";
2102     }
2103 wakaba 1.1 } elsif ($self->{nc} == -1) {
2104     if ($self->{ct}->{type} == START_TAG_TOKEN) {
2105    
2106 wakaba 1.15 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
2107 wakaba 1.1 $self->{last_stag_name} = $self->{ct}->{tag_name};
2108 wakaba 1.15
2109     $self->{state} = DATA_STATE;
2110     $self->{s_kwd} = '';
2111     ## reconsume
2112     return ($self->{ct}); # start tag
2113     redo A;
2114 wakaba 1.1 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
2115 wakaba 1.15 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
2116 wakaba 1.1 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
2117     if ($self->{ct}->{attributes}) {
2118    
2119     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
2120     } else {
2121     ## NOTE: This state should never be reached.
2122    
2123     }
2124 wakaba 1.15
2125     $self->{state} = DATA_STATE;
2126     $self->{s_kwd} = '';
2127     ## reconsume
2128     return ($self->{ct}); # end tag
2129     redo A;
2130     } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
2131     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
2132     push @{$self->{ct}->{attrdefs}}, $self->{ca};
2133     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2134     ## Reconsume.
2135     return ($self->{ct}); # ATTLIST
2136     redo A;
2137 wakaba 1.1 } else {
2138     die "$0: $self->{ct}->{type}: Unknown token type";
2139     }
2140     } else {
2141     if ({
2142     0x0022 => 1, # "
2143     0x0027 => 1, # '
2144     0x003D => 1, # =
2145     }->{$self->{nc}}) {
2146    
2147 wakaba 1.11 ## XML5: Not a parse error.
2148 wakaba 1.1 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute value');
2149     } else {
2150    
2151     }
2152     $self->{ca}->{value} .= chr ($self->{nc});
2153     $self->{read_until}->($self->{ca}->{value},
2154     q["'=& >],
2155     length $self->{ca}->{value});
2156    
2157     ## Stay in the state
2158    
2159     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2160     $self->{line_prev} = $self->{line};
2161     $self->{column_prev} = $self->{column};
2162     $self->{column}++;
2163     $self->{nc}
2164     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2165     } else {
2166     $self->{set_nc}->($self);
2167     }
2168    
2169     redo A;
2170     }
2171     } elsif ($self->{state} == AFTER_ATTRIBUTE_VALUE_QUOTED_STATE) {
2172     if ($is_space->{$self->{nc}}) {
2173    
2174     $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
2175    
2176     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2177     $self->{line_prev} = $self->{line};
2178     $self->{column_prev} = $self->{column};
2179     $self->{column}++;
2180     $self->{nc}
2181     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2182     } else {
2183     $self->{set_nc}->($self);
2184     }
2185    
2186     redo A;
2187     } elsif ($self->{nc} == 0x003E) { # >
2188     if ($self->{ct}->{type} == START_TAG_TOKEN) {
2189    
2190     $self->{last_stag_name} = $self->{ct}->{tag_name};
2191     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
2192     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
2193     if ($self->{ct}->{attributes}) {
2194    
2195     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
2196     } else {
2197     ## NOTE: This state should never be reached.
2198    
2199     }
2200     } else {
2201     die "$0: $self->{ct}->{type}: Unknown token type";
2202     }
2203     $self->{state} = DATA_STATE;
2204 wakaba 1.5 $self->{s_kwd} = '';
2205 wakaba 1.1
2206     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2207     $self->{line_prev} = $self->{line};
2208     $self->{column_prev} = $self->{column};
2209     $self->{column}++;
2210     $self->{nc}
2211     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2212     } else {
2213     $self->{set_nc}->($self);
2214     }
2215    
2216    
2217     return ($self->{ct}); # start tag or end tag
2218    
2219     redo A;
2220     } elsif ($self->{nc} == 0x002F) { # /
2221    
2222     $self->{state} = SELF_CLOSING_START_TAG_STATE;
2223    
2224     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2225     $self->{line_prev} = $self->{line};
2226     $self->{column_prev} = $self->{column};
2227     $self->{column}++;
2228     $self->{nc}
2229     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2230     } else {
2231     $self->{set_nc}->($self);
2232     }
2233    
2234     redo A;
2235     } elsif ($self->{nc} == -1) {
2236     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
2237     if ($self->{ct}->{type} == START_TAG_TOKEN) {
2238    
2239     $self->{last_stag_name} = $self->{ct}->{tag_name};
2240     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
2241     if ($self->{ct}->{attributes}) {
2242    
2243     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
2244     } else {
2245     ## NOTE: This state should never be reached.
2246    
2247     }
2248     } else {
2249     die "$0: $self->{ct}->{type}: Unknown token type";
2250     }
2251     $self->{state} = DATA_STATE;
2252 wakaba 1.5 $self->{s_kwd} = '';
2253 wakaba 1.1 ## Reconsume.
2254     return ($self->{ct}); # start tag or end tag
2255     redo A;
2256     } else {
2257    
2258     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space between attributes');
2259     $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
2260     ## reconsume
2261     redo A;
2262     }
2263     } elsif ($self->{state} == SELF_CLOSING_START_TAG_STATE) {
2264 wakaba 1.11 ## XML5: "Empty tag state".
2265    
2266 wakaba 1.1 if ($self->{nc} == 0x003E) { # >
2267     if ($self->{ct}->{type} == END_TAG_TOKEN) {
2268    
2269     $self->{parse_error}->(level => $self->{level}->{must}, type => 'nestc', token => $self->{ct});
2270     ## TODO: Different type than slash in start tag
2271     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
2272     if ($self->{ct}->{attributes}) {
2273    
2274     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
2275     } else {
2276    
2277     }
2278     ## TODO: Test |<title></title/>|
2279     } else {
2280    
2281     $self->{self_closing} = 1;
2282     }
2283    
2284     $self->{state} = DATA_STATE;
2285 wakaba 1.5 $self->{s_kwd} = '';
2286 wakaba 1.1
2287     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2288     $self->{line_prev} = $self->{line};
2289     $self->{column_prev} = $self->{column};
2290     $self->{column}++;
2291     $self->{nc}
2292     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2293     } else {
2294     $self->{set_nc}->($self);
2295     }
2296    
2297    
2298     return ($self->{ct}); # start tag or end tag
2299    
2300     redo A;
2301     } elsif ($self->{nc} == -1) {
2302     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
2303     if ($self->{ct}->{type} == START_TAG_TOKEN) {
2304    
2305     $self->{last_stag_name} = $self->{ct}->{tag_name};
2306     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
2307     if ($self->{ct}->{attributes}) {
2308    
2309     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
2310     } else {
2311     ## NOTE: This state should never be reached.
2312    
2313     }
2314     } else {
2315     die "$0: $self->{ct}->{type}: Unknown token type";
2316     }
2317 wakaba 1.11 ## XML5: "Tag attribute name before state".
2318 wakaba 1.1 $self->{state} = DATA_STATE;
2319 wakaba 1.5 $self->{s_kwd} = '';
2320 wakaba 1.1 ## Reconsume.
2321     return ($self->{ct}); # start tag or end tag
2322     redo A;
2323     } else {
2324    
2325     $self->{parse_error}->(level => $self->{level}->{must}, type => 'nestc');
2326     ## TODO: This error type is wrong.
2327     $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
2328     ## Reconsume.
2329     redo A;
2330     }
2331     } elsif ($self->{state} == BOGUS_COMMENT_STATE) {
2332 wakaba 1.14 ## XML5: "Bogus comment state" and "DOCTYPE bogus comment state".
2333    
2334 wakaba 1.1 ## NOTE: Unlike spec's "bogus comment state", this implementation
2335     ## consumes characters on a one-by-one basis.
2336    
2337     if ($self->{nc} == 0x003E) { # >
2338 wakaba 1.13 if ($self->{in_subset}) {
2339    
2340     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2341     } else {
2342    
2343     $self->{state} = DATA_STATE;
2344     $self->{s_kwd} = '';
2345     }
2346 wakaba 1.1
2347     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2348     $self->{line_prev} = $self->{line};
2349     $self->{column_prev} = $self->{column};
2350     $self->{column}++;
2351     $self->{nc}
2352     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2353     } else {
2354     $self->{set_nc}->($self);
2355     }
2356    
2357    
2358     return ($self->{ct}); # comment
2359     redo A;
2360     } elsif ($self->{nc} == -1) {
2361 wakaba 1.13 if ($self->{in_subset}) {
2362    
2363     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2364     } else {
2365    
2366     $self->{state} = DATA_STATE;
2367     $self->{s_kwd} = '';
2368     }
2369 wakaba 1.1 ## reconsume
2370    
2371     return ($self->{ct}); # comment
2372     redo A;
2373     } else {
2374    
2375     $self->{ct}->{data} .= chr ($self->{nc}); # comment
2376     $self->{read_until}->($self->{ct}->{data},
2377     q[>],
2378     length $self->{ct}->{data});
2379    
2380     ## Stay in the state.
2381    
2382     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2383     $self->{line_prev} = $self->{line};
2384     $self->{column_prev} = $self->{column};
2385     $self->{column}++;
2386     $self->{nc}
2387     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2388     } else {
2389     $self->{set_nc}->($self);
2390     }
2391    
2392     redo A;
2393     }
2394     } elsif ($self->{state} == MARKUP_DECLARATION_OPEN_STATE) {
2395 wakaba 1.14 ## XML5: "Markup declaration state".
2396 wakaba 1.1
2397     if ($self->{nc} == 0x002D) { # -
2398    
2399     $self->{state} = MD_HYPHEN_STATE;
2400    
2401     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2402     $self->{line_prev} = $self->{line};
2403     $self->{column_prev} = $self->{column};
2404     $self->{column}++;
2405     $self->{nc}
2406     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2407     } else {
2408     $self->{set_nc}->($self);
2409     }
2410    
2411     redo A;
2412     } elsif ($self->{nc} == 0x0044 or # D
2413     $self->{nc} == 0x0064) { # d
2414     ## ASCII case-insensitive.
2415    
2416     $self->{state} = MD_DOCTYPE_STATE;
2417 wakaba 1.12 $self->{kwd} = chr $self->{nc};
2418 wakaba 1.1
2419     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2420     $self->{line_prev} = $self->{line};
2421     $self->{column_prev} = $self->{column};
2422     $self->{column}++;
2423     $self->{nc}
2424     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2425     } else {
2426     $self->{set_nc}->($self);
2427     }
2428    
2429     redo A;
2430 wakaba 1.3 } elsif ((($self->{insertion_mode} & IN_FOREIGN_CONTENT_IM and
2431     $self->{open_elements}->[-1]->[1] & FOREIGN_EL) or
2432     $self->{is_xml}) and
2433 wakaba 1.1 $self->{nc} == 0x005B) { # [
2434    
2435     $self->{state} = MD_CDATA_STATE;
2436 wakaba 1.12 $self->{kwd} = '[';
2437 wakaba 1.1
2438     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2439     $self->{line_prev} = $self->{line};
2440     $self->{column_prev} = $self->{column};
2441     $self->{column}++;
2442     $self->{nc}
2443     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2444     } else {
2445     $self->{set_nc}->($self);
2446     }
2447    
2448     redo A;
2449     } else {
2450    
2451     }
2452    
2453     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
2454     line => $self->{line_prev},
2455     column => $self->{column_prev} - 1);
2456     ## Reconsume.
2457     $self->{state} = BOGUS_COMMENT_STATE;
2458     $self->{ct} = {type => COMMENT_TOKEN, data => '',
2459     line => $self->{line_prev},
2460     column => $self->{column_prev} - 1,
2461     };
2462     redo A;
2463     } elsif ($self->{state} == MD_HYPHEN_STATE) {
2464     if ($self->{nc} == 0x002D) { # -
2465    
2466     $self->{ct} = {type => COMMENT_TOKEN, data => '',
2467     line => $self->{line_prev},
2468     column => $self->{column_prev} - 2,
2469     };
2470 wakaba 1.10 $self->{state} = COMMENT_START_STATE; ## XML5: "comment state".
2471 wakaba 1.1
2472     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2473     $self->{line_prev} = $self->{line};
2474     $self->{column_prev} = $self->{column};
2475     $self->{column}++;
2476     $self->{nc}
2477     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2478     } else {
2479     $self->{set_nc}->($self);
2480     }
2481    
2482     redo A;
2483     } else {
2484    
2485     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
2486     line => $self->{line_prev},
2487     column => $self->{column_prev} - 2);
2488     $self->{state} = BOGUS_COMMENT_STATE;
2489     ## Reconsume.
2490     $self->{ct} = {type => COMMENT_TOKEN,
2491     data => '-',
2492     line => $self->{line_prev},
2493     column => $self->{column_prev} - 2,
2494     };
2495     redo A;
2496     }
2497     } elsif ($self->{state} == MD_DOCTYPE_STATE) {
2498     ## ASCII case-insensitive.
2499     if ($self->{nc} == [
2500     undef,
2501     0x004F, # O
2502     0x0043, # C
2503     0x0054, # T
2504     0x0059, # Y
2505     0x0050, # P
2506 wakaba 1.12 ]->[length $self->{kwd}] or
2507 wakaba 1.1 $self->{nc} == [
2508     undef,
2509     0x006F, # o
2510     0x0063, # c
2511     0x0074, # t
2512     0x0079, # y
2513     0x0070, # p
2514 wakaba 1.12 ]->[length $self->{kwd}]) {
2515 wakaba 1.1
2516     ## Stay in the state.
2517 wakaba 1.12 $self->{kwd} .= chr $self->{nc};
2518 wakaba 1.1
2519     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2520     $self->{line_prev} = $self->{line};
2521     $self->{column_prev} = $self->{column};
2522     $self->{column}++;
2523     $self->{nc}
2524     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2525     } else {
2526     $self->{set_nc}->($self);
2527     }
2528    
2529     redo A;
2530 wakaba 1.12 } elsif ((length $self->{kwd}) == 6 and
2531 wakaba 1.1 ($self->{nc} == 0x0045 or # E
2532     $self->{nc} == 0x0065)) { # e
2533 wakaba 1.12 if ($self->{is_xml} and
2534     ($self->{kwd} ne 'DOCTYP' or $self->{nc} == 0x0065)) {
2535 wakaba 1.10
2536     ## XML5: case-sensitive.
2537     $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO
2538     text => 'DOCTYPE',
2539     line => $self->{line_prev},
2540     column => $self->{column_prev} - 5);
2541     } else {
2542    
2543     }
2544 wakaba 1.1 $self->{state} = DOCTYPE_STATE;
2545     $self->{ct} = {type => DOCTYPE_TOKEN,
2546     quirks => 1,
2547     line => $self->{line_prev},
2548     column => $self->{column_prev} - 7,
2549     };
2550    
2551     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2552     $self->{line_prev} = $self->{line};
2553     $self->{column_prev} = $self->{column};
2554     $self->{column}++;
2555     $self->{nc}
2556     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2557     } else {
2558     $self->{set_nc}->($self);
2559     }
2560    
2561     redo A;
2562     } else {
2563    
2564     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
2565     line => $self->{line_prev},
2566 wakaba 1.12 column => $self->{column_prev} - 1 - length $self->{kwd});
2567 wakaba 1.1 $self->{state} = BOGUS_COMMENT_STATE;
2568     ## Reconsume.
2569     $self->{ct} = {type => COMMENT_TOKEN,
2570 wakaba 1.12 data => $self->{kwd},
2571 wakaba 1.1 line => $self->{line_prev},
2572 wakaba 1.12 column => $self->{column_prev} - 1 - length $self->{kwd},
2573 wakaba 1.1 };
2574     redo A;
2575     }
2576     } elsif ($self->{state} == MD_CDATA_STATE) {
2577     if ($self->{nc} == {
2578     '[' => 0x0043, # C
2579     '[C' => 0x0044, # D
2580     '[CD' => 0x0041, # A
2581     '[CDA' => 0x0054, # T
2582     '[CDAT' => 0x0041, # A
2583 wakaba 1.12 }->{$self->{kwd}}) {
2584 wakaba 1.1
2585     ## Stay in the state.
2586 wakaba 1.12 $self->{kwd} .= chr $self->{nc};
2587 wakaba 1.1
2588     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2589     $self->{line_prev} = $self->{line};
2590     $self->{column_prev} = $self->{column};
2591     $self->{column}++;
2592     $self->{nc}
2593     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2594     } else {
2595     $self->{set_nc}->($self);
2596     }
2597    
2598     redo A;
2599 wakaba 1.12 } elsif ($self->{kwd} eq '[CDATA' and
2600 wakaba 1.1 $self->{nc} == 0x005B) { # [
2601 wakaba 1.6 if ($self->{is_xml} and
2602     not $self->{tainted} and
2603     @{$self->{open_elements} or []} == 0) {
2604 wakaba 1.8
2605 wakaba 1.6 $self->{parse_error}->(level => $self->{level}->{must}, type => 'cdata outside of root element',
2606     line => $self->{line_prev},
2607     column => $self->{column_prev} - 7);
2608     $self->{tainted} = 1;
2609 wakaba 1.8 } else {
2610    
2611 wakaba 1.6 }
2612    
2613 wakaba 1.1 $self->{ct} = {type => CHARACTER_TOKEN,
2614     data => '',
2615     line => $self->{line_prev},
2616     column => $self->{column_prev} - 7};
2617     $self->{state} = CDATA_SECTION_STATE;
2618    
2619     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2620     $self->{line_prev} = $self->{line};
2621     $self->{column_prev} = $self->{column};
2622     $self->{column}++;
2623     $self->{nc}
2624     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2625     } else {
2626     $self->{set_nc}->($self);
2627     }
2628    
2629     redo A;
2630     } else {
2631    
2632     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
2633     line => $self->{line_prev},
2634 wakaba 1.12 column => $self->{column_prev} - 1 - length $self->{kwd});
2635 wakaba 1.1 $self->{state} = BOGUS_COMMENT_STATE;
2636     ## Reconsume.
2637     $self->{ct} = {type => COMMENT_TOKEN,
2638 wakaba 1.12 data => $self->{kwd},
2639 wakaba 1.1 line => $self->{line_prev},
2640 wakaba 1.12 column => $self->{column_prev} - 1 - length $self->{kwd},
2641 wakaba 1.1 };
2642     redo A;
2643     }
2644     } elsif ($self->{state} == COMMENT_START_STATE) {
2645     if ($self->{nc} == 0x002D) { # -
2646    
2647     $self->{state} = COMMENT_START_DASH_STATE;
2648    
2649     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2650     $self->{line_prev} = $self->{line};
2651     $self->{column_prev} = $self->{column};
2652     $self->{column}++;
2653     $self->{nc}
2654     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2655     } else {
2656     $self->{set_nc}->($self);
2657     }
2658    
2659     redo A;
2660     } elsif ($self->{nc} == 0x003E) { # >
2661     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment');
2662 wakaba 1.13 if ($self->{in_subset}) {
2663    
2664     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2665     } else {
2666    
2667     $self->{state} = DATA_STATE;
2668     $self->{s_kwd} = '';
2669     }
2670 wakaba 1.1
2671     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2672     $self->{line_prev} = $self->{line};
2673     $self->{column_prev} = $self->{column};
2674     $self->{column}++;
2675     $self->{nc}
2676     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2677     } else {
2678     $self->{set_nc}->($self);
2679     }
2680    
2681    
2682     return ($self->{ct}); # comment
2683    
2684     redo A;
2685     } elsif ($self->{nc} == -1) {
2686     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2687 wakaba 1.13 if ($self->{in_subset}) {
2688    
2689     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2690     } else {
2691    
2692     $self->{state} = DATA_STATE;
2693     $self->{s_kwd} = '';
2694     }
2695 wakaba 1.1 ## reconsume
2696    
2697     return ($self->{ct}); # comment
2698    
2699     redo A;
2700     } else {
2701    
2702     $self->{ct}->{data} # comment
2703     .= chr ($self->{nc});
2704     $self->{state} = COMMENT_STATE;
2705    
2706     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2707     $self->{line_prev} = $self->{line};
2708     $self->{column_prev} = $self->{column};
2709     $self->{column}++;
2710     $self->{nc}
2711     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2712     } else {
2713     $self->{set_nc}->($self);
2714     }
2715    
2716     redo A;
2717     }
2718     } elsif ($self->{state} == COMMENT_START_DASH_STATE) {
2719     if ($self->{nc} == 0x002D) { # -
2720    
2721     $self->{state} = COMMENT_END_STATE;
2722    
2723     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2724     $self->{line_prev} = $self->{line};
2725     $self->{column_prev} = $self->{column};
2726     $self->{column}++;
2727     $self->{nc}
2728     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2729     } else {
2730     $self->{set_nc}->($self);
2731     }
2732    
2733     redo A;
2734     } elsif ($self->{nc} == 0x003E) { # >
2735     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment');
2736 wakaba 1.13 if ($self->{in_subset}) {
2737    
2738     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2739     } else {
2740    
2741     $self->{state} = DATA_STATE;
2742     $self->{s_kwd} = '';
2743     }
2744 wakaba 1.1
2745     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2746     $self->{line_prev} = $self->{line};
2747     $self->{column_prev} = $self->{column};
2748     $self->{column}++;
2749     $self->{nc}
2750     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2751     } else {
2752     $self->{set_nc}->($self);
2753     }
2754    
2755    
2756     return ($self->{ct}); # comment
2757    
2758     redo A;
2759     } elsif ($self->{nc} == -1) {
2760     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2761 wakaba 1.13 if ($self->{in_subset}) {
2762    
2763     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2764     } else {
2765    
2766     $self->{state} = DATA_STATE;
2767     $self->{s_kwd} = '';
2768     }
2769 wakaba 1.1 ## reconsume
2770    
2771     return ($self->{ct}); # comment
2772    
2773     redo A;
2774     } else {
2775    
2776     $self->{ct}->{data} # comment
2777     .= '-' . chr ($self->{nc});
2778     $self->{state} = COMMENT_STATE;
2779    
2780     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2781     $self->{line_prev} = $self->{line};
2782     $self->{column_prev} = $self->{column};
2783     $self->{column}++;
2784     $self->{nc}
2785     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2786     } else {
2787     $self->{set_nc}->($self);
2788     }
2789    
2790     redo A;
2791     }
2792     } elsif ($self->{state} == COMMENT_STATE) {
2793 wakaba 1.14 ## XML5: "Comment state" and "DOCTYPE comment state".
2794    
2795 wakaba 1.1 if ($self->{nc} == 0x002D) { # -
2796    
2797     $self->{state} = COMMENT_END_DASH_STATE;
2798    
2799     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2800     $self->{line_prev} = $self->{line};
2801     $self->{column_prev} = $self->{column};
2802     $self->{column}++;
2803     $self->{nc}
2804     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2805     } else {
2806     $self->{set_nc}->($self);
2807     }
2808    
2809     redo A;
2810     } elsif ($self->{nc} == -1) {
2811     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2812 wakaba 1.13 if ($self->{in_subset}) {
2813    
2814     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2815     } else {
2816    
2817     $self->{state} = DATA_STATE;
2818     $self->{s_kwd} = '';
2819     }
2820 wakaba 1.1 ## reconsume
2821    
2822     return ($self->{ct}); # comment
2823    
2824     redo A;
2825     } else {
2826    
2827     $self->{ct}->{data} .= chr ($self->{nc}); # comment
2828     $self->{read_until}->($self->{ct}->{data},
2829     q[-],
2830     length $self->{ct}->{data});
2831    
2832     ## Stay in the state
2833    
2834     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2835     $self->{line_prev} = $self->{line};
2836     $self->{column_prev} = $self->{column};
2837     $self->{column}++;
2838     $self->{nc}
2839     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2840     } else {
2841     $self->{set_nc}->($self);
2842     }
2843    
2844     redo A;
2845     }
2846     } elsif ($self->{state} == COMMENT_END_DASH_STATE) {
2847 wakaba 1.14 ## XML5: "Comment dash state" and "DOCTYPE comment dash state".
2848 wakaba 1.10
2849 wakaba 1.1 if ($self->{nc} == 0x002D) { # -
2850    
2851     $self->{state} = COMMENT_END_STATE;
2852    
2853     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2854     $self->{line_prev} = $self->{line};
2855     $self->{column_prev} = $self->{column};
2856     $self->{column}++;
2857     $self->{nc}
2858     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2859     } else {
2860     $self->{set_nc}->($self);
2861     }
2862    
2863     redo A;
2864     } elsif ($self->{nc} == -1) {
2865     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2866 wakaba 1.13 if ($self->{in_subset}) {
2867    
2868     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2869     } else {
2870    
2871     $self->{state} = DATA_STATE;
2872     $self->{s_kwd} = '';
2873     }
2874 wakaba 1.1 ## reconsume
2875    
2876     return ($self->{ct}); # comment
2877    
2878     redo A;
2879     } else {
2880    
2881     $self->{ct}->{data} .= '-' . chr ($self->{nc}); # comment
2882     $self->{state} = COMMENT_STATE;
2883    
2884     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2885     $self->{line_prev} = $self->{line};
2886     $self->{column_prev} = $self->{column};
2887     $self->{column}++;
2888     $self->{nc}
2889     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2890     } else {
2891     $self->{set_nc}->($self);
2892     }
2893    
2894     redo A;
2895     }
2896     } elsif ($self->{state} == COMMENT_END_STATE) {
2897 wakaba 1.14 ## XML5: "Comment end state" and "DOCTYPE comment end state".
2898    
2899 wakaba 1.1 if ($self->{nc} == 0x003E) { # >
2900 wakaba 1.13 if ($self->{in_subset}) {
2901    
2902     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2903     } else {
2904    
2905     $self->{state} = DATA_STATE;
2906     $self->{s_kwd} = '';
2907     }
2908 wakaba 1.1
2909     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2910     $self->{line_prev} = $self->{line};
2911     $self->{column_prev} = $self->{column};
2912     $self->{column}++;
2913     $self->{nc}
2914     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2915     } else {
2916     $self->{set_nc}->($self);
2917     }
2918    
2919    
2920     return ($self->{ct}); # comment
2921    
2922     redo A;
2923     } elsif ($self->{nc} == 0x002D) { # -
2924    
2925 wakaba 1.10 ## XML5: Not a parse error.
2926 wakaba 1.1 $self->{parse_error}->(level => $self->{level}->{must}, type => 'dash in comment',
2927     line => $self->{line_prev},
2928     column => $self->{column_prev});
2929     $self->{ct}->{data} .= '-'; # comment
2930     ## Stay in the state
2931    
2932     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2933     $self->{line_prev} = $self->{line};
2934     $self->{column_prev} = $self->{column};
2935     $self->{column}++;
2936     $self->{nc}
2937     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2938     } else {
2939     $self->{set_nc}->($self);
2940     }
2941    
2942     redo A;
2943     } elsif ($self->{nc} == -1) {
2944     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2945 wakaba 1.13 if ($self->{in_subset}) {
2946    
2947     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2948     } else {
2949    
2950     $self->{state} = DATA_STATE;
2951     $self->{s_kwd} = '';
2952     }
2953 wakaba 1.1 ## reconsume
2954    
2955     return ($self->{ct}); # comment
2956    
2957     redo A;
2958     } else {
2959    
2960 wakaba 1.10 ## XML5: Not a parse error.
2961 wakaba 1.1 $self->{parse_error}->(level => $self->{level}->{must}, type => 'dash in comment',
2962     line => $self->{line_prev},
2963     column => $self->{column_prev});
2964     $self->{ct}->{data} .= '--' . chr ($self->{nc}); # comment
2965     $self->{state} = COMMENT_STATE;
2966    
2967     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2968     $self->{line_prev} = $self->{line};
2969     $self->{column_prev} = $self->{column};
2970     $self->{column}++;
2971     $self->{nc}
2972     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2973     } else {
2974     $self->{set_nc}->($self);
2975     }
2976    
2977     redo A;
2978     }
2979     } elsif ($self->{state} == DOCTYPE_STATE) {
2980     if ($is_space->{$self->{nc}}) {
2981    
2982     $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
2983    
2984     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2985     $self->{line_prev} = $self->{line};
2986     $self->{column_prev} = $self->{column};
2987     $self->{column}++;
2988     $self->{nc}
2989     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2990     } else {
2991     $self->{set_nc}->($self);
2992     }
2993    
2994     redo A;
2995     } else {
2996    
2997 wakaba 1.12 ## XML5: Unless EOF, switch to the bogus comment state.
2998 wakaba 1.1 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before DOCTYPE name');
2999     $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
3000     ## reconsume
3001     redo A;
3002     }
3003     } elsif ($self->{state} == BEFORE_DOCTYPE_NAME_STATE) {
3004 wakaba 1.12 ## XML5: "DOCTYPE root name before state".
3005    
3006 wakaba 1.1 if ($is_space->{$self->{nc}}) {
3007    
3008     ## Stay in the state
3009    
3010     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3011     $self->{line_prev} = $self->{line};
3012     $self->{column_prev} = $self->{column};
3013     $self->{column}++;
3014     $self->{nc}
3015     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3016     } else {
3017     $self->{set_nc}->($self);
3018     }
3019    
3020     redo A;
3021     } elsif ($self->{nc} == 0x003E) { # >
3022    
3023 wakaba 1.12 ## XML5: No parse error.
3024 wakaba 1.1 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no DOCTYPE name');
3025     $self->{state} = DATA_STATE;
3026 wakaba 1.5 $self->{s_kwd} = '';
3027 wakaba 1.1
3028     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3029     $self->{line_prev} = $self->{line};
3030     $self->{column_prev} = $self->{column};
3031     $self->{column}++;
3032     $self->{nc}
3033     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3034     } else {
3035     $self->{set_nc}->($self);
3036     }
3037    
3038    
3039     return ($self->{ct}); # DOCTYPE (quirks)
3040    
3041     redo A;
3042     } elsif ($self->{nc} == -1) {
3043    
3044     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no DOCTYPE name');
3045     $self->{state} = DATA_STATE;
3046 wakaba 1.5 $self->{s_kwd} = '';
3047 wakaba 1.1 ## reconsume
3048    
3049     return ($self->{ct}); # DOCTYPE (quirks)
3050    
3051     redo A;
3052 wakaba 1.12 } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
3053    
3054     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no DOCTYPE name');
3055     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3056 wakaba 1.13 $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
3057     $self->{in_subset} = 1;
3058 wakaba 1.12
3059     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3060     $self->{line_prev} = $self->{line};
3061     $self->{column_prev} = $self->{column};
3062     $self->{column}++;
3063     $self->{nc}
3064     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3065     } else {
3066     $self->{set_nc}->($self);
3067     }
3068    
3069 wakaba 1.13 return ($self->{ct}); # DOCTYPE
3070 wakaba 1.12 redo A;
3071 wakaba 1.1 } else {
3072    
3073     $self->{ct}->{name} = chr $self->{nc};
3074     delete $self->{ct}->{quirks};
3075     $self->{state} = DOCTYPE_NAME_STATE;
3076    
3077     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3078     $self->{line_prev} = $self->{line};
3079     $self->{column_prev} = $self->{column};
3080     $self->{column}++;
3081     $self->{nc}
3082     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3083     } else {
3084     $self->{set_nc}->($self);
3085     }
3086    
3087     redo A;
3088     }
3089     } elsif ($self->{state} == DOCTYPE_NAME_STATE) {
3090 wakaba 1.12 ## XML5: "DOCTYPE root name state".
3091    
3092     ## ISSUE: Redundant "First," in the spec.
3093    
3094 wakaba 1.1 if ($is_space->{$self->{nc}}) {
3095    
3096     $self->{state} = AFTER_DOCTYPE_NAME_STATE;
3097    
3098     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3099     $self->{line_prev} = $self->{line};
3100     $self->{column_prev} = $self->{column};
3101     $self->{column}++;
3102     $self->{nc}
3103     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3104     } else {
3105     $self->{set_nc}->($self);
3106     }
3107    
3108     redo A;
3109     } elsif ($self->{nc} == 0x003E) { # >
3110    
3111     $self->{state} = DATA_STATE;
3112 wakaba 1.5 $self->{s_kwd} = '';
3113 wakaba 1.1
3114     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3115     $self->{line_prev} = $self->{line};
3116     $self->{column_prev} = $self->{column};
3117     $self->{column}++;
3118     $self->{nc}
3119     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3120     } else {
3121     $self->{set_nc}->($self);
3122     }
3123    
3124    
3125     return ($self->{ct}); # DOCTYPE
3126    
3127     redo A;
3128     } elsif ($self->{nc} == -1) {
3129    
3130     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
3131     $self->{state} = DATA_STATE;
3132 wakaba 1.5 $self->{s_kwd} = '';
3133 wakaba 1.1 ## reconsume
3134    
3135     $self->{ct}->{quirks} = 1;
3136     return ($self->{ct}); # DOCTYPE
3137    
3138     redo A;
3139 wakaba 1.12 } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
3140    
3141     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3142 wakaba 1.13 $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
3143     $self->{in_subset} = 1;
3144 wakaba 1.12
3145     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3146     $self->{line_prev} = $self->{line};
3147     $self->{column_prev} = $self->{column};
3148     $self->{column}++;
3149     $self->{nc}
3150     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3151     } else {
3152     $self->{set_nc}->($self);
3153     }
3154    
3155 wakaba 1.13 return ($self->{ct}); # DOCTYPE
3156 wakaba 1.12 redo A;
3157 wakaba 1.1 } else {
3158    
3159     $self->{ct}->{name}
3160     .= chr ($self->{nc}); # DOCTYPE
3161     ## Stay in the state
3162    
3163     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3164     $self->{line_prev} = $self->{line};
3165     $self->{column_prev} = $self->{column};
3166     $self->{column}++;
3167     $self->{nc}
3168     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3169     } else {
3170     $self->{set_nc}->($self);
3171     }
3172    
3173     redo A;
3174     }
3175     } elsif ($self->{state} == AFTER_DOCTYPE_NAME_STATE) {
3176 wakaba 1.12 ## XML5: Corresponding to XML5's "DOCTYPE root name after
3177     ## state", but implemented differently.
3178    
3179 wakaba 1.1 if ($is_space->{$self->{nc}}) {
3180    
3181     ## Stay in the state
3182    
3183     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3184     $self->{line_prev} = $self->{line};
3185     $self->{column_prev} = $self->{column};
3186     $self->{column}++;
3187     $self->{nc}
3188     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3189     } else {
3190     $self->{set_nc}->($self);
3191     }
3192    
3193     redo A;
3194     } elsif ($self->{nc} == 0x003E) { # >
3195 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3196    
3197     $self->{state} = DATA_STATE;
3198     $self->{s_kwd} = '';
3199     } else {
3200    
3201     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no md def'); ## TODO: type
3202     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3203     }
3204 wakaba 1.1
3205    
3206     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3207     $self->{line_prev} = $self->{line};
3208     $self->{column_prev} = $self->{column};
3209     $self->{column}++;
3210     $self->{nc}
3211     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3212     } else {
3213     $self->{set_nc}->($self);
3214     }
3215    
3216 wakaba 1.16 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
3217 wakaba 1.1 redo A;
3218     } elsif ($self->{nc} == -1) {
3219 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3220    
3221     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
3222     $self->{state} = DATA_STATE;
3223     $self->{s_kwd} = '';
3224     $self->{ct}->{quirks} = 1;
3225     } else {
3226    
3227     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
3228     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3229     }
3230 wakaba 1.1
3231 wakaba 1.16 ## Reconsume.
3232     return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
3233 wakaba 1.1 redo A;
3234     } elsif ($self->{nc} == 0x0050 or # P
3235     $self->{nc} == 0x0070) { # p
3236 wakaba 1.12
3237 wakaba 1.1 $self->{state} = PUBLIC_STATE;
3238 wakaba 1.12 $self->{kwd} = chr $self->{nc};
3239 wakaba 1.1
3240     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3241     $self->{line_prev} = $self->{line};
3242     $self->{column_prev} = $self->{column};
3243     $self->{column}++;
3244     $self->{nc}
3245     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3246     } else {
3247     $self->{set_nc}->($self);
3248     }
3249    
3250     redo A;
3251     } elsif ($self->{nc} == 0x0053 or # S
3252     $self->{nc} == 0x0073) { # s
3253 wakaba 1.12
3254 wakaba 1.1 $self->{state} = SYSTEM_STATE;
3255 wakaba 1.12 $self->{kwd} = chr $self->{nc};
3256    
3257     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3258     $self->{line_prev} = $self->{line};
3259     $self->{column_prev} = $self->{column};
3260     $self->{column}++;
3261     $self->{nc}
3262     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3263     } else {
3264     $self->{set_nc}->($self);
3265     }
3266    
3267     redo A;
3268 wakaba 1.19 } elsif ($self->{nc} == 0x0022 and # "
3269     ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN or
3270     $self->{ct}->{type} == PARAMETER_ENTITY_TOKEN)) {
3271    
3272     $self->{state} = DOCTYPE_ENTITY_VALUE_DOUBLE_QUOTED_STATE;
3273     $self->{ct}->{value} = ''; # ENTITY
3274    
3275     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3276     $self->{line_prev} = $self->{line};
3277     $self->{column_prev} = $self->{column};
3278     $self->{column}++;
3279     $self->{nc}
3280     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3281     } else {
3282     $self->{set_nc}->($self);
3283     }
3284    
3285     redo A;
3286     } elsif ($self->{nc} == 0x0027 and # '
3287     ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN or
3288     $self->{ct}->{type} == PARAMETER_ENTITY_TOKEN)) {
3289    
3290     $self->{state} = DOCTYPE_ENTITY_VALUE_SINGLE_QUOTED_STATE;
3291     $self->{ct}->{value} = ''; # ENTITY
3292    
3293     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3294     $self->{line_prev} = $self->{line};
3295     $self->{column_prev} = $self->{column};
3296     $self->{column}++;
3297     $self->{nc}
3298     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3299     } else {
3300     $self->{set_nc}->($self);
3301     }
3302    
3303     redo A;
3304 wakaba 1.16 } elsif ($self->{is_xml} and
3305     $self->{ct}->{type} == DOCTYPE_TOKEN and
3306     $self->{nc} == 0x005B) { # [
3307 wakaba 1.12
3308     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3309     $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
3310 wakaba 1.13 $self->{in_subset} = 1;
3311 wakaba 1.1
3312     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3313     $self->{line_prev} = $self->{line};
3314     $self->{column_prev} = $self->{column};
3315     $self->{column}++;
3316     $self->{nc}
3317     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3318     } else {
3319     $self->{set_nc}->($self);
3320     }
3321    
3322 wakaba 1.13 return ($self->{ct}); # DOCTYPE
3323 wakaba 1.1 redo A;
3324     } else {
3325 wakaba 1.16 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after DOCTYPE name'); ## TODO: type
3326    
3327     if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3328    
3329     $self->{ct}->{quirks} = 1;
3330     $self->{state} = BOGUS_DOCTYPE_STATE;
3331     } else {
3332    
3333     $self->{state} = BOGUS_MD_STATE;
3334     }
3335 wakaba 1.1
3336    
3337     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3338     $self->{line_prev} = $self->{line};
3339     $self->{column_prev} = $self->{column};
3340     $self->{column}++;
3341     $self->{nc}
3342     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3343     } else {
3344     $self->{set_nc}->($self);
3345     }
3346    
3347     redo A;
3348     }
3349     } elsif ($self->{state} == PUBLIC_STATE) {
3350     ## ASCII case-insensitive
3351     if ($self->{nc} == [
3352     undef,
3353     0x0055, # U
3354     0x0042, # B
3355     0x004C, # L
3356     0x0049, # I
3357 wakaba 1.12 ]->[length $self->{kwd}] or
3358 wakaba 1.1 $self->{nc} == [
3359     undef,
3360     0x0075, # u
3361     0x0062, # b
3362     0x006C, # l
3363     0x0069, # i
3364 wakaba 1.12 ]->[length $self->{kwd}]) {
3365 wakaba 1.1
3366     ## Stay in the state.
3367 wakaba 1.12 $self->{kwd} .= chr $self->{nc};
3368 wakaba 1.1
3369     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3370     $self->{line_prev} = $self->{line};
3371     $self->{column_prev} = $self->{column};
3372     $self->{column}++;
3373     $self->{nc}
3374     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3375     } else {
3376     $self->{set_nc}->($self);
3377     }
3378    
3379     redo A;
3380 wakaba 1.12 } elsif ((length $self->{kwd}) == 5 and
3381 wakaba 1.1 ($self->{nc} == 0x0043 or # C
3382     $self->{nc} == 0x0063)) { # c
3383 wakaba 1.12 if ($self->{is_xml} and
3384     ($self->{kwd} ne 'PUBLI' or $self->{nc} == 0x0063)) { # c
3385    
3386     $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
3387     text => 'PUBLIC',
3388     line => $self->{line_prev},
3389     column => $self->{column_prev} - 4);
3390     } else {
3391    
3392     }
3393 wakaba 1.1 $self->{state} = BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
3394    
3395     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3396     $self->{line_prev} = $self->{line};
3397     $self->{column_prev} = $self->{column};
3398     $self->{column}++;
3399     $self->{nc}
3400     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3401     } else {
3402     $self->{set_nc}->($self);
3403     }
3404    
3405     redo A;
3406     } else {
3407 wakaba 1.16 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after DOCTYPE name', ## TODO: type
3408 wakaba 1.1 line => $self->{line_prev},
3409 wakaba 1.12 column => $self->{column_prev} + 1 - length $self->{kwd});
3410 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3411    
3412     $self->{ct}->{quirks} = 1;
3413     $self->{state} = BOGUS_DOCTYPE_STATE;
3414     } else {
3415    
3416     $self->{state} = BOGUS_MD_STATE;
3417     }
3418 wakaba 1.1 ## Reconsume.
3419     redo A;
3420     }
3421     } elsif ($self->{state} == SYSTEM_STATE) {
3422     ## ASCII case-insensitive
3423     if ($self->{nc} == [
3424     undef,
3425     0x0059, # Y
3426     0x0053, # S
3427     0x0054, # T
3428     0x0045, # E
3429 wakaba 1.12 ]->[length $self->{kwd}] or
3430 wakaba 1.1 $self->{nc} == [
3431     undef,
3432     0x0079, # y
3433     0x0073, # s
3434     0x0074, # t
3435     0x0065, # e
3436 wakaba 1.12 ]->[length $self->{kwd}]) {
3437 wakaba 1.1
3438     ## Stay in the state.
3439 wakaba 1.12 $self->{kwd} .= chr $self->{nc};
3440 wakaba 1.1
3441     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3442     $self->{line_prev} = $self->{line};
3443     $self->{column_prev} = $self->{column};
3444     $self->{column}++;
3445     $self->{nc}
3446     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3447     } else {
3448     $self->{set_nc}->($self);
3449     }
3450    
3451     redo A;
3452 wakaba 1.12 } elsif ((length $self->{kwd}) == 5 and
3453 wakaba 1.1 ($self->{nc} == 0x004D or # M
3454     $self->{nc} == 0x006D)) { # m
3455 wakaba 1.12 if ($self->{is_xml} and
3456     ($self->{kwd} ne 'SYSTE' or $self->{nc} == 0x006D)) { # m
3457    
3458     $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
3459     text => 'SYSTEM',
3460     line => $self->{line_prev},
3461     column => $self->{column_prev} - 4);
3462     } else {
3463    
3464     }
3465 wakaba 1.1 $self->{state} = BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
3466    
3467     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3468     $self->{line_prev} = $self->{line};
3469     $self->{column_prev} = $self->{column};
3470     $self->{column}++;
3471     $self->{nc}
3472     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3473     } else {
3474     $self->{set_nc}->($self);
3475     }
3476    
3477     redo A;
3478     } else {
3479 wakaba 1.16 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after DOCTYPE name', ## TODO: type
3480 wakaba 1.1 line => $self->{line_prev},
3481 wakaba 1.12 column => $self->{column_prev} + 1 - length $self->{kwd});
3482 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3483    
3484     $self->{ct}->{quirks} = 1;
3485     $self->{state} = BOGUS_DOCTYPE_STATE;
3486     } else {
3487    
3488     $self->{state} = BOGUS_MD_STATE;
3489     }
3490 wakaba 1.1 ## Reconsume.
3491     redo A;
3492     }
3493     } elsif ($self->{state} == BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE) {
3494     if ($is_space->{$self->{nc}}) {
3495    
3496     ## Stay in the state
3497    
3498     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3499     $self->{line_prev} = $self->{line};
3500     $self->{column_prev} = $self->{column};
3501     $self->{column}++;
3502     $self->{nc}
3503     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3504     } else {
3505     $self->{set_nc}->($self);
3506     }
3507    
3508     redo A;
3509     } elsif ($self->{nc} eq 0x0022) { # "
3510    
3511     $self->{ct}->{pubid} = ''; # DOCTYPE
3512     $self->{state} = DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE;
3513    
3514     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3515     $self->{line_prev} = $self->{line};
3516     $self->{column_prev} = $self->{column};
3517     $self->{column}++;
3518     $self->{nc}
3519     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3520     } else {
3521     $self->{set_nc}->($self);
3522     }
3523    
3524     redo A;
3525     } elsif ($self->{nc} eq 0x0027) { # '
3526    
3527     $self->{ct}->{pubid} = ''; # DOCTYPE
3528     $self->{state} = DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE;
3529    
3530     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3531     $self->{line_prev} = $self->{line};
3532     $self->{column_prev} = $self->{column};
3533     $self->{column}++;
3534     $self->{nc}
3535     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3536     } else {
3537     $self->{set_nc}->($self);
3538     }
3539    
3540     redo A;
3541     } elsif ($self->{nc} eq 0x003E) { # >
3542 wakaba 1.16 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no PUBLIC literal');
3543    
3544     if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3545    
3546     $self->{state} = DATA_STATE;
3547     $self->{s_kwd} = '';
3548     $self->{ct}->{quirks} = 1;
3549     } else {
3550    
3551     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3552     }
3553 wakaba 1.1
3554    
3555     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3556     $self->{line_prev} = $self->{line};
3557     $self->{column_prev} = $self->{column};
3558     $self->{column}++;
3559     $self->{nc}
3560     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3561     } else {
3562     $self->{set_nc}->($self);
3563     }
3564    
3565 wakaba 1.16 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
3566 wakaba 1.1 redo A;
3567     } elsif ($self->{nc} == -1) {
3568 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3569    
3570     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
3571     $self->{state} = DATA_STATE;
3572     $self->{s_kwd} = '';
3573     $self->{ct}->{quirks} = 1;
3574     } else {
3575    
3576     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
3577     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3578     }
3579 wakaba 1.1
3580     ## reconsume
3581     return ($self->{ct}); # DOCTYPE
3582     redo A;
3583 wakaba 1.16 } elsif ($self->{is_xml} and
3584     $self->{ct}->{type} == DOCTYPE_TOKEN and
3585     $self->{nc} == 0x005B) { # [
3586 wakaba 1.12
3587     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no PUBLIC literal');
3588     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3589     $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
3590 wakaba 1.13 $self->{in_subset} = 1;
3591 wakaba 1.12
3592     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3593     $self->{line_prev} = $self->{line};
3594     $self->{column_prev} = $self->{column};
3595     $self->{column}++;
3596     $self->{nc}
3597     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3598     } else {
3599     $self->{set_nc}->($self);
3600     }
3601    
3602 wakaba 1.13 return ($self->{ct}); # DOCTYPE
3603 wakaba 1.12 redo A;
3604 wakaba 1.1 } else {
3605     $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after PUBLIC');
3606    
3607 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3608    
3609     $self->{ct}->{quirks} = 1;
3610     $self->{state} = BOGUS_DOCTYPE_STATE;
3611     } else {
3612    
3613     $self->{state} = BOGUS_MD_STATE;
3614     }
3615    
3616 wakaba 1.1
3617     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3618     $self->{line_prev} = $self->{line};
3619     $self->{column_prev} = $self->{column};
3620     $self->{column}++;
3621     $self->{nc}
3622     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3623     } else {
3624     $self->{set_nc}->($self);
3625     }
3626    
3627     redo A;
3628     }
3629     } elsif ($self->{state} == DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE) {
3630     if ($self->{nc} == 0x0022) { # "
3631    
3632     $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
3633    
3634     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3635     $self->{line_prev} = $self->{line};
3636     $self->{column_prev} = $self->{column};
3637     $self->{column}++;
3638     $self->{nc}
3639     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3640     } else {
3641     $self->{set_nc}->($self);
3642     }
3643    
3644     redo A;
3645     } elsif ($self->{nc} == 0x003E) { # >
3646     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');
3647    
3648 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3649    
3650     $self->{state} = DATA_STATE;
3651     $self->{s_kwd} = '';
3652     $self->{ct}->{quirks} = 1;
3653     } else {
3654    
3655     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3656     }
3657    
3658 wakaba 1.1
3659     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3660     $self->{line_prev} = $self->{line};
3661     $self->{column_prev} = $self->{column};
3662     $self->{column}++;
3663     $self->{nc}
3664     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3665     } else {
3666     $self->{set_nc}->($self);
3667     }
3668    
3669 wakaba 1.16 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
3670 wakaba 1.1 redo A;
3671     } elsif ($self->{nc} == -1) {
3672     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');
3673    
3674 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3675    
3676     $self->{state} = DATA_STATE;
3677     $self->{s_kwd} = '';
3678     $self->{ct}->{quirks} = 1;
3679     } else {
3680    
3681     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3682     }
3683    
3684     ## Reconsume.
3685 wakaba 1.1 return ($self->{ct}); # DOCTYPE
3686     redo A;
3687     } else {
3688    
3689 wakaba 1.16 $self->{ct}->{pubid} .= chr $self->{nc}; # DOCTYPE/ENTITY/NOTATION
3690 wakaba 1.1 $self->{read_until}->($self->{ct}->{pubid}, q[">],
3691     length $self->{ct}->{pubid});
3692    
3693     ## Stay in the state
3694    
3695     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3696     $self->{line_prev} = $self->{line};
3697     $self->{column_prev} = $self->{column};
3698     $self->{column}++;
3699     $self->{nc}
3700     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3701     } else {
3702     $self->{set_nc}->($self);
3703     }
3704    
3705     redo A;
3706     }
3707     } elsif ($self->{state} == DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE) {
3708     if ($self->{nc} == 0x0027) { # '
3709    
3710     $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
3711    
3712     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3713     $self->{line_prev} = $self->{line};
3714     $self->{column_prev} = $self->{column};
3715     $self->{column}++;
3716     $self->{nc}
3717     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3718     } else {
3719     $self->{set_nc}->($self);
3720     }
3721    
3722     redo A;
3723     } elsif ($self->{nc} == 0x003E) { # >
3724     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');
3725    
3726 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3727    
3728     $self->{state} = DATA_STATE;
3729     $self->{s_kwd} = '';
3730     $self->{ct}->{quirks} = 1;
3731     } else {
3732    
3733     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3734     }
3735    
3736 wakaba 1.1
3737     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3738     $self->{line_prev} = $self->{line};
3739     $self->{column_prev} = $self->{column};
3740     $self->{column}++;
3741     $self->{nc}
3742     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3743     } else {
3744     $self->{set_nc}->($self);
3745     }
3746    
3747 wakaba 1.16 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
3748 wakaba 1.1 redo A;
3749     } elsif ($self->{nc} == -1) {
3750     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');
3751    
3752 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3753    
3754     $self->{state} = DATA_STATE;
3755     $self->{s_kwd} = '';
3756     $self->{ct}->{quirks} = 1;
3757     } else {
3758    
3759     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3760     }
3761    
3762 wakaba 1.1 ## reconsume
3763 wakaba 1.16 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
3764 wakaba 1.1 redo A;
3765     } else {
3766    
3767 wakaba 1.16 $self->{ct}->{pubid} .= chr $self->{nc}; # DOCTYPE/ENTITY/NOTATION
3768 wakaba 1.1 $self->{read_until}->($self->{ct}->{pubid}, q['>],
3769     length $self->{ct}->{pubid});
3770    
3771     ## Stay in the state
3772    
3773     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3774     $self->{line_prev} = $self->{line};
3775     $self->{column_prev} = $self->{column};
3776     $self->{column}++;
3777     $self->{nc}
3778     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3779     } else {
3780     $self->{set_nc}->($self);
3781     }
3782    
3783     redo A;
3784     }
3785     } elsif ($self->{state} == AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE) {
3786     if ($is_space->{$self->{nc}}) {
3787    
3788     ## Stay in the state
3789    
3790     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3791     $self->{line_prev} = $self->{line};
3792     $self->{column_prev} = $self->{column};
3793     $self->{column}++;
3794     $self->{nc}
3795     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3796     } else {
3797     $self->{set_nc}->($self);
3798     }
3799    
3800     redo A;
3801     } elsif ($self->{nc} == 0x0022) { # "
3802    
3803 wakaba 1.16 $self->{ct}->{sysid} = ''; # DOCTYPE/ENTITY/NOTATION
3804 wakaba 1.1 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
3805    
3806     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3807     $self->{line_prev} = $self->{line};
3808     $self->{column_prev} = $self->{column};
3809     $self->{column}++;
3810     $self->{nc}
3811     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3812     } else {
3813     $self->{set_nc}->($self);
3814     }
3815    
3816     redo A;
3817     } elsif ($self->{nc} == 0x0027) { # '
3818    
3819 wakaba 1.16 $self->{ct}->{sysid} = ''; # DOCTYPE/ENTITY/NOTATION
3820 wakaba 1.1 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
3821    
3822     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3823     $self->{line_prev} = $self->{line};
3824     $self->{column_prev} = $self->{column};
3825     $self->{column}++;
3826     $self->{nc}
3827     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3828     } else {
3829     $self->{set_nc}->($self);
3830     }
3831    
3832     redo A;
3833     } elsif ($self->{nc} == 0x003E) { # >
3834 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3835     if ($self->{is_xml}) {
3836    
3837     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal');
3838     } else {
3839    
3840     }
3841     $self->{state} = DATA_STATE;
3842     $self->{s_kwd} = '';
3843 wakaba 1.12 } else {
3844 wakaba 1.16 if ($self->{ct}->{type} == NOTATION_TOKEN) {
3845    
3846     } else {
3847    
3848     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal');
3849     }
3850     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3851 wakaba 1.12 }
3852 wakaba 1.16
3853 wakaba 1.1
3854     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3855     $self->{line_prev} = $self->{line};
3856     $self->{column_prev} = $self->{column};
3857     $self->{column}++;
3858     $self->{nc}
3859     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3860     } else {
3861     $self->{set_nc}->($self);
3862     }
3863    
3864 wakaba 1.16 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
3865 wakaba 1.1 redo A;
3866     } elsif ($self->{nc} == -1) {
3867 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3868    
3869     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
3870    
3871     $self->{state} = DATA_STATE;
3872     $self->{s_kwd} = '';
3873     $self->{ct}->{quirks} = 1;
3874     } else {
3875     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
3876     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3877     }
3878 wakaba 1.1
3879     ## reconsume
3880 wakaba 1.16 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
3881 wakaba 1.1 redo A;
3882 wakaba 1.16 } elsif ($self->{is_xml} and
3883     $self->{ct}->{type} == DOCTYPE_TOKEN and
3884     $self->{nc} == 0x005B) { # [
3885 wakaba 1.12
3886     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal');
3887     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3888     $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
3889 wakaba 1.13 $self->{in_subset} = 1;
3890 wakaba 1.12
3891     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3892     $self->{line_prev} = $self->{line};
3893     $self->{column_prev} = $self->{column};
3894     $self->{column}++;
3895     $self->{nc}
3896     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3897     } else {
3898     $self->{set_nc}->($self);
3899     }
3900    
3901 wakaba 1.13 return ($self->{ct}); # DOCTYPE
3902 wakaba 1.12 redo A;
3903 wakaba 1.1 } else {
3904     $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after PUBLIC literal');
3905    
3906 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3907    
3908     $self->{ct}->{quirks} = 1;
3909     $self->{state} = BOGUS_DOCTYPE_STATE;
3910     } else {
3911    
3912     $self->{state} = BOGUS_MD_STATE;
3913     }
3914    
3915 wakaba 1.1
3916     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3917     $self->{line_prev} = $self->{line};
3918     $self->{column_prev} = $self->{column};
3919     $self->{column}++;
3920     $self->{nc}
3921     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3922     } else {
3923     $self->{set_nc}->($self);
3924     }
3925    
3926     redo A;
3927     }
3928     } elsif ($self->{state} == BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
3929     if ($is_space->{$self->{nc}}) {
3930    
3931     ## Stay in the state
3932    
3933     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3934     $self->{line_prev} = $self->{line};
3935     $self->{column_prev} = $self->{column};
3936     $self->{column}++;
3937     $self->{nc}
3938     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3939     } else {
3940     $self->{set_nc}->($self);
3941     }
3942    
3943     redo A;
3944     } elsif ($self->{nc} == 0x0022) { # "
3945    
3946     $self->{ct}->{sysid} = ''; # DOCTYPE
3947     $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
3948    
3949     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3950     $self->{line_prev} = $self->{line};
3951     $self->{column_prev} = $self->{column};
3952     $self->{column}++;
3953     $self->{nc}
3954     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3955     } else {
3956     $self->{set_nc}->($self);
3957     }
3958    
3959     redo A;
3960     } elsif ($self->{nc} == 0x0027) { # '
3961    
3962     $self->{ct}->{sysid} = ''; # DOCTYPE
3963     $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
3964    
3965     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3966     $self->{line_prev} = $self->{line};
3967     $self->{column_prev} = $self->{column};
3968     $self->{column}++;
3969     $self->{nc}
3970     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3971     } else {
3972     $self->{set_nc}->($self);
3973     }
3974    
3975     redo A;
3976     } elsif ($self->{nc} == 0x003E) { # >
3977     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal');
3978    
3979     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3980     $self->{line_prev} = $self->{line};
3981     $self->{column_prev} = $self->{column};
3982     $self->{column}++;
3983     $self->{nc}
3984     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3985     } else {
3986     $self->{set_nc}->($self);
3987     }
3988    
3989    
3990 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3991    
3992     $self->{state} = DATA_STATE;
3993     $self->{s_kwd} = '';
3994     $self->{ct}->{quirks} = 1;
3995     } else {
3996    
3997     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3998     }
3999 wakaba 1.1
4000 wakaba 1.16 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
4001 wakaba 1.1 redo A;
4002     } elsif ($self->{nc} == -1) {
4003 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4004    
4005     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
4006     $self->{state} = DATA_STATE;
4007     $self->{s_kwd} = '';
4008     $self->{ct}->{quirks} = 1;
4009     } else {
4010    
4011     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
4012     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4013     }
4014 wakaba 1.1
4015     ## reconsume
4016 wakaba 1.16 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
4017 wakaba 1.1 redo A;
4018 wakaba 1.16 } elsif ($self->{is_xml} and
4019     $self->{ct}->{type} == DOCTYPE_TOKEN and
4020     $self->{nc} == 0x005B) { # [
4021 wakaba 1.12
4022     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal');
4023    
4024     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4025     $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
4026 wakaba 1.13 $self->{in_subset} = 1;
4027 wakaba 1.12
4028     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4029     $self->{line_prev} = $self->{line};
4030     $self->{column_prev} = $self->{column};
4031     $self->{column}++;
4032     $self->{nc}
4033     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4034     } else {
4035     $self->{set_nc}->($self);
4036     }
4037    
4038 wakaba 1.13 return ($self->{ct}); # DOCTYPE
4039 wakaba 1.12 redo A;
4040 wakaba 1.1 } else {
4041     $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after SYSTEM');
4042    
4043 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4044    
4045     $self->{ct}->{quirks} = 1;
4046     $self->{state} = BOGUS_DOCTYPE_STATE;
4047     } else {
4048    
4049     $self->{state} = BOGUS_MD_STATE;
4050     }
4051    
4052 wakaba 1.1
4053     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4054     $self->{line_prev} = $self->{line};
4055     $self->{column_prev} = $self->{column};
4056     $self->{column}++;
4057     $self->{nc}
4058     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4059     } else {
4060     $self->{set_nc}->($self);
4061     }
4062    
4063     redo A;
4064     }
4065     } elsif ($self->{state} == DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE) {
4066     if ($self->{nc} == 0x0022) { # "
4067    
4068     $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
4069    
4070     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4071     $self->{line_prev} = $self->{line};
4072     $self->{column_prev} = $self->{column};
4073     $self->{column}++;
4074     $self->{nc}
4075     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4076     } else {
4077     $self->{set_nc}->($self);
4078     }
4079    
4080     redo A;
4081 wakaba 1.12 } elsif (not $self->{is_xml} and $self->{nc} == 0x003E) { # >
4082 wakaba 1.1 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');
4083    
4084 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4085    
4086     $self->{state} = DATA_STATE;
4087     $self->{s_kwd} = '';
4088     $self->{ct}->{quirks} = 1;
4089     } else {
4090    
4091     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4092     }
4093    
4094 wakaba 1.1
4095     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4096     $self->{line_prev} = $self->{line};
4097     $self->{column_prev} = $self->{column};
4098     $self->{column}++;
4099     $self->{nc}
4100     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4101     } else {
4102     $self->{set_nc}->($self);
4103     }
4104    
4105 wakaba 1.16 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
4106 wakaba 1.1 redo A;
4107     } elsif ($self->{nc} == -1) {
4108     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');
4109    
4110 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4111    
4112     $self->{state} = DATA_STATE;
4113     $self->{s_kwd} = '';
4114     $self->{ct}->{quirks} = 1;
4115     } else {
4116    
4117     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4118     }
4119    
4120 wakaba 1.1 ## reconsume
4121 wakaba 1.16 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
4122 wakaba 1.1 redo A;
4123     } else {
4124    
4125 wakaba 1.16 $self->{ct}->{sysid} .= chr $self->{nc}; # DOCTYPE/ENTITY/NOTATION
4126 wakaba 1.1 $self->{read_until}->($self->{ct}->{sysid}, q[">],
4127     length $self->{ct}->{sysid});
4128    
4129     ## Stay in the state
4130    
4131     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4132     $self->{line_prev} = $self->{line};
4133     $self->{column_prev} = $self->{column};
4134     $self->{column}++;
4135     $self->{nc}
4136     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4137     } else {
4138     $self->{set_nc}->($self);
4139     }
4140    
4141     redo A;
4142     }
4143     } elsif ($self->{state} == DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE) {
4144     if ($self->{nc} == 0x0027) { # '
4145    
4146     $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
4147    
4148     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4149     $self->{line_prev} = $self->{line};
4150     $self->{column_prev} = $self->{column};
4151     $self->{column}++;
4152     $self->{nc}
4153     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4154     } else {
4155     $self->{set_nc}->($self);
4156     }
4157    
4158     redo A;
4159 wakaba 1.12 } elsif (not $self->{is_xml} and $self->{nc} == 0x003E) { # >
4160 wakaba 1.1
4161     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');
4162    
4163     $self->{state} = DATA_STATE;
4164 wakaba 1.5 $self->{s_kwd} = '';
4165 wakaba 1.1
4166     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4167     $self->{line_prev} = $self->{line};
4168     $self->{column_prev} = $self->{column};
4169     $self->{column}++;
4170     $self->{nc}
4171     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4172     } else {
4173     $self->{set_nc}->($self);
4174     }
4175    
4176    
4177     $self->{ct}->{quirks} = 1;
4178     return ($self->{ct}); # DOCTYPE
4179    
4180     redo A;
4181     } elsif ($self->{nc} == -1) {
4182     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');
4183    
4184 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4185    
4186     $self->{state} = DATA_STATE;
4187     $self->{s_kwd} = '';
4188     $self->{ct}->{quirks} = 1;
4189     } else {
4190    
4191     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4192     }
4193    
4194 wakaba 1.1 ## reconsume
4195 wakaba 1.16 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
4196 wakaba 1.1 redo A;
4197     } else {
4198    
4199 wakaba 1.16 $self->{ct}->{sysid} .= chr $self->{nc}; # DOCTYPE/ENTITY/NOTATION
4200 wakaba 1.1 $self->{read_until}->($self->{ct}->{sysid}, q['>],
4201     length $self->{ct}->{sysid});
4202    
4203     ## Stay in the state
4204    
4205     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4206     $self->{line_prev} = $self->{line};
4207     $self->{column_prev} = $self->{column};
4208     $self->{column}++;
4209     $self->{nc}
4210     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4211     } else {
4212     $self->{set_nc}->($self);
4213     }
4214    
4215     redo A;
4216     }
4217     } elsif ($self->{state} == AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
4218     if ($is_space->{$self->{nc}}) {
4219 wakaba 1.18 if ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN) {
4220    
4221     $self->{state} = BEFORE_NDATA_STATE;
4222     } else {
4223    
4224     ## Stay in the state
4225     }
4226 wakaba 1.1
4227     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4228     $self->{line_prev} = $self->{line};
4229     $self->{column_prev} = $self->{column};
4230     $self->{column}++;
4231     $self->{nc}
4232     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4233     } else {
4234     $self->{set_nc}->($self);
4235     }
4236    
4237     redo A;
4238     } elsif ($self->{nc} == 0x003E) { # >
4239 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4240    
4241     $self->{state} = DATA_STATE;
4242     $self->{s_kwd} = '';
4243     } else {
4244    
4245     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4246     }
4247    
4248 wakaba 1.1
4249     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4250     $self->{line_prev} = $self->{line};
4251     $self->{column_prev} = $self->{column};
4252     $self->{column}++;
4253     $self->{nc}
4254     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4255     } else {
4256     $self->{set_nc}->($self);
4257     }
4258    
4259 wakaba 1.16 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
4260 wakaba 1.1 redo A;
4261 wakaba 1.18 } elsif ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN and
4262     ($self->{nc} == 0x004E or # N
4263     $self->{nc} == 0x006E)) { # n
4264    
4265     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before NDATA'); ## TODO: type
4266     $self->{state} = NDATA_STATE;
4267     $self->{kwd} = chr $self->{nc};
4268    
4269     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4270     $self->{line_prev} = $self->{line};
4271     $self->{column_prev} = $self->{column};
4272     $self->{column}++;
4273     $self->{nc}
4274     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4275     } else {
4276     $self->{set_nc}->($self);
4277     }
4278    
4279     redo A;
4280 wakaba 1.1 } elsif ($self->{nc} == -1) {
4281 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4282    
4283     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
4284     $self->{state} = DATA_STATE;
4285     $self->{s_kwd} = '';
4286     $self->{ct}->{quirks} = 1;
4287     } else {
4288    
4289     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
4290     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4291     }
4292    
4293 wakaba 1.1 ## reconsume
4294 wakaba 1.16 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
4295 wakaba 1.1 redo A;
4296 wakaba 1.16 } elsif ($self->{is_xml} and
4297     $self->{ct}->{type} == DOCTYPE_TOKEN and
4298     $self->{nc} == 0x005B) { # [
4299 wakaba 1.12
4300     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4301     $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
4302 wakaba 1.13 $self->{in_subset} = 1;
4303 wakaba 1.12
4304     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4305     $self->{line_prev} = $self->{line};
4306     $self->{column_prev} = $self->{column};
4307     $self->{column}++;
4308     $self->{nc}
4309     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4310     } else {
4311     $self->{set_nc}->($self);
4312     }
4313    
4314 wakaba 1.13 return ($self->{ct}); # DOCTYPE
4315 wakaba 1.12 redo A;
4316 wakaba 1.1 } else {
4317     $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after SYSTEM literal');
4318    
4319 wakaba 1.16 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4320    
4321     #$self->{ct}->{quirks} = 1;
4322     $self->{state} = BOGUS_DOCTYPE_STATE;
4323     } else {
4324    
4325     $self->{state} = BOGUS_MD_STATE;
4326     }
4327    
4328 wakaba 1.1
4329     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4330     $self->{line_prev} = $self->{line};
4331     $self->{column_prev} = $self->{column};
4332     $self->{column}++;
4333     $self->{nc}
4334     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4335     } else {
4336     $self->{set_nc}->($self);
4337     }
4338    
4339     redo A;
4340     }
4341 wakaba 1.18 } elsif ($self->{state} == BEFORE_NDATA_STATE) {
4342     if ($is_space->{$self->{nc}}) {
4343    
4344     ## Stay in the state.
4345    
4346     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4347     $self->{line_prev} = $self->{line};
4348     $self->{column_prev} = $self->{column};
4349     $self->{column}++;
4350     $self->{nc}
4351     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4352     } else {
4353     $self->{set_nc}->($self);
4354     }
4355    
4356     redo A;
4357     } elsif ($self->{nc} == 0x003E) { # >
4358    
4359     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4360    
4361     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4362     $self->{line_prev} = $self->{line};
4363     $self->{column_prev} = $self->{column};
4364     $self->{column}++;
4365     $self->{nc}
4366     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4367     } else {
4368     $self->{set_nc}->($self);
4369     }
4370    
4371     return ($self->{ct}); # ENTITY
4372     redo A;
4373     } elsif ($self->{nc} == 0x004E or # N
4374     $self->{nc} == 0x006E) { # n
4375    
4376     $self->{state} = NDATA_STATE;
4377     $self->{kwd} = chr $self->{nc};
4378    
4379     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4380     $self->{line_prev} = $self->{line};
4381     $self->{column_prev} = $self->{column};
4382     $self->{column}++;
4383     $self->{nc}
4384     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4385     } else {
4386     $self->{set_nc}->($self);
4387     }
4388    
4389     redo A;
4390     } elsif ($self->{nc} == -1) {
4391    
4392     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
4393     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4394     ## reconsume
4395     return ($self->{ct}); # ENTITY
4396     redo A;
4397     } else {
4398    
4399     $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after SYSTEM literal');
4400     $self->{state} = BOGUS_MD_STATE;
4401    
4402     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4403     $self->{line_prev} = $self->{line};
4404     $self->{column_prev} = $self->{column};
4405     $self->{column}++;
4406     $self->{nc}
4407     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4408     } else {
4409     $self->{set_nc}->($self);
4410     }
4411    
4412     redo A;
4413     }
4414 wakaba 1.1 } elsif ($self->{state} == BOGUS_DOCTYPE_STATE) {
4415     if ($self->{nc} == 0x003E) { # >
4416    
4417     $self->{state} = DATA_STATE;
4418 wakaba 1.5 $self->{s_kwd} = '';
4419 wakaba 1.1
4420     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4421     $self->{line_prev} = $self->{line};
4422     $self->{column_prev} = $self->{column};
4423     $self->{column}++;
4424     $self->{nc}
4425     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4426     } else {
4427     $self->{set_nc}->($self);
4428     }
4429    
4430    
4431     return ($self->{ct}); # DOCTYPE
4432    
4433     redo A;
4434 wakaba 1.12 } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
4435 wakaba 1.13
4436     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4437     $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
4438     $self->{in_subset} = 1;
4439    
4440 wakaba 1.12 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4441     $self->{line_prev} = $self->{line};
4442     $self->{column_prev} = $self->{column};
4443     $self->{column}++;
4444     $self->{nc}
4445     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4446     } else {
4447     $self->{set_nc}->($self);
4448     }
4449    
4450 wakaba 1.13 return ($self->{ct}); # DOCTYPE
4451     redo A;
4452 wakaba 1.1 } elsif ($self->{nc} == -1) {
4453    
4454     $self->{state} = DATA_STATE;
4455 wakaba 1.5 $self->{s_kwd} = '';
4456 wakaba 1.1 ## reconsume
4457    
4458     return ($self->{ct}); # DOCTYPE
4459    
4460     redo A;
4461     } else {
4462    
4463     my $s = '';
4464 wakaba 1.12 $self->{read_until}->($s, q{>[}, 0);
4465 wakaba 1.1
4466     ## Stay in the state
4467    
4468     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4469     $self->{line_prev} = $self->{line};
4470     $self->{column_prev} = $self->{column};
4471     $self->{column}++;
4472     $self->{nc}
4473     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4474     } else {
4475     $self->{set_nc}->($self);
4476     }
4477    
4478     redo A;
4479     }
4480     } elsif ($self->{state} == CDATA_SECTION_STATE) {
4481     ## NOTE: "CDATA section state" in the spec is jointly implemented
4482     ## by three states, |CDATA_SECTION_STATE|, |CDATA_SECTION_MSE1_STATE|,
4483     ## and |CDATA_SECTION_MSE2_STATE|.
4484 wakaba 1.10
4485     ## XML5: "CDATA state".
4486 wakaba 1.1
4487     if ($self->{nc} == 0x005D) { # ]
4488    
4489     $self->{state} = CDATA_SECTION_MSE1_STATE;
4490    
4491     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4492     $self->{line_prev} = $self->{line};
4493     $self->{column_prev} = $self->{column};
4494     $self->{column}++;
4495     $self->{nc}
4496     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4497     } else {
4498     $self->{set_nc}->($self);
4499     }
4500    
4501     redo A;
4502     } elsif ($self->{nc} == -1) {
4503 wakaba 1.6 if ($self->{is_xml}) {
4504 wakaba 1.8
4505 wakaba 1.6 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no mse'); ## TODO: type
4506 wakaba 1.8 } else {
4507    
4508 wakaba 1.6 }
4509    
4510 wakaba 1.1 $self->{state} = DATA_STATE;
4511 wakaba 1.5 $self->{s_kwd} = '';
4512 wakaba 1.10 ## Reconsume.
4513 wakaba 1.1 if (length $self->{ct}->{data}) { # character
4514    
4515     return ($self->{ct}); # character
4516     } else {
4517    
4518     ## No token to emit. $self->{ct} is discarded.
4519     }
4520     redo A;
4521     } else {
4522    
4523     $self->{ct}->{data} .= chr $self->{nc};
4524     $self->{read_until}->($self->{ct}->{data},
4525     q<]>,
4526     length $self->{ct}->{data});
4527    
4528     ## Stay in the state.
4529    
4530     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4531     $self->{line_prev} = $self->{line};
4532     $self->{column_prev} = $self->{column};
4533     $self->{column}++;
4534     $self->{nc}
4535     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4536     } else {
4537     $self->{set_nc}->($self);
4538     }
4539    
4540     redo A;
4541     }
4542    
4543     ## ISSUE: "text tokens" in spec.
4544     } elsif ($self->{state} == CDATA_SECTION_MSE1_STATE) {
4545 wakaba 1.10 ## XML5: "CDATA bracket state".
4546    
4547 wakaba 1.1 if ($self->{nc} == 0x005D) { # ]
4548    
4549     $self->{state} = CDATA_SECTION_MSE2_STATE;
4550    
4551     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4552     $self->{line_prev} = $self->{line};
4553     $self->{column_prev} = $self->{column};
4554     $self->{column}++;
4555     $self->{nc}
4556     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4557     } else {
4558     $self->{set_nc}->($self);
4559     }
4560    
4561     redo A;
4562     } else {
4563    
4564 wakaba 1.10 ## XML5: If EOF, "]" is not appended and the state is changed to the data state.
4565 wakaba 1.1 $self->{ct}->{data} .= ']';
4566 wakaba 1.10 $self->{state} = CDATA_SECTION_STATE; ## XML5: Stay in the state.
4567 wakaba 1.1 ## Reconsume.
4568     redo A;
4569     }
4570     } elsif ($self->{state} == CDATA_SECTION_MSE2_STATE) {
4571 wakaba 1.10 ## XML5: "CDATA end state".
4572    
4573 wakaba 1.1 if ($self->{nc} == 0x003E) { # >
4574     $self->{state} = DATA_STATE;
4575 wakaba 1.5 $self->{s_kwd} = '';
4576 wakaba 1.1
4577     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4578     $self->{line_prev} = $self->{line};
4579     $self->{column_prev} = $self->{column};
4580     $self->{column}++;
4581     $self->{nc}
4582     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4583     } else {
4584     $self->{set_nc}->($self);
4585     }
4586    
4587     if (length $self->{ct}->{data}) { # character
4588    
4589     return ($self->{ct}); # character
4590     } else {
4591    
4592     ## No token to emit. $self->{ct} is discarded.
4593     }
4594     redo A;
4595     } elsif ($self->{nc} == 0x005D) { # ]
4596     # character
4597     $self->{ct}->{data} .= ']'; ## Add first "]" of "]]]".
4598     ## Stay in the state.
4599    
4600     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4601     $self->{line_prev} = $self->{line};
4602     $self->{column_prev} = $self->{column};
4603     $self->{column}++;
4604     $self->{nc}
4605     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4606     } else {
4607     $self->{set_nc}->($self);
4608     }
4609    
4610     redo A;
4611     } else {
4612    
4613     $self->{ct}->{data} .= ']]'; # character
4614     $self->{state} = CDATA_SECTION_STATE;
4615 wakaba 1.10 ## Reconsume. ## XML5: Emit.
4616 wakaba 1.1 redo A;
4617     }
4618     } elsif ($self->{state} == ENTITY_STATE) {
4619     if ($is_space->{$self->{nc}} or
4620     {
4621     0x003C => 1, 0x0026 => 1, -1 => 1, # <, &
4622     $self->{entity_add} => 1,
4623     }->{$self->{nc}}) {
4624    
4625     ## Don't consume
4626     ## No error
4627     ## Return nothing.
4628     #
4629     } elsif ($self->{nc} == 0x0023) { # #
4630    
4631     $self->{state} = ENTITY_HASH_STATE;
4632 wakaba 1.12 $self->{kwd} = '#';
4633 wakaba 1.1
4634     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4635     $self->{line_prev} = $self->{line};
4636     $self->{column_prev} = $self->{column};
4637     $self->{column}++;
4638     $self->{nc}
4639     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4640     } else {
4641     $self->{set_nc}->($self);
4642     }
4643    
4644     redo A;
4645     } elsif ((0x0041 <= $self->{nc} and
4646     $self->{nc} <= 0x005A) or # A..Z
4647     (0x0061 <= $self->{nc} and
4648     $self->{nc} <= 0x007A)) { # a..z
4649    
4650     require Whatpm::_NamedEntityList;
4651     $self->{state} = ENTITY_NAME_STATE;
4652 wakaba 1.12 $self->{kwd} = chr $self->{nc};
4653     $self->{entity__value} = $self->{kwd};
4654 wakaba 1.1 $self->{entity__match} = 0;
4655    
4656     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4657     $self->{line_prev} = $self->{line};
4658     $self->{column_prev} = $self->{column};
4659     $self->{column}++;
4660     $self->{nc}
4661     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4662     } else {
4663     $self->{set_nc}->($self);
4664     }
4665    
4666     redo A;
4667     } else {
4668    
4669     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare ero');
4670     ## Return nothing.
4671     #
4672     }
4673    
4674     ## NOTE: No character is consumed by the "consume a character
4675     ## reference" algorithm. In other words, there is an "&" character
4676     ## that does not introduce a character reference, which would be
4677     ## appended to the parent element or the attribute value in later
4678     ## processing by the tokenizer.
4679    
4680     if ($self->{prev_state} == DATA_STATE) {
4681    
4682     $self->{state} = $self->{prev_state};
4683 wakaba 1.5 $self->{s_kwd} = '';
4684 wakaba 1.1 ## Reconsume.
4685     return ({type => CHARACTER_TOKEN, data => '&',
4686     line => $self->{line_prev},
4687     column => $self->{column_prev},
4688     });
4689     redo A;
4690     } else {
4691    
4692     $self->{ca}->{value} .= '&';
4693     $self->{state} = $self->{prev_state};
4694 wakaba 1.5 $self->{s_kwd} = '';
4695 wakaba 1.1 ## Reconsume.
4696     redo A;
4697     }
4698     } elsif ($self->{state} == ENTITY_HASH_STATE) {
4699     if ($self->{nc} == 0x0078 or # x
4700     $self->{nc} == 0x0058) { # X
4701    
4702     $self->{state} = HEXREF_X_STATE;
4703 wakaba 1.12 $self->{kwd} .= chr $self->{nc};
4704 wakaba 1.1
4705     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4706     $self->{line_prev} = $self->{line};
4707     $self->{column_prev} = $self->{column};
4708     $self->{column}++;
4709     $self->{nc}
4710     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4711     } else {
4712     $self->{set_nc}->($self);
4713     }
4714    
4715     redo A;
4716     } elsif (0x0030 <= $self->{nc} and
4717     $self->{nc} <= 0x0039) { # 0..9
4718    
4719     $self->{state} = NCR_NUM_STATE;
4720 wakaba 1.12 $self->{kwd} = $self->{nc} - 0x0030;
4721 wakaba 1.1
4722     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4723     $self->{line_prev} = $self->{line};
4724     $self->{column_prev} = $self->{column};
4725     $self->{column}++;
4726     $self->{nc}
4727     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4728     } else {
4729     $self->{set_nc}->($self);
4730     }
4731    
4732     redo A;
4733     } else {
4734     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare nero',
4735     line => $self->{line_prev},
4736     column => $self->{column_prev} - 1);
4737    
4738     ## NOTE: According to the spec algorithm, nothing is returned,
4739     ## and then "&#" is appended to the parent element or the attribute
4740     ## value in the later processing.
4741    
4742     if ($self->{prev_state} == DATA_STATE) {
4743    
4744     $self->{state} = $self->{prev_state};
4745 wakaba 1.5 $self->{s_kwd} = '';
4746 wakaba 1.1 ## Reconsume.
4747     return ({type => CHARACTER_TOKEN,
4748     data => '&#',
4749     line => $self->{line_prev},
4750     column => $self->{column_prev} - 1,
4751     });
4752     redo A;
4753     } else {
4754    
4755     $self->{ca}->{value} .= '&#';
4756     $self->{state} = $self->{prev_state};
4757 wakaba 1.5 $self->{s_kwd} = '';
4758 wakaba 1.1 ## Reconsume.
4759     redo A;
4760     }
4761     }
4762     } elsif ($self->{state} == NCR_NUM_STATE) {
4763     if (0x0030 <= $self->{nc} and
4764     $self->{nc} <= 0x0039) { # 0..9
4765    
4766 wakaba 1.12 $self->{kwd} *= 10;
4767     $self->{kwd} += $self->{nc} - 0x0030;
4768 wakaba 1.1
4769     ## Stay in the state.
4770    
4771     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4772     $self->{line_prev} = $self->{line};
4773     $self->{column_prev} = $self->{column};
4774     $self->{column}++;
4775     $self->{nc}
4776     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4777     } else {
4778     $self->{set_nc}->($self);
4779     }
4780    
4781     redo A;
4782     } elsif ($self->{nc} == 0x003B) { # ;
4783    
4784    
4785     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4786     $self->{line_prev} = $self->{line};
4787     $self->{column_prev} = $self->{column};
4788     $self->{column}++;
4789     $self->{nc}
4790     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4791     } else {
4792     $self->{set_nc}->($self);
4793     }
4794    
4795     #
4796     } else {
4797    
4798     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no refc');
4799     ## Reconsume.
4800     #
4801     }
4802    
4803 wakaba 1.12 my $code = $self->{kwd};
4804 wakaba 1.1 my $l = $self->{line_prev};
4805     my $c = $self->{column_prev};
4806     if ($charref_map->{$code}) {
4807    
4808     $self->{parse_error}->(level => $self->{level}->{must}, type => 'invalid character reference',
4809     text => (sprintf 'U+%04X', $code),
4810     line => $l, column => $c);
4811     $code = $charref_map->{$code};
4812     } elsif ($code > 0x10FFFF) {
4813    
4814     $self->{parse_error}->(level => $self->{level}->{must}, type => 'invalid character reference',
4815     text => (sprintf 'U-%08X', $code),
4816     line => $l, column => $c);
4817     $code = 0xFFFD;
4818     }
4819    
4820     if ($self->{prev_state} == DATA_STATE) {
4821    
4822     $self->{state} = $self->{prev_state};
4823 wakaba 1.5 $self->{s_kwd} = '';
4824 wakaba 1.1 ## Reconsume.
4825     return ({type => CHARACTER_TOKEN, data => chr $code,
4826 wakaba 1.7 has_reference => 1,
4827 wakaba 1.1 line => $l, column => $c,
4828     });
4829     redo A;
4830     } else {
4831    
4832     $self->{ca}->{value} .= chr $code;
4833     $self->{ca}->{has_reference} = 1;
4834     $self->{state} = $self->{prev_state};
4835 wakaba 1.5 $self->{s_kwd} = '';
4836 wakaba 1.1 ## Reconsume.
4837     redo A;
4838     }
4839     } elsif ($self->{state} == HEXREF_X_STATE) {
4840     if ((0x0030 <= $self->{nc} and $self->{nc} <= 0x0039) or
4841     (0x0041 <= $self->{nc} and $self->{nc} <= 0x0046) or
4842     (0x0061 <= $self->{nc} and $self->{nc} <= 0x0066)) {
4843     # 0..9, A..F, a..f
4844    
4845     $self->{state} = HEXREF_HEX_STATE;
4846 wakaba 1.12 $self->{kwd} = 0;
4847 wakaba 1.1 ## Reconsume.
4848     redo A;
4849     } else {
4850     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare hcro',
4851     line => $self->{line_prev},
4852     column => $self->{column_prev} - 2);
4853    
4854     ## NOTE: According to the spec algorithm, nothing is returned,
4855     ## and then "&#" followed by "X" or "x" is appended to the parent
4856     ## element or the attribute value in the later processing.
4857    
4858     if ($self->{prev_state} == DATA_STATE) {
4859    
4860     $self->{state} = $self->{prev_state};
4861 wakaba 1.5 $self->{s_kwd} = '';
4862 wakaba 1.1 ## Reconsume.
4863     return ({type => CHARACTER_TOKEN,
4864 wakaba 1.12 data => '&' . $self->{kwd},
4865 wakaba 1.1 line => $self->{line_prev},
4866 wakaba 1.12 column => $self->{column_prev} - length $self->{kwd},
4867 wakaba 1.1 });
4868     redo A;
4869     } else {
4870    
4871 wakaba 1.12 $self->{ca}->{value} .= '&' . $self->{kwd};
4872 wakaba 1.1 $self->{state} = $self->{prev_state};
4873 wakaba 1.5 $self->{s_kwd} = '';
4874 wakaba 1.1 ## Reconsume.
4875     redo A;
4876     }
4877     }
4878     } elsif ($self->{state} == HEXREF_HEX_STATE) {
4879     if (0x0030 <= $self->{nc} and $self->{nc} <= 0x0039) {
4880     # 0..9
4881    
4882 wakaba 1.12 $self->{kwd} *= 0x10;
4883     $self->{kwd} += $self->{nc} - 0x0030;
4884 wakaba 1.1 ## Stay in the state.
4885    
4886     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4887     $self->{line_prev} = $self->{line};
4888     $self->{column_prev} = $self->{column};
4889     $self->{column}++;
4890     $self->{nc}
4891     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4892     } else {
4893     $self->{set_nc}->($self);
4894     }
4895    
4896     redo A;
4897     } elsif (0x0061 <= $self->{nc} and
4898     $self->{nc} <= 0x0066) { # a..f
4899    
4900 wakaba 1.12 $self->{kwd} *= 0x10;
4901     $self->{kwd} += $self->{nc} - 0x0060 + 9;
4902 wakaba 1.1 ## Stay in the state.
4903    
4904     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4905     $self->{line_prev} = $self->{line};
4906     $self->{column_prev} = $self->{column};
4907     $self->{column}++;
4908     $self->{nc}
4909     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4910     } else {
4911     $self->{set_nc}->($self);
4912     }
4913    
4914     redo A;
4915     } elsif (0x0041 <= $self->{nc} and
4916     $self->{nc} <= 0x0046) { # A..F
4917    
4918 wakaba 1.12 $self->{kwd} *= 0x10;
4919     $self->{kwd} += $self->{nc} - 0x0040 + 9;
4920 wakaba 1.1 ## Stay in the state.
4921    
4922     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4923     $self->{line_prev} = $self->{line};
4924     $self->{column_prev} = $self->{column};
4925     $self->{column}++;
4926     $self->{nc}
4927     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4928     } else {
4929     $self->{set_nc}->($self);
4930     }
4931    
4932     redo A;
4933     } elsif ($self->{nc} == 0x003B) { # ;
4934    
4935    
4936     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4937     $self->{line_prev} = $self->{line};
4938     $self->{column_prev} = $self->{column};
4939     $self->{column}++;
4940     $self->{nc}
4941     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4942     } else {
4943     $self->{set_nc}->($self);
4944     }
4945    
4946     #
4947     } else {
4948    
4949     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no refc',
4950     line => $self->{line},
4951     column => $self->{column});
4952     ## Reconsume.
4953     #
4954     }
4955    
4956 wakaba 1.12 my $code = $self->{kwd};
4957 wakaba 1.1 my $l = $self->{line_prev};
4958     my $c = $self->{column_prev};
4959     if ($charref_map->{$code}) {
4960    
4961     $self->{parse_error}->(level => $self->{level}->{must}, type => 'invalid character reference',
4962     text => (sprintf 'U+%04X', $code),
4963     line => $l, column => $c);
4964     $code = $charref_map->{$code};
4965     } elsif ($code > 0x10FFFF) {
4966    
4967     $self->{parse_error}->(level => $self->{level}->{must}, type => 'invalid character reference',
4968     text => (sprintf 'U-%08X', $code),
4969     line => $l, column => $c);
4970     $code = 0xFFFD;
4971     }
4972    
4973     if ($self->{prev_state} == DATA_STATE) {
4974    
4975     $self->{state} = $self->{prev_state};
4976 wakaba 1.5 $self->{s_kwd} = '';
4977 wakaba 1.1 ## Reconsume.
4978     return ({type => CHARACTER_TOKEN, data => chr $code,
4979 wakaba 1.7 has_reference => 1,
4980 wakaba 1.1 line => $l, column => $c,
4981     });
4982     redo A;
4983     } else {
4984    
4985     $self->{ca}->{value} .= chr $code;
4986     $self->{ca}->{has_reference} = 1;
4987     $self->{state} = $self->{prev_state};
4988 wakaba 1.5 $self->{s_kwd} = '';
4989 wakaba 1.1 ## Reconsume.
4990     redo A;
4991     }
4992     } elsif ($self->{state} == ENTITY_NAME_STATE) {
4993 wakaba 1.12 if (length $self->{kwd} < 30 and
4994 wakaba 1.1 ## NOTE: Some number greater than the maximum length of entity name
4995     ((0x0041 <= $self->{nc} and # A
4996     $self->{nc} <= 0x005A) or # Z
4997     (0x0061 <= $self->{nc} and # a
4998     $self->{nc} <= 0x007A) or # z
4999     (0x0030 <= $self->{nc} and # 0
5000     $self->{nc} <= 0x0039) or # 9
5001     $self->{nc} == 0x003B)) { # ;
5002     our $EntityChar;
5003 wakaba 1.12 $self->{kwd} .= chr $self->{nc};
5004     if (defined $EntityChar->{$self->{kwd}}) {
5005 wakaba 1.1 if ($self->{nc} == 0x003B) { # ;
5006    
5007 wakaba 1.12 $self->{entity__value} = $EntityChar->{$self->{kwd}};
5008 wakaba 1.1 $self->{entity__match} = 1;
5009    
5010     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5011     $self->{line_prev} = $self->{line};
5012     $self->{column_prev} = $self->{column};
5013     $self->{column}++;
5014     $self->{nc}
5015     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5016     } else {
5017     $self->{set_nc}->($self);
5018     }
5019    
5020     #
5021     } else {
5022    
5023 wakaba 1.12 $self->{entity__value} = $EntityChar->{$self->{kwd}};
5024 wakaba 1.1 $self->{entity__match} = -1;
5025     ## Stay in the state.
5026    
5027     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5028     $self->{line_prev} = $self->{line};
5029     $self->{column_prev} = $self->{column};
5030     $self->{column}++;
5031     $self->{nc}
5032     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5033     } else {
5034     $self->{set_nc}->($self);
5035     }
5036    
5037     redo A;
5038     }
5039     } else {
5040    
5041     $self->{entity__value} .= chr $self->{nc};
5042     $self->{entity__match} *= 2;
5043     ## Stay in the state.
5044    
5045     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5046     $self->{line_prev} = $self->{line};
5047     $self->{column_prev} = $self->{column};
5048     $self->{column}++;
5049     $self->{nc}
5050     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5051     } else {
5052     $self->{set_nc}->($self);
5053     }
5054    
5055     redo A;
5056     }
5057     }
5058    
5059     my $data;
5060     my $has_ref;
5061     if ($self->{entity__match} > 0) {
5062    
5063     $data = $self->{entity__value};
5064     $has_ref = 1;
5065     #
5066     } elsif ($self->{entity__match} < 0) {
5067     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no refc');
5068     if ($self->{prev_state} != DATA_STATE and # in attribute
5069     $self->{entity__match} < -1) {
5070    
5071 wakaba 1.12 $data = '&' . $self->{kwd};
5072 wakaba 1.1 #
5073     } else {
5074    
5075     $data = $self->{entity__value};
5076     $has_ref = 1;
5077     #
5078     }
5079     } else {
5080    
5081     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare ero',
5082     line => $self->{line_prev},
5083 wakaba 1.12 column => $self->{column_prev} - length $self->{kwd});
5084     $data = '&' . $self->{kwd};
5085 wakaba 1.1 #
5086     }
5087    
5088     ## NOTE: According to the spec algorithm, in these cases a character
5089     ## reference that is found is consumed and a character token is
5090     ## returned; otherwise, nothing is consumed or returned.
5091     ## In this implementation, at this stage, anything that has been
5092     ## examined by the tokenizer is appended to the parent element or the
5093     ## attribute value as a string: the literal string when there is no
5094     ## character reference, or the entity-replaced string otherwise.  This
5095     ## is possible because any characters that would not be consumed are
5096     ## appended in the data state or an attribute value state anyway.
5097    
5098     if ($self->{prev_state} == DATA_STATE) {
5099    
5100     $self->{state} = $self->{prev_state};
5101 wakaba 1.5 $self->{s_kwd} = '';
5102 wakaba 1.1 ## Reconsume.
5103     return ({type => CHARACTER_TOKEN,
5104     data => $data,
5105 wakaba 1.7 has_reference => $has_ref,
5106 wakaba 1.1 line => $self->{line_prev},
5107 wakaba 1.12 column => $self->{column_prev} + 1 - length $self->{kwd},
5108 wakaba 1.1 });
5109     redo A;
5110     } else {
5111    
5112     $self->{ca}->{value} .= $data;
5113     $self->{ca}->{has_reference} = 1 if $has_ref;
5114     $self->{state} = $self->{prev_state};
5115 wakaba 1.5 $self->{s_kwd} = '';
5116 wakaba 1.1 ## Reconsume.
5117     redo A;
5118     }
5119 wakaba 1.8
5120     ## XML-only states
5121    
5122     } elsif ($self->{state} == PI_STATE) {
5123 wakaba 1.14 ## XML5: "Pi state" and "DOCTYPE pi state".
5124    
5125 wakaba 1.8 if ($is_space->{$self->{nc}} or
5126 wakaba 1.14 $self->{nc} == 0x003F or # ?
5127 wakaba 1.8 $self->{nc} == -1) {
5128 wakaba 1.14 ## XML5: U+003F: "pi state": Same as "Anything else"; "DOCTYPE
5129     ## pi state": Switch to the "DOCTYPE pi after state". EOF:
5130     ## "DOCTYPE pi state": Parse error, switch to the "data
5131     ## state".
5132 wakaba 1.8 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare pio', ## TODO: type
5133     line => $self->{line_prev},
5134     column => $self->{column_prev}
5135     - 1 * ($self->{nc} != -1));
5136     $self->{state} = BOGUS_COMMENT_STATE;
5137     ## Reconsume.
5138     $self->{ct} = {type => COMMENT_TOKEN,
5139     data => '?',
5140     line => $self->{line_prev},
5141     column => $self->{column_prev}
5142     - 1 * ($self->{nc} != -1),
5143     };
5144     redo A;
5145     } else {
5146 wakaba 1.14 ## XML5: "DOCTYPE pi state": Stay in the state.
5147 wakaba 1.8 $self->{ct} = {type => PI_TOKEN,
5148     target => chr $self->{nc},
5149     data => '',
5150     line => $self->{line_prev},
5151     column => $self->{column_prev} - 1,
5152     };
5153     $self->{state} = PI_TARGET_STATE;
5154    
5155     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5156     $self->{line_prev} = $self->{line};
5157     $self->{column_prev} = $self->{column};
5158     $self->{column}++;
5159     $self->{nc}
5160     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5161     } else {
5162     $self->{set_nc}->($self);
5163     }
5164    
5165     redo A;
5166     }
5167     } elsif ($self->{state} == PI_TARGET_STATE) {
5168     if ($is_space->{$self->{nc}}) {
5169     $self->{state} = PI_TARGET_AFTER_STATE;
5170    
5171     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5172     $self->{line_prev} = $self->{line};
5173     $self->{column_prev} = $self->{column};
5174     $self->{column}++;
5175     $self->{nc}
5176     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5177     } else {
5178     $self->{set_nc}->($self);
5179     }
5180    
5181     redo A;
5182     } elsif ($self->{nc} == -1) {
5183     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no pic'); ## TODO: type
5184 wakaba 1.13 if ($self->{in_subset}) {
5185     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5186     } else {
5187     $self->{state} = DATA_STATE;
5188     $self->{s_kwd} = '';
5189     }
5190 wakaba 1.8 ## Reconsume.
5191     return ($self->{ct}); # pi
5192     redo A;
5193     } elsif ($self->{nc} == 0x003F) { # ?
5194     $self->{state} = PI_AFTER_STATE;
5195    
5196     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5197     $self->{line_prev} = $self->{line};
5198     $self->{column_prev} = $self->{column};
5199     $self->{column}++;
5200     $self->{nc}
5201     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5202     } else {
5203     $self->{set_nc}->($self);
5204     }
5205    
5206     redo A;
5207     } else {
5208     ## XML5: typo ("tag name" -> "target")
5209     $self->{ct}->{target} .= chr $self->{nc}; # pi
5210    
5211     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5212     $self->{line_prev} = $self->{line};
5213     $self->{column_prev} = $self->{column};
5214     $self->{column}++;
5215     $self->{nc}
5216     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5217     } else {
5218     $self->{set_nc}->($self);
5219     }
5220    
5221     redo A;
5222     }
5223     } elsif ($self->{state} == PI_TARGET_AFTER_STATE) {
5224     if ($is_space->{$self->{nc}}) {
5225     ## Stay in the state.
5226    
5227     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5228     $self->{line_prev} = $self->{line};
5229     $self->{column_prev} = $self->{column};
5230     $self->{column}++;
5231     $self->{nc}
5232     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5233     } else {
5234     $self->{set_nc}->($self);
5235     }
5236    
5237     redo A;
5238     } else {
5239     $self->{state} = PI_DATA_STATE;
5240     ## Reprocess.
5241     redo A;
5242     }
5243     } elsif ($self->{state} == PI_DATA_STATE) {
5244     if ($self->{nc} == 0x003F) { # ?
5245     $self->{state} = PI_DATA_AFTER_STATE;
5246    
5247     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5248     $self->{line_prev} = $self->{line};
5249     $self->{column_prev} = $self->{column};
5250     $self->{column}++;
5251     $self->{nc}
5252     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5253     } else {
5254     $self->{set_nc}->($self);
5255     }
5256    
5257     redo A;
5258     } elsif ($self->{nc} == -1) {
5259     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no pic'); ## TODO: type
5260 wakaba 1.13 if ($self->{in_subset}) {
5261 wakaba 1.14 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state"
5262 wakaba 1.13 } else {
5263     $self->{state} = DATA_STATE;
5264     $self->{s_kwd} = '';
5265     }
5266 wakaba 1.8 ## Reprocess.
5267     return ($self->{ct}); # pi
5268     redo A;
5269     } else {
5270     $self->{ct}->{data} .= chr $self->{nc}; # pi
5271     $self->{read_until}->($self->{ct}->{data}, q[?],
5272     length $self->{ct}->{data});
5273     ## Stay in the state.
5274    
5275     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5276     $self->{line_prev} = $self->{line};
5277     $self->{column_prev} = $self->{column};
5278     $self->{column}++;
5279     $self->{nc}
5280     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5281     } else {
5282     $self->{set_nc}->($self);
5283     }
5284    
5285     ## Reprocess.
5286     redo A;
5287     }
5288     } elsif ($self->{state} == PI_AFTER_STATE) {
5289 wakaba 1.14 ## XML5: Part of "Pi after state".
5290    
5291 wakaba 1.8 if ($self->{nc} == 0x003E) { # >
5292 wakaba 1.13 if ($self->{in_subset}) {
5293     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5294     } else {
5295     $self->{state} = DATA_STATE;
5296     $self->{s_kwd} = '';
5297     }
5298 wakaba 1.8
5299     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5300     $self->{line_prev} = $self->{line};
5301     $self->{column_prev} = $self->{column};
5302     $self->{column}++;
5303     $self->{nc}
5304     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5305     } else {
5306     $self->{set_nc}->($self);
5307     }
5308    
5309     return ($self->{ct}); # pi
5310     redo A;
5311     } elsif ($self->{nc} == 0x003F) { # ?
5312     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no s after target', ## TODO: type
5313     line => $self->{line_prev},
5314     column => $self->{column_prev}); ## XML5: no error
5315     $self->{ct}->{data} .= '?';
5316     $self->{state} = PI_DATA_AFTER_STATE;
5317    
5318     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5319     $self->{line_prev} = $self->{line};
5320     $self->{column_prev} = $self->{column};
5321     $self->{column}++;
5322     $self->{nc}
5323     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5324     } else {
5325     $self->{set_nc}->($self);
5326     }
5327    
5328     redo A;
5329     } else {
5330     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no s after target', ## TODO: type
5331     line => $self->{line_prev},
5332     column => $self->{column_prev}
5333     + 1 * ($self->{nc} == -1)); ## XML5: no error
5334     $self->{ct}->{data} .= '?'; ## XML5: not appended
5335     $self->{state} = PI_DATA_STATE;
5336     ## Reprocess.
5337     redo A;
5338     }
5339     } elsif ($self->{state} == PI_DATA_AFTER_STATE) {
5340 wakaba 1.14 ## XML5: Same as "pi after state" and "DOCTYPE pi after state".
5341    
5342 wakaba 1.8 if ($self->{nc} == 0x003E) { # >
5343 wakaba 1.13 if ($self->{in_subset}) {
5344     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5345     } else {
5346     $self->{state} = DATA_STATE;
5347     $self->{s_kwd} = '';
5348     }
5349 wakaba 1.8
5350     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5351     $self->{line_prev} = $self->{line};
5352     $self->{column_prev} = $self->{column};
5353     $self->{column}++;
5354     $self->{nc}
5355     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5356     } else {
5357     $self->{set_nc}->($self);
5358     }
5359    
5360     return ($self->{ct}); # pi
5361     redo A;
5362     } elsif ($self->{nc} == 0x003F) { # ?
5363     $self->{ct}->{data} .= '?';
5364     ## Stay in the state.
5365    
5366     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5367     $self->{line_prev} = $self->{line};
5368     $self->{column_prev} = $self->{column};
5369     $self->{column}++;
5370     $self->{nc}
5371     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5372     } else {
5373     $self->{set_nc}->($self);
5374     }
5375    
5376     redo A;
5377     } else {
5378     $self->{ct}->{data} .= '?'; ## XML5: not appended
5379     $self->{state} = PI_DATA_STATE;
5380     ## Reprocess.
5381     redo A;
5382     }
5383 wakaba 1.12
5384     } elsif ($self->{state} == DOCTYPE_INTERNAL_SUBSET_STATE) {
5385     if ($self->{nc} == 0x003C) { # <
5386 wakaba 1.13 $self->{state} = DOCTYPE_TAG_STATE;
5387 wakaba 1.12
5388     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5389     $self->{line_prev} = $self->{line};
5390     $self->{column_prev} = $self->{column};
5391     $self->{column}++;
5392     $self->{nc}
5393     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5394     } else {
5395     $self->{set_nc}->($self);
5396     }
5397    
5398     redo A;
5399     } elsif ($self->{nc} == 0x0025) { # %
5400     ## XML5: Not defined yet.
5401    
5402     ## TODO:
5403    
5404     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5405     $self->{line_prev} = $self->{line};
5406     $self->{column_prev} = $self->{column};
5407     $self->{column}++;
5408     $self->{nc}
5409     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5410     } else {
5411     $self->{set_nc}->($self);
5412     }
5413    
5414     redo A;
5415     } elsif ($self->{nc} == 0x005D) { # ]
5416 wakaba 1.13 delete $self->{in_subset};
5417 wakaba 1.12 $self->{state} = DOCTYPE_INTERNAL_SUBSET_AFTER_STATE;
5418    
5419     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5420     $self->{line_prev} = $self->{line};
5421     $self->{column_prev} = $self->{column};
5422     $self->{column}++;
5423     $self->{nc}
5424     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5425     } else {
5426     $self->{set_nc}->($self);
5427     }
5428    
5429     redo A;
5430     } elsif ($is_space->{$self->{nc}}) {
5431     ## Stay in the state.
5432    
5433     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5434     $self->{line_prev} = $self->{line};
5435     $self->{column_prev} = $self->{column};
5436     $self->{column}++;
5437     $self->{nc}
5438     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5439     } else {
5440     $self->{set_nc}->($self);
5441     }
5442    
5443     redo A;
5444     } elsif ($self->{nc} == -1) {
5445     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed internal subset'); ## TODO: type
5446 wakaba 1.13 delete $self->{in_subset};
5447 wakaba 1.12 $self->{state} = DATA_STATE;
5448     $self->{s_kwd} = '';
5449     ## Reconsume.
5450 wakaba 1.13 return ({type => END_OF_DOCTYPE_TOKEN});
5451 wakaba 1.12 redo A;
5452     } else {
5453     unless ($self->{internal_subset_tainted}) {
5454     ## XML5: No parse error.
5455     $self->{parse_error}->(level => $self->{level}->{must}, type => 'string in internal subset');
5456     $self->{internal_subset_tainted} = 1;
5457     }
5458     ## Stay in the state.
5459    
5460     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5461     $self->{line_prev} = $self->{line};
5462     $self->{column_prev} = $self->{column};
5463     $self->{column}++;
5464     $self->{nc}
5465     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5466     } else {
5467     $self->{set_nc}->($self);
5468     }
5469    
5470     redo A;
5471     }
5472     } elsif ($self->{state} == DOCTYPE_INTERNAL_SUBSET_AFTER_STATE) {
5473     if ($self->{nc} == 0x003E) { # >
5474     $self->{state} = DATA_STATE;
5475     $self->{s_kwd} = '';
5476    
5477     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5478     $self->{line_prev} = $self->{line};
5479     $self->{column_prev} = $self->{column};
5480     $self->{column}++;
5481     $self->{nc}
5482     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5483     } else {
5484     $self->{set_nc}->($self);
5485     }
5486    
5487 wakaba 1.13 return ({type => END_OF_DOCTYPE_TOKEN});
5488 wakaba 1.12 redo A;
5489     } elsif ($self->{nc} == -1) {
5490     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
5491     $self->{state} = DATA_STATE;
5492     $self->{s_kwd} = '';
5493     ## Reconsume.
5494 wakaba 1.13 return ({type => END_OF_DOCTYPE_TOKEN});
5495 wakaba 1.12 redo A;
5496     } else {
5497     ## XML5: No parse error and stay in the state.
5498     $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after internal subset'); ## TODO: type
5499    
5500 wakaba 1.13 $self->{state} = BOGUS_DOCTYPE_INTERNAL_SUBSET_AFTER_STATE;
5501    
5502     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5503     $self->{line_prev} = $self->{line};
5504     $self->{column_prev} = $self->{column};
5505     $self->{column}++;
5506     $self->{nc}
5507     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5508     } else {
5509     $self->{set_nc}->($self);
5510     }
5511    
5512     redo A;
5513     }
5514     } elsif ($self->{state} == BOGUS_DOCTYPE_INTERNAL_SUBSET_AFTER_STATE) {
5515     if ($self->{nc} == 0x003E) { # >
5516     $self->{state} = DATA_STATE;
5517     $self->{s_kwd} = '';
5518    
5519     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5520     $self->{line_prev} = $self->{line};
5521     $self->{column_prev} = $self->{column};
5522     $self->{column}++;
5523     $self->{nc}
5524     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5525     } else {
5526     $self->{set_nc}->($self);
5527     }
5528    
5529     return ({type => END_OF_DOCTYPE_TOKEN});
5530     redo A;
5531     } elsif ($self->{nc} == -1) {
5532     $self->{state} = DATA_STATE;
5533     $self->{s_kwd} = '';
5534     ## Reconsume.
5535     return ({type => END_OF_DOCTYPE_TOKEN});
5536     redo A;
5537     } else {
5538     ## Stay in the state.
5539    
5540     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5541     $self->{line_prev} = $self->{line};
5542     $self->{column_prev} = $self->{column};
5543     $self->{column}++;
5544     $self->{nc}
5545     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5546     } else {
5547     $self->{set_nc}->($self);
5548     }
5549    
5550     redo A;
5551     }
5552     } elsif ($self->{state} == DOCTYPE_TAG_STATE) {
5553     if ($self->{nc} == 0x0021) { # !
5554 wakaba 1.14 $self->{state} = DOCTYPE_MARKUP_DECLARATION_OPEN_STATE;
5555 wakaba 1.13
5556     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5557     $self->{line_prev} = $self->{line};
5558     $self->{column_prev} = $self->{column};
5559     $self->{column}++;
5560     $self->{nc}
5561     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5562     } else {
5563     $self->{set_nc}->($self);
5564     }
5565    
5566     redo A;
5567     } elsif ($self->{nc} == 0x003F) { # ?
5568     $self->{state} = PI_STATE;
5569    
5570     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5571     $self->{line_prev} = $self->{line};
5572     $self->{column_prev} = $self->{column};
5573     $self->{column}++;
5574     $self->{nc}
5575     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5576     } else {
5577     $self->{set_nc}->($self);
5578     }
5579    
5580     redo A;
5581     } elsif ($self->{nc} == -1) {
5582     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare stago');
5583     $self->{state} = DATA_STATE;
5584     $self->{s_kwd} = '';
5585     ## Reconsume.
5586     redo A;
5587     } else {
5588     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare stago', ## XML5: Not a parse error.
5589     line => $self->{line_prev},
5590     column => $self->{column_prev});
5591     $self->{state} = BOGUS_COMMENT_STATE;
5592     $self->{ct} = {type => COMMENT_TOKEN,
5593     data => '',
5594     }; ## NOTE: Will be discarded.
5595 wakaba 1.12
5596     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5597     $self->{line_prev} = $self->{line};
5598     $self->{column_prev} = $self->{column};
5599     $self->{column}++;
5600     $self->{nc}
5601     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5602     } else {
5603     $self->{set_nc}->($self);
5604     }
5605    
5606     redo A;
5607     }
5608 wakaba 1.14 } elsif ($self->{state} == DOCTYPE_MARKUP_DECLARATION_OPEN_STATE) {
5609     ## XML5: "DOCTYPE markup declaration state".
5610    
5611     if ($self->{nc} == 0x002D) { # -
5612     $self->{state} = MD_HYPHEN_STATE;
5613    
5614     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5615     $self->{line_prev} = $self->{line};
5616     $self->{column_prev} = $self->{column};
5617     $self->{column}++;
5618     $self->{nc}
5619     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5620     } else {
5621     $self->{set_nc}->($self);
5622     }
5623    
5624     redo A;
5625 wakaba 1.17 } elsif ($self->{nc} == 0x0045 or # E
5626     $self->{nc} == 0x0065) { # e
5627 wakaba 1.14 $self->{state} = MD_E_STATE;
5628     $self->{kwd} = chr $self->{nc};
5629    
5630     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5631     $self->{line_prev} = $self->{line};
5632     $self->{column_prev} = $self->{column};
5633     $self->{column}++;
5634     $self->{nc}
5635     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5636     } else {
5637     $self->{set_nc}->($self);
5638     }
5639    
5640     redo A;
5641 wakaba 1.17 } elsif ($self->{nc} == 0x0041 or # A
5642     $self->{nc} == 0x0061) { # a
5643 wakaba 1.14 $self->{state} = MD_ATTLIST_STATE;
5644     $self->{kwd} = chr $self->{nc};
5645    
5646     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5647     $self->{line_prev} = $self->{line};
5648     $self->{column_prev} = $self->{column};
5649     $self->{column}++;
5650     $self->{nc}
5651     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5652     } else {
5653     $self->{set_nc}->($self);
5654     }
5655    
5656     redo A;
5657 wakaba 1.17 } elsif ($self->{nc} == 0x004E or # N
5658     $self->{nc} == 0x006E) { # n
5659 wakaba 1.14 $self->{state} = MD_NOTATION_STATE;
5660     $self->{kwd} = chr $self->{nc};
5661    
5662     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5663     $self->{line_prev} = $self->{line};
5664     $self->{column_prev} = $self->{column};
5665     $self->{column}++;
5666     $self->{nc}
5667     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5668     } else {
5669     $self->{set_nc}->($self);
5670     }
5671    
5672     redo A;
5673     } else {
5674     #
5675     }
5676    
5677     ## XML5: No parse error.
5678     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
5679     line => $self->{line_prev},
5680     column => $self->{column_prev} - 1);
5681     ## Reconsume.
5682     $self->{state} = BOGUS_COMMENT_STATE;
5683     $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded.
5684     redo A;
5685     } elsif ($self->{state} == MD_E_STATE) {
5686 wakaba 1.17 if ($self->{nc} == 0x004E or # N
5687     $self->{nc} == 0x006E) { # n
5688 wakaba 1.14 $self->{state} = MD_ENTITY_STATE;
5689     $self->{kwd} .= chr $self->{nc};
5690    
5691     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5692     $self->{line_prev} = $self->{line};
5693     $self->{column_prev} = $self->{column};
5694     $self->{column}++;
5695     $self->{nc}
5696     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5697     } else {
5698     $self->{set_nc}->($self);
5699     }
5700    
5701     redo A;
5702 wakaba 1.17 } elsif ($self->{nc} == 0x004C or # L
5703     $self->{nc} == 0x006C) { # l
5704 wakaba 1.14 ## XML5: <!ELEMENT> not supported.
5705     $self->{state} = MD_ELEMENT_STATE;
5706     $self->{kwd} .= chr $self->{nc};
5707    
5708     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5709     $self->{line_prev} = $self->{line};
5710     $self->{column_prev} = $self->{column};
5711     $self->{column}++;
5712     $self->{nc}
5713     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5714     } else {
5715     $self->{set_nc}->($self);
5716     }
5717    
5718     redo A;
5719     } else {
5720     ## XML5: No parse error.
5721     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
5722     line => $self->{line_prev},
5723     column => $self->{column_prev} - 2
5724     + 1 * ($self->{nc} == -1));
5725     ## Reconsume.
5726     $self->{state} = BOGUS_COMMENT_STATE;
5727     $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
5728     redo A;
5729     }
5730     } elsif ($self->{state} == MD_ENTITY_STATE) {
5731 wakaba 1.17 if ($self->{nc} == [
5732     undef,
5733     undef,
5734     0x0054, # T
5735     0x0049, # I
5736     0x0054, # T
5737     ]->[length $self->{kwd}] or
5738     $self->{nc} == [
5739     undef,
5740     undef,
5741     0x0074, # t
5742     0x0069, # i
5743     0x0074, # t
5744     ]->[length $self->{kwd}]) {
5745 wakaba 1.14 ## Stay in the state.
5746     $self->{kwd} .= chr $self->{nc};
5747    
5748     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5749     $self->{line_prev} = $self->{line};
5750     $self->{column_prev} = $self->{column};
5751     $self->{column}++;
5752     $self->{nc}
5753     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5754     } else {
5755     $self->{set_nc}->($self);
5756     }
5757    
5758     redo A;
5759 wakaba 1.17 } elsif ((length $self->{kwd}) == 5 and
5760     ($self->{nc} == 0x0059 or # Y
5761     $self->{nc} == 0x0079)) { # y
5762     if ($self->{kwd} ne 'ENTIT' or $self->{nc} == 0x0079) {
5763     $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
5764     text => 'ENTITY',
5765     line => $self->{line_prev},
5766     column => $self->{column_prev} - 4);
5767     }
5768     $self->{ct} = {type => GENERAL_ENTITY_TOKEN, name => '',
5769 wakaba 1.14 line => $self->{line_prev},
5770     column => $self->{column_prev} - 6};
5771     $self->{state} = DOCTYPE_MD_STATE;
5772    
5773     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5774     $self->{line_prev} = $self->{line};
5775     $self->{column_prev} = $self->{column};
5776     $self->{column}++;
5777     $self->{nc}
5778     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5779     } else {
5780     $self->{set_nc}->($self);
5781     }
5782    
5783     redo A;
5784     } else {
5785     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
5786     line => $self->{line_prev},
5787     column => $self->{column_prev} - 1
5788     - (length $self->{kwd})
5789     + 1 * ($self->{nc} == -1));
5790     $self->{state} = BOGUS_COMMENT_STATE;
5791     ## Reconsume.
5792     $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
5793     redo A;
5794     }
5795     } elsif ($self->{state} == MD_ELEMENT_STATE) {
5796 wakaba 1.17 if ($self->{nc} == [
5797     undef,
5798     undef,
5799     0x0045, # E
5800     0x004D, # M
5801     0x0045, # E
5802     0x004E, # N
5803     ]->[length $self->{kwd}] or
5804     $self->{nc} == [
5805     undef,
5806     undef,
5807     0x0065, # e
5808     0x006D, # m
5809     0x0065, # e
5810     0x006E, # n
5811     ]->[length $self->{kwd}]) {
5812 wakaba 1.14 ## Stay in the state.
5813     $self->{kwd} .= chr $self->{nc};
5814    
5815     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5816     $self->{line_prev} = $self->{line};
5817     $self->{column_prev} = $self->{column};
5818     $self->{column}++;
5819     $self->{nc}
5820     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5821     } else {
5822     $self->{set_nc}->($self);
5823     }
5824    
5825     redo A;
5826 wakaba 1.17 } elsif ((length $self->{kwd}) == 6 and
5827     ($self->{nc} == 0x0054 or # T
5828     $self->{nc} == 0x0074)) { # t
5829     if ($self->{kwd} ne 'ELEMEN' or $self->{nc} == 0x0074) {
5830     $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
5831     text => 'ELEMENT',
5832     line => $self->{line_prev},
5833     column => $self->{column_prev} - 5);
5834     }
5835 wakaba 1.14 $self->{ct} = {type => ELEMENT_TOKEN, name => '',
5836     line => $self->{line_prev},
5837     column => $self->{column_prev} - 6};
5838     $self->{state} = DOCTYPE_MD_STATE;
5839    
5840     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5841     $self->{line_prev} = $self->{line};
5842     $self->{column_prev} = $self->{column};
5843     $self->{column}++;
5844     $self->{nc}
5845     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5846     } else {
5847     $self->{set_nc}->($self);
5848     }
5849    
5850     redo A;
5851     } else {
5852     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
5853     line => $self->{line_prev},
5854     column => $self->{column_prev} - 1
5855     - (length $self->{kwd})
5856     + 1 * ($self->{nc} == -1));
5857     $self->{state} = BOGUS_COMMENT_STATE;
5858     ## Reconsume.
5859     $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
5860     redo A;
5861     }
5862     } elsif ($self->{state} == MD_ATTLIST_STATE) {
5863 wakaba 1.17 if ($self->{nc} == [
5864     undef,
5865     0x0054, # T
5866     0x0054, # T
5867     0x004C, # L
5868     0x0049, # I
5869     0x0053, # S
5870     ]->[length $self->{kwd}] or
5871     $self->{nc} == [
5872     undef,
5873     0x0074, # t
5874     0x0074, # t
5875     0x006C, # l
5876     0x0069, # i
5877     0x0073, # s
5878     ]->[length $self->{kwd}]) {
5879 wakaba 1.14 ## Stay in the state.
5880     $self->{kwd} .= chr $self->{nc};
5881    
5882     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5883     $self->{line_prev} = $self->{line};
5884     $self->{column_prev} = $self->{column};
5885     $self->{column}++;
5886     $self->{nc}
5887     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5888     } else {
5889     $self->{set_nc}->($self);
5890     }
5891    
5892     redo A;
5893 wakaba 1.17 } elsif ((length $self->{kwd}) == 6 and
5894     ($self->{nc} == 0x0054 or # T
5895     $self->{nc} == 0x0074)) { # t
5896     if ($self->{kwd} ne 'ATTLIS' or $self->{nc} == 0x0074) {
5897     $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
5898     text => 'ATTLIST',
5899     line => $self->{line_prev},
5900     column => $self->{column_prev} - 5);
5901     }
5902 wakaba 1.14 $self->{ct} = {type => ATTLIST_TOKEN, name => '',
5903 wakaba 1.15 attrdefs => [],
5904 wakaba 1.14 line => $self->{line_prev},
5905     column => $self->{column_prev} - 6};
5906     $self->{state} = DOCTYPE_MD_STATE;
5907    
5908     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5909     $self->{line_prev} = $self->{line};
5910     $self->{column_prev} = $self->{column};
5911     $self->{column}++;
5912     $self->{nc}
5913     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5914     } else {
5915     $self->{set_nc}->($self);
5916     }
5917    
5918     redo A;
5919     } else {
5920     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
5921     line => $self->{line_prev},
5922     column => $self->{column_prev} - 1
5923     - (length $self->{kwd})
5924     + 1 * ($self->{nc} == -1));
5925     $self->{state} = BOGUS_COMMENT_STATE;
5926     ## Reconsume.
5927     $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
5928     redo A;
5929     }
5930     } elsif ($self->{state} == MD_NOTATION_STATE) {
5931 wakaba 1.17 if ($self->{nc} == [
5932     undef,
5933     0x004F, # O
5934     0x0054, # T
5935     0x0041, # A
5936     0x0054, # T
5937     0x0049, # I
5938     0x004F, # O
5939     ]->[length $self->{kwd}] or
5940     $self->{nc} == [
5941     undef,
5942     0x006F, # o
5943     0x0074, # t
5944     0x0061, # a
5945     0x0074, # t
5946     0x0069, # i
5947     0x006F, # o
5948     ]->[length $self->{kwd}]) {
5949 wakaba 1.14 ## Stay in the state.
5950     $self->{kwd} .= chr $self->{nc};
5951    
5952     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5953     $self->{line_prev} = $self->{line};
5954     $self->{column_prev} = $self->{column};
5955     $self->{column}++;
5956     $self->{nc}
5957     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5958     } else {
5959     $self->{set_nc}->($self);
5960     }
5961    
5962     redo A;
5963 wakaba 1.17 } elsif ((length $self->{kwd}) == 7 and
5964     ($self->{nc} == 0x004E or # N
5965     $self->{nc} == 0x006E)) { # n
5966     if ($self->{kwd} ne 'NOTATIO' or $self->{nc} == 0x006E) {
5967     $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
5968     text => 'NOTATION',
5969     line => $self->{line_prev},
5970     column => $self->{column_prev} - 6);
5971     }
5972 wakaba 1.14 $self->{ct} = {type => NOTATION_TOKEN, name => '',
5973     line => $self->{line_prev},
5974     column => $self->{column_prev} - 6};
5975     $self->{state} = DOCTYPE_MD_STATE;
5976    
5977     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5978     $self->{line_prev} = $self->{line};
5979     $self->{column_prev} = $self->{column};
5980     $self->{column}++;
5981     $self->{nc}
5982     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5983     } else {
5984     $self->{set_nc}->($self);
5985     }
5986    
5987     redo A;
5988     } else {
5989     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
5990     line => $self->{line_prev},
5991     column => $self->{column_prev} - 1
5992     - (length $self->{kwd})
5993     + 1 * ($self->{nc} == -1));
5994     $self->{state} = BOGUS_COMMENT_STATE;
5995     ## Reconsume.
5996     $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
5997     redo A;
5998     }
5999     } elsif ($self->{state} == DOCTYPE_MD_STATE) {
6000     ## XML5: "DOCTYPE ENTITY state", "DOCTYPE ATTLIST state", and
6001     ## "DOCTYPE NOTATION state".
6002    
6003     if ($is_space->{$self->{nc}}) {
6004     ## XML5: [NOTATION] Switch to the "DOCTYPE NOTATION identifier state".
6005     $self->{state} = BEFORE_MD_NAME_STATE;
6006    
6007     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6008     $self->{line_prev} = $self->{line};
6009     $self->{column_prev} = $self->{column};
6010     $self->{column}++;
6011     $self->{nc}
6012     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6013     } else {
6014     $self->{set_nc}->($self);
6015     }
6016    
6017     redo A;
6018     } elsif ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN and
6019     $self->{nc} == 0x0025) { # %
6020     ## XML5: Switch to the "DOCTYPE bogus comment state".
6021     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before md name'); ## TODO: type
6022     $self->{state} = DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE;
6023    
6024     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6025     $self->{line_prev} = $self->{line};
6026     $self->{column_prev} = $self->{column};
6027     $self->{column}++;
6028     $self->{nc}
6029     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6030     } else {
6031     $self->{set_nc}->($self);
6032     }
6033    
6034     redo A;
6035     } elsif ($self->{nc} == -1) {
6036     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6037     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6038     ## Reconsume.
6039     redo A;
6040     } elsif ($self->{nc} == 0x003E) { # >
6041     ## XML5: Switch to the "DOCTYPE bogus comment state".
6042     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no md name'); ## TODO: type
6043     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6044    
6045     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6046     $self->{line_prev} = $self->{line};
6047     $self->{column_prev} = $self->{column};
6048     $self->{column}++;
6049     $self->{nc}
6050     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6051     } else {
6052     $self->{set_nc}->($self);
6053     }
6054    
6055     redo A;
6056     } else {
6057     ## XML5: Switch to the "DOCTYPE bogus comment state".
6058     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before md name'); ## TODO: type
6059     $self->{state} = BEFORE_MD_NAME_STATE;
6060     redo A;
6061     }
6062     } elsif ($self->{state} == BEFORE_MD_NAME_STATE) {
6063     ## XML5: "DOCTYPE ENTITY parameter state", "DOCTYPE ENTITY type
6064     ## before state", "DOCTYPE ATTLIST name before state".
6065    
6066     if ($is_space->{$self->{nc}}) {
6067     ## Stay in the state.
6068    
6069     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6070     $self->{line_prev} = $self->{line};
6071     $self->{column_prev} = $self->{column};
6072     $self->{column}++;
6073     $self->{nc}
6074     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6075     } else {
6076     $self->{set_nc}->($self);
6077     }
6078    
6079     redo A;
6080     } elsif ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN and
6081     $self->{nc} == 0x0025) { # %
6082     $self->{state} = DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE;
6083    
6084     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6085     $self->{line_prev} = $self->{line};
6086     $self->{column_prev} = $self->{column};
6087     $self->{column}++;
6088     $self->{nc}
6089     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6090     } else {
6091     $self->{set_nc}->($self);
6092     }
6093    
6094     redo A;
6095     } elsif ($self->{nc} == 0x003E) { # >
6096     ## XML5: Same as "Anything else".
6097     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no md name'); ## TODO: type
6098     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6099    
6100     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6101     $self->{line_prev} = $self->{line};
6102     $self->{column_prev} = $self->{column};
6103     $self->{column}++;
6104     $self->{nc}
6105     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6106     } else {
6107     $self->{set_nc}->($self);
6108     }
6109    
6110     redo A;
6111     } elsif ($self->{nc} == -1) {
6112     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6113     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6114     ## Reconsume.
6115     redo A;
6116     } else {
6117     ## XML5: [ATTLIST] Not defined yet.
6118     $self->{ct}->{name} .= chr $self->{nc};
6119     $self->{state} = MD_NAME_STATE;
6120    
6121     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6122     $self->{line_prev} = $self->{line};
6123     $self->{column_prev} = $self->{column};
6124     $self->{column}++;
6125     $self->{nc}
6126     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6127     } else {
6128     $self->{set_nc}->($self);
6129     }
6130    
6131     redo A;
6132     }
6133     } elsif ($self->{state} == DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE) {
6134     if ($is_space->{$self->{nc}}) {
6135     ## XML5: Switch to the "DOCTYPE ENTITY parameter state".
6136     $self->{ct}->{type} = PARAMETER_ENTITY_TOKEN;
6137     $self->{state} = BEFORE_MD_NAME_STATE;
6138 wakaba 1.8
6139 wakaba 1.14 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6140     $self->{line_prev} = $self->{line};
6141     $self->{column_prev} = $self->{column};
6142     $self->{column}++;
6143     $self->{nc}
6144     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6145     } else {
6146     $self->{set_nc}->($self);
6147     }
6148    
6149     redo A;
6150     } elsif ($self->{nc} == 0x003E) { # >
6151     ## XML5: Same as "Anything else".
6152     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no md name'); ## TODO: type
6153     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6154    
6155     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6156     $self->{line_prev} = $self->{line};
6157     $self->{column_prev} = $self->{column};
6158     $self->{column}++;
6159     $self->{nc}
6160     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6161     } else {
6162     $self->{set_nc}->($self);
6163     }
6164    
6165     redo A;
6166     } elsif ($self->{nc} == -1) {
6167     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md');
6168     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6169     ## Reconsume.
6170     redo A;
6171     } else {
6172     ## XML5: No parse error.
6173     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space after ENTITY percent'); ## TODO: type
6174     $self->{state} = BOGUS_COMMENT_STATE;
6175     $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
6176     ## Reconsume.
6177     redo A;
6178     }
6179     } elsif ($self->{state} == MD_NAME_STATE) {
6180     ## XML5: "DOCTYPE ENTITY name state" and "DOCTYPE ATTLIST name state".
6181    
6182     if ($is_space->{$self->{nc}}) {
6183 wakaba 1.16 if ($self->{ct}->{type} == ATTLIST_TOKEN) {
6184     $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
6185     } elsif ($self->{ct}->{type} == ELEMENT_TOKEN) {
6186     ## TODO: ...
6187     $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
6188     } else { # ENTITY/NOTATION
6189     $self->{state} = AFTER_DOCTYPE_NAME_STATE;
6190     }
6191 wakaba 1.14
6192     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6193     $self->{line_prev} = $self->{line};
6194     $self->{column_prev} = $self->{column};
6195     $self->{column}++;
6196     $self->{nc}
6197     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6198     } else {
6199     $self->{set_nc}->($self);
6200     }
6201    
6202     redo A;
6203     } elsif ($self->{nc} == 0x003E) { # >
6204     if ($self->{ct}->{type} == ATTLIST_TOKEN) {
6205     #
6206     } else {
6207 wakaba 1.16 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no md def'); ## TODO: type
6208 wakaba 1.14 }
6209     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6210    
6211     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6212     $self->{line_prev} = $self->{line};
6213     $self->{column_prev} = $self->{column};
6214     $self->{column}++;
6215     $self->{nc}
6216     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6217     } else {
6218     $self->{set_nc}->($self);
6219     }
6220    
6221     return ($self->{ct}); # ELEMENT/ENTITY/ATTLIST/NOTATION
6222     redo A;
6223     } elsif ($self->{nc} == -1) {
6224     ## XML5: [ATTLIST] No parse error.
6225     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md');
6226     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6227     ## Reconsume.
6228     return ($self->{ct}); # ELEMENT/ENTITY/ATTLIST/NOTATION
6229     redo A;
6230     } else {
6231     ## XML5: [ATTLIST] Not defined yet.
6232     $self->{ct}->{name} .= chr $self->{nc};
6233     ## Stay in the state.
6234    
6235     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6236     $self->{line_prev} = $self->{line};
6237     $self->{column_prev} = $self->{column};
6238     $self->{column}++;
6239     $self->{nc}
6240     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6241     } else {
6242     $self->{set_nc}->($self);
6243     }
6244    
6245     redo A;
6246     }
6247     } elsif ($self->{state} == DOCTYPE_ATTLIST_NAME_AFTER_STATE) {
6248     if ($is_space->{$self->{nc}}) {
6249     ## Stay in the state.
6250    
6251     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6252     $self->{line_prev} = $self->{line};
6253     $self->{column_prev} = $self->{column};
6254     $self->{column}++;
6255     $self->{nc}
6256     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6257     } else {
6258     $self->{set_nc}->($self);
6259     }
6260    
6261     redo A;
6262     } elsif ($self->{nc} == 0x003E) { # >
6263     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6264    
6265     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6266     $self->{line_prev} = $self->{line};
6267     $self->{column_prev} = $self->{column};
6268     $self->{column}++;
6269     $self->{nc}
6270     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6271     } else {
6272     $self->{set_nc}->($self);
6273     }
6274    
6275     return ($self->{ct}); # ATTLIST
6276     redo A;
6277     } elsif ($self->{nc} == -1) {
6278     ## XML5: No parse error.
6279     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6280     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6281 wakaba 1.15 return ($self->{ct});
6282 wakaba 1.14 redo A;
6283     } else {
6284     ## XML5: Not defined yet.
6285 wakaba 1.15 $self->{ca} = {name => chr ($self->{nc}), # attrdef
6286     tokens => [],
6287     line => $self->{line}, column => $self->{column}};
6288     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_NAME_STATE;
6289    
6290     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6291     $self->{line_prev} = $self->{line};
6292     $self->{column_prev} = $self->{column};
6293     $self->{column}++;
6294     $self->{nc}
6295     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6296     } else {
6297     $self->{set_nc}->($self);
6298     }
6299    
6300     redo A;
6301     }
6302     } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_NAME_STATE) {
         ## ATTLIST attribute name state: extends $self->{ca}->{name} one
         ## character at a time until a delimiter is seen.
         ##   space          : switch to DOCTYPE_ATTLIST_ATTRIBUTE_NAME_AFTER_STATE.
         ##   ">"            : 'no attr type' error; set the state to
         ##                    DOCTYPE_INTERNAL_SUBSET_STATE and return $self->{ct}.
         ##   "("            : 'no space before paren' error; switch to
         ##                    BEFORE_ALLOWED_TOKEN_STATE.
         ##   EOF (nc == -1) : 'unclosed md' error; return $self->{ct}.
         ##   anything else  : append the character to the name; stay here.
         ## Every branch repeats the same "next input character" step: take the
         ## character from char_buffer when one is left (saving the previous
         ## line/column first), otherwise call the set_nc callback.
         ## NOTE(review): each "redo A" placed directly after a "return" is
         ## unreachable.
6303     if ($is_space->{$self->{nc}}) {
6304     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_NAME_AFTER_STATE;
6305    
6306     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6307     $self->{line_prev} = $self->{line};
6308     $self->{column_prev} = $self->{column};
6309     $self->{column}++;
6310     $self->{nc}
6311     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6312     } else {
6313     $self->{set_nc}->($self);
6314     }
6315    
6316     redo A;
6317     } elsif ($self->{nc} == 0x003E) { # >
6318     ## XML5: Same as "anything else".
6319     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr type'); ## TODO: type
6320     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6321    
6322     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6323     $self->{line_prev} = $self->{line};
6324     $self->{column_prev} = $self->{column};
6325     $self->{column}++;
6326     $self->{nc}
6327     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6328     } else {
6329     $self->{set_nc}->($self);
6330     }
6331    
6332     return ($self->{ct}); # ATTLIST
6333     redo A;
6334     } elsif ($self->{nc} == 0x0028) { # (
6335     ## XML5: Same as "anything else".
6336     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before paren'); ## TODO: type
6337     $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
6338    
6339     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6340     $self->{line_prev} = $self->{line};
6341     $self->{column_prev} = $self->{column};
6342     $self->{column}++;
6343     $self->{nc}
6344     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6345     } else {
6346     $self->{set_nc}->($self);
6347     }
6348    
6349     redo A;
6350     } elsif ($self->{nc} == -1) {
6351     ## XML5: No parse error.
6352     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6353     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6354    
6355     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6356     $self->{line_prev} = $self->{line};
6357     $self->{column_prev} = $self->{column};
6358     $self->{column}++;
6359     $self->{nc}
6360     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6361     } else {
6362     $self->{set_nc}->($self);
6363     }
6364    
6365     return ($self->{ct}); # ATTLIST
6366     redo A;
6367     } else {
6368     ## XML5: Not defined yet.
6369     $self->{ca}->{name} .= chr $self->{nc};
6370     ## Stay in the state.
6371    
6372     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6373     $self->{line_prev} = $self->{line};
6374     $self->{column_prev} = $self->{column};
6375     $self->{column}++;
6376     $self->{nc}
6377     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6378     } else {
6379     $self->{set_nc}->($self);
6380     }
6381    
6382 wakaba 1.14 redo A;
6383     }
6384 wakaba 1.15 } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_NAME_AFTER_STATE) {
         ## State entered after the white space that follows an ATTLIST
         ## attribute name.
         ##   space          : skipped; stay here.
         ##   ">"            : 'no attr type' error; set the state to
         ##                    DOCTYPE_INTERNAL_SUBSET_STATE and return $self->{ct}.
         ##   "("            : switch to BEFORE_ALLOWED_TOKEN_STATE (no error).
         ##   EOF (nc == -1) : 'unclosed md' error; return $self->{ct}.
         ##   anything else  : start $self->{ca}->{type} with this character and
         ##                    switch to DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_STATE.
         ## Every branch then fetches the next input character from char_buffer,
         ## or through the set_nc callback when the buffer is used up.
         ## NOTE(review): each "redo A" placed directly after a "return" is
         ## unreachable.
6385     if ($is_space->{$self->{nc}}) {
6386     ## Stay in the state.
6387    
6388     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6389     $self->{line_prev} = $self->{line};
6390     $self->{column_prev} = $self->{column};
6391     $self->{column}++;
6392     $self->{nc}
6393     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6394     } else {
6395     $self->{set_nc}->($self);
6396     }
6397    
6398     redo A;
6399     } elsif ($self->{nc} == 0x003E) { # >
6400     ## XML5: Same as "anything else".
6401     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr type'); ## TODO: type
6402     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6403    
6404     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6405     $self->{line_prev} = $self->{line};
6406     $self->{column_prev} = $self->{column};
6407     $self->{column}++;
6408     $self->{nc}
6409     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6410     } else {
6411     $self->{set_nc}->($self);
6412     }
6413    
6414     return ($self->{ct}); # ATTLIST
6415     redo A;
6416     } elsif ($self->{nc} == 0x0028) { # (
6417     ## XML5: Same as "anything else".
6418     $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
6419    
6420     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6421     $self->{line_prev} = $self->{line};
6422     $self->{column_prev} = $self->{column};
6423     $self->{column}++;
6424     $self->{nc}
6425     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6426     } else {
6427     $self->{set_nc}->($self);
6428     }
6429    
6430     redo A;
6431     } elsif ($self->{nc} == -1) {
6432     ## XML5: No parse error.
6433     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6434     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6435    
6436     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6437     $self->{line_prev} = $self->{line};
6438     $self->{column_prev} = $self->{column};
6439     $self->{column}++;
6440     $self->{nc}
6441     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6442     } else {
6443     $self->{set_nc}->($self);
6444     }
6445    
6446     return ($self->{ct});
6447     redo A;
6448     } else {
6449     ## XML5: Not defined yet.
6450     $self->{ca}->{type} = chr $self->{nc};
6451     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_STATE;
6452    
6453     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6454     $self->{line_prev} = $self->{line};
6455     $self->{column_prev} = $self->{column};
6456     $self->{column}++;
6457     $self->{nc}
6458     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6459     } else {
6460     $self->{set_nc}->($self);
6461     }
6462    
6463     redo A;
6464     }
6465     } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_STATE) {
         ## ATTLIST attribute type state: extends $self->{ca}->{type} one
         ## character at a time until a delimiter is seen.
         ##   space          : switch to DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_AFTER_STATE.
         ##   "#"            : 'no space before default value' error; switch to
         ##                    DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE.
         ##   '"' or "'"     : same error; reset $self->{ca}->{value} to '' and
         ##                    switch to the matching quoted attribute value state.
         ##   ">"            : 'no attr default' error; set the state to
         ##                    DOCTYPE_INTERNAL_SUBSET_STATE and return $self->{ct}.
         ##   "("            : 'no space before paren' error; switch to
         ##                    BEFORE_ALLOWED_TOKEN_STATE.
         ##   EOF (nc == -1) : 'unclosed md' error; return $self->{ct}.
         ##   anything else  : append the character to the type; stay here.
         ## Every branch then fetches the next input character from char_buffer,
         ## or through the set_nc callback when the buffer is used up.
         ## NOTE(review): each "redo A" placed directly after a "return" is
         ## unreachable.
6466     if ($is_space->{$self->{nc}}) {
6467     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_AFTER_STATE;
6468    
6469     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6470     $self->{line_prev} = $self->{line};
6471     $self->{column_prev} = $self->{column};
6472     $self->{column}++;
6473     $self->{nc}
6474     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6475     } else {
6476     $self->{set_nc}->($self);
6477     }
6478    
6479     redo A;
6480     } elsif ($self->{nc} == 0x0023) { # #
6481     ## XML5: Same as "anything else".
6482     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
6483     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE;
6484    
6485     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6486     $self->{line_prev} = $self->{line};
6487     $self->{column_prev} = $self->{column};
6488     $self->{column}++;
6489     $self->{nc}
6490     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6491     } else {
6492     $self->{set_nc}->($self);
6493     }
6494    
6495     redo A;
6496     } elsif ($self->{nc} == 0x0022) { # "
6497     ## XML5: Same as "anything else".
6498     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
6499     $self->{ca}->{value} = '';
6500     $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
6501    
6502     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6503     $self->{line_prev} = $self->{line};
6504     $self->{column_prev} = $self->{column};
6505     $self->{column}++;
6506     $self->{nc}
6507     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6508     } else {
6509     $self->{set_nc}->($self);
6510     }
6511    
6512     redo A;
6513     } elsif ($self->{nc} == 0x0027) { # '
6514     ## XML5: Same as "anything else".
6515     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
6516     $self->{ca}->{value} = '';
6517     $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
6518    
6519     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6520     $self->{line_prev} = $self->{line};
6521     $self->{column_prev} = $self->{column};
6522     $self->{column}++;
6523     $self->{nc}
6524     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6525     } else {
6526     $self->{set_nc}->($self);
6527     }
6528    
6529     redo A;
6530     } elsif ($self->{nc} == 0x003E) { # >
6531     ## XML5: Same as "anything else".
6532     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr default'); ## TODO: type
6533     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6534    
6535     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6536     $self->{line_prev} = $self->{line};
6537     $self->{column_prev} = $self->{column};
6538     $self->{column}++;
6539     $self->{nc}
6540     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6541     } else {
6542     $self->{set_nc}->($self);
6543     }
6544    
6545     return ($self->{ct}); # ATTLIST
6546     redo A;
6547     } elsif ($self->{nc} == 0x0028) { # (
6548     ## XML5: Same as "anything else".
6549     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before paren'); ## TODO: type
6550     $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
6551    
6552     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6553     $self->{line_prev} = $self->{line};
6554     $self->{column_prev} = $self->{column};
6555     $self->{column}++;
6556     $self->{nc}
6557     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6558     } else {
6559     $self->{set_nc}->($self);
6560     }
6561    
6562     redo A;
6563     } elsif ($self->{nc} == -1) {
6564     ## XML5: No parse error.
6565     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6566     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6567    
6568     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6569     $self->{line_prev} = $self->{line};
6570     $self->{column_prev} = $self->{column};
6571     $self->{column}++;
6572     $self->{nc}
6573     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6574     } else {
6575     $self->{set_nc}->($self);
6576     }
6577    
6578     return ($self->{ct});
6579     redo A;
6580     } else {
6581     ## XML5: Not defined yet.
6582     $self->{ca}->{type} .= chr $self->{nc};
6583     ## Stay in the state.
6584    
6585     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6586     $self->{line_prev} = $self->{line};
6587     $self->{column_prev} = $self->{column};
6588     $self->{column}++;
6589     $self->{nc}
6590     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6591     } else {
6592     $self->{set_nc}->($self);
6593     }
6594    
6595     redo A;
6596     }
6597     } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_AFTER_STATE) {
         ## State entered after the white space that follows an ATTLIST
         ## attribute type.
         ##   space          : skipped; stay here.
         ##   "("            : switch to BEFORE_ALLOWED_TOKEN_STATE.
         ##   "#"            : switch to
         ##                    DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE.
         ##   '"' or "'"     : reset $self->{ca}->{value} to '' and switch to the
         ##                    matching quoted attribute value state.
         ##   ">"            : 'no attr default' error; set the state to
         ##                    DOCTYPE_INTERNAL_SUBSET_STATE and return $self->{ct}.
         ##   EOF (nc == -1) : 'unclosed md' error; return $self->{ct}.
         ##   anything else  : 'unquoted attr value' error; reset the value to ''
         ##                    and switch to ATTRIBUTE_VALUE_UNQUOTED_STATE without
         ##                    consuming the character (it is reconsumed there).
         ## All branches except the last fetch the next input character from
         ## char_buffer, or through the set_nc callback when the buffer is used up.
         ## NOTE(review): each "redo A" placed directly after a "return" is
         ## unreachable.
6598     if ($is_space->{$self->{nc}}) {
6599     ## Stay in the state.
6600    
6601     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6602     $self->{line_prev} = $self->{line};
6603     $self->{column_prev} = $self->{column};
6604     $self->{column}++;
6605     $self->{nc}
6606     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6607     } else {
6608     $self->{set_nc}->($self);
6609     }
6610    
6611     redo A;
6612     } elsif ($self->{nc} == 0x0028) { # (
6613     ## XML5: Same as "anything else".
6614     $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
6615    
6616     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6617     $self->{line_prev} = $self->{line};
6618     $self->{column_prev} = $self->{column};
6619     $self->{column}++;
6620     $self->{nc}
6621     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6622     } else {
6623     $self->{set_nc}->($self);
6624     }
6625    
6626     redo A;
6627     } elsif ($self->{nc} == 0x0023) { # #
6628     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE;
6629    
6630     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6631     $self->{line_prev} = $self->{line};
6632     $self->{column_prev} = $self->{column};
6633     $self->{column}++;
6634     $self->{nc}
6635     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6636     } else {
6637     $self->{set_nc}->($self);
6638     }
6639    
6640     redo A;
6641     } elsif ($self->{nc} == 0x0022) { # "
6642     ## XML5: Same as "anything else".
6643     $self->{ca}->{value} = '';
6644     $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
6645    
6646     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6647     $self->{line_prev} = $self->{line};
6648     $self->{column_prev} = $self->{column};
6649     $self->{column}++;
6650     $self->{nc}
6651     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6652     } else {
6653     $self->{set_nc}->($self);
6654     }
6655    
6656     redo A;
6657     } elsif ($self->{nc} == 0x0027) { # '
6658     ## XML5: Same as "anything else".
6659     $self->{ca}->{value} = '';
6660     $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
6661    
6662     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6663     $self->{line_prev} = $self->{line};
6664     $self->{column_prev} = $self->{column};
6665     $self->{column}++;
6666     $self->{nc}
6667     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6668     } else {
6669     $self->{set_nc}->($self);
6670     }
6671    
6672     redo A;
6673     } elsif ($self->{nc} == 0x003E) { # >
6674     ## XML5: Same as "anything else".
6675     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr default'); ## TODO: type
6676     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6677    
6678     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6679     $self->{line_prev} = $self->{line};
6680     $self->{column_prev} = $self->{column};
6681     $self->{column}++;
6682     $self->{nc}
6683     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6684     } else {
6685     $self->{set_nc}->($self);
6686     }
6687    
6688     return ($self->{ct}); # ATTLIST
6689     redo A;
6690     } elsif ($self->{nc} == -1) {
6691     ## XML5: No parse error.
6692     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6693     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6694    
6695     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6696     $self->{line_prev} = $self->{line};
6697     $self->{column_prev} = $self->{column};
6698     $self->{column}++;
6699     $self->{nc}
6700     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6701     } else {
6702     $self->{set_nc}->($self);
6703     }
6704    
6705     return ($self->{ct});
6706     redo A;
6707     } else {
6708     ## XML5: Switch to the "DOCTYPE bogus comment state".
6709     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unquoted attr value'); ## TODO: type
6710     $self->{ca}->{value} = '';
6711     $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
6712     ## Reconsume.
6713     redo A;
6714     }
6715     } elsif ($self->{state} == BEFORE_ALLOWED_TOKEN_STATE) {
         ## State entered after "(" or "|" inside an allowed-token list: expects
         ## the first character of the next token.
         ##   space          : skipped; stay here.
         ##   "|"            : 'empty allowed token' error; stay here.
         ##   ")"            : 'empty allowed token' error; switch to
         ##                    AFTER_ALLOWED_TOKENS_STATE.
         ##   ">"            : 'unclosed allowed tokens' error; set the state to
         ##                    DOCTYPE_INTERNAL_SUBSET_STATE and return $self->{ct}.
         ##   EOF (nc == -1) : 'unclosed md' error; return $self->{ct}.
         ##   anything else  : push a new one-character token onto
         ##                    $self->{ca}->{tokens}; switch to ALLOWED_TOKEN_STATE.
         ## Every branch then fetches the next input character from char_buffer,
         ## or through the set_nc callback when the buffer is used up.
         ## NOTE(review): each "redo A" placed directly after a "return" is
         ## unreachable.
6716     if ($is_space->{$self->{nc}}) {
6717     ## Stay in the state.
6718    
6719     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6720     $self->{line_prev} = $self->{line};
6721     $self->{column_prev} = $self->{column};
6722     $self->{column}++;
6723     $self->{nc}
6724     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6725     } else {
6726     $self->{set_nc}->($self);
6727     }
6728    
6729     redo A;
6730     } elsif ($self->{nc} == 0x007C) { # |
6731     $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty allowed token'); ## TODO: type
6732     ## Stay in the state.
6733    
6734     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6735     $self->{line_prev} = $self->{line};
6736     $self->{column_prev} = $self->{column};
6737     $self->{column}++;
6738     $self->{nc}
6739     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6740     } else {
6741     $self->{set_nc}->($self);
6742     }
6743    
6744     redo A;
6745     } elsif ($self->{nc} == 0x0029) { # )
6746     $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty allowed token'); ## TODO: type
6747     $self->{state} = AFTER_ALLOWED_TOKENS_STATE;
6748    
6749     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6750     $self->{line_prev} = $self->{line};
6751     $self->{column_prev} = $self->{column};
6752     $self->{column}++;
6753     $self->{nc}
6754     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6755     } else {
6756     $self->{set_nc}->($self);
6757     }
6758    
6759     redo A;
6760     } elsif ($self->{nc} == 0x003E) { # >
6761     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed allowed tokens'); ## TODO: type
6762     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6763    
6764     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6765     $self->{line_prev} = $self->{line};
6766     $self->{column_prev} = $self->{column};
6767     $self->{column}++;
6768     $self->{nc}
6769     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6770     } else {
6771     $self->{set_nc}->($self);
6772     }
6773    
6774     return ($self->{ct}); # ATTLIST
6775     redo A;
6776     } elsif ($self->{nc} == -1) {
6777     ## XML5: No parse error.
6778     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6779     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6780    
6781     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6782     $self->{line_prev} = $self->{line};
6783     $self->{column_prev} = $self->{column};
6784     $self->{column}++;
6785     $self->{nc}
6786     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6787     } else {
6788     $self->{set_nc}->($self);
6789     }
6790    
6791     return ($self->{ct});
6792     redo A;
6793     } else {
6794     push @{$self->{ca}->{tokens}}, chr $self->{nc};
6795     $self->{state} = ALLOWED_TOKEN_STATE;
6796    
6797     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6798     $self->{line_prev} = $self->{line};
6799     $self->{column_prev} = $self->{column};
6800     $self->{column}++;
6801     $self->{nc}
6802     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6803     } else {
6804     $self->{set_nc}->($self);
6805     }
6806    
6807     redo A;
6808     }
6809     } elsif ($self->{state} == ALLOWED_TOKEN_STATE) {
         ## Inside one token of an allowed-token list: extends the last entry of
         ## $self->{ca}->{tokens} until a delimiter is seen.
         ##   space          : switch to AFTER_ALLOWED_TOKEN_STATE.
         ##   "|"            : switch to BEFORE_ALLOWED_TOKEN_STATE.
         ##   ")"            : switch to AFTER_ALLOWED_TOKENS_STATE.
         ##   ">"            : 'unclosed allowed tokens' error; set the state to
         ##                    DOCTYPE_INTERNAL_SUBSET_STATE and return $self->{ct}.
         ##   EOF (nc == -1) : 'unclosed md' error; return $self->{ct}.
         ##   anything else  : append the character to the last token; stay here.
         ## Every branch then fetches the next input character from char_buffer,
         ## or through the set_nc callback when the buffer is used up.
         ## NOTE(review): each "redo A" placed directly after a "return" is
         ## unreachable.
6810     if ($is_space->{$self->{nc}}) {
6811     $self->{state} = AFTER_ALLOWED_TOKEN_STATE;
6812    
6813     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6814     $self->{line_prev} = $self->{line};
6815     $self->{column_prev} = $self->{column};
6816     $self->{column}++;
6817     $self->{nc}
6818     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6819     } else {
6820     $self->{set_nc}->($self);
6821     }
6822    
6823     redo A;
6824     } elsif ($self->{nc} == 0x007C) { # |
6825     $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
6826    
6827     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6828     $self->{line_prev} = $self->{line};
6829     $self->{column_prev} = $self->{column};
6830     $self->{column}++;
6831     $self->{nc}
6832     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6833     } else {
6834     $self->{set_nc}->($self);
6835     }
6836    
6837     redo A;
6838     } elsif ($self->{nc} == 0x0029) { # )
6839     $self->{state} = AFTER_ALLOWED_TOKENS_STATE;
6840    
6841     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6842     $self->{line_prev} = $self->{line};
6843     $self->{column_prev} = $self->{column};
6844     $self->{column}++;
6845     $self->{nc}
6846     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6847     } else {
6848     $self->{set_nc}->($self);
6849     }
6850    
6851     redo A;
6852     } elsif ($self->{nc} == 0x003E) { # >
6853     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed allowed tokens'); ## TODO: type
6854     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6855    
6856     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6857     $self->{line_prev} = $self->{line};
6858     $self->{column_prev} = $self->{column};
6859     $self->{column}++;
6860     $self->{nc}
6861     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6862     } else {
6863     $self->{set_nc}->($self);
6864     }
6865    
6866     return ($self->{ct}); # ATTLIST
6867     redo A;
6868     } elsif ($self->{nc} == -1) {
6869     ## XML5: No parse error.
6870     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6871     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6872    
6873     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6874     $self->{line_prev} = $self->{line};
6875     $self->{column_prev} = $self->{column};
6876     $self->{column}++;
6877     $self->{nc}
6878     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6879     } else {
6880     $self->{set_nc}->($self);
6881     }
6882    
6883     return ($self->{ct});
6884     redo A;
6885     } else {
6886     $self->{ca}->{tokens}->[-1] .= chr $self->{nc};
6887     ## Stay in the state.
6888    
6889     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6890     $self->{line_prev} = $self->{line};
6891     $self->{column_prev} = $self->{column};
6892     $self->{column}++;
6893     $self->{nc}
6894     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6895     } else {
6896     $self->{set_nc}->($self);
6897     }
6898    
6899     redo A;
6900     }
6901     } elsif ($self->{state} == AFTER_ALLOWED_TOKEN_STATE) {
         ## State entered after white space that follows one allowed token
         ## (singular; the state after ")" is AFTER_ALLOWED_TOKENS_STATE).
         ##   space          : skipped; stay here.
         ##   "|"            : switch to BEFORE_ALLOWED_TOKEN_STATE.
         ##   ")"            : switch to AFTER_ALLOWED_TOKENS_STATE.
         ##   ">"            : 'unclosed allowed tokens' error; set the state to
         ##                    DOCTYPE_INTERNAL_SUBSET_STATE and return $self->{ct}.
         ##   EOF (nc == -1) : 'unclosed md' error; return $self->{ct}.
         ##   anything else  : 'space in allowed token' error, reported at the
         ##                    previous line/column; the last token is extended
         ##                    with a single space plus this character and the
         ##                    state goes back to ALLOWED_TOKEN_STATE.
         ## Every branch then fetches the next input character from char_buffer,
         ## or through the set_nc callback when the buffer is used up.
         ## NOTE(review): each "redo A" placed directly after a "return" is
         ## unreachable.
6902     if ($is_space->{$self->{nc}}) {
6903     ## Stay in the state.
6904    
6905     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6906     $self->{line_prev} = $self->{line};
6907     $self->{column_prev} = $self->{column};
6908     $self->{column}++;
6909     $self->{nc}
6910     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6911     } else {
6912     $self->{set_nc}->($self);
6913     }
6914    
6915     redo A;
6916     } elsif ($self->{nc} == 0x007C) { # |
6917     $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
6918    
6919     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6920     $self->{line_prev} = $self->{line};
6921     $self->{column_prev} = $self->{column};
6922     $self->{column}++;
6923     $self->{nc}
6924     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6925     } else {
6926     $self->{set_nc}->($self);
6927     }
6928    
6929     redo A;
6930     } elsif ($self->{nc} == 0x0029) { # )
6931     $self->{state} = AFTER_ALLOWED_TOKENS_STATE;
6932    
6933     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6934     $self->{line_prev} = $self->{line};
6935     $self->{column_prev} = $self->{column};
6936     $self->{column}++;
6937     $self->{nc}
6938     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6939     } else {
6940     $self->{set_nc}->($self);
6941     }
6942    
6943     redo A;
6944     } elsif ($self->{nc} == 0x003E) { # >
6945     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed allowed tokens'); ## TODO: type
6946     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6947    
6948     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6949     $self->{line_prev} = $self->{line};
6950     $self->{column_prev} = $self->{column};
6951     $self->{column}++;
6952     $self->{nc}
6953     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6954     } else {
6955     $self->{set_nc}->($self);
6956     }
6957    
6958     return ($self->{ct}); # ATTLIST
6959     redo A;
6960     } elsif ($self->{nc} == -1) {
6961     ## XML5: No parse error.
6962     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6963     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6964    
6965     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6966     $self->{line_prev} = $self->{line};
6967     $self->{column_prev} = $self->{column};
6968     $self->{column}++;
6969     $self->{nc}
6970     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6971     } else {
6972     $self->{set_nc}->($self);
6973     }
6974    
6975     return ($self->{ct});
6976     redo A;
6977     } else {
6978     $self->{parse_error}->(level => $self->{level}->{must}, type => 'space in allowed token', ## TODO: type
6979     line => $self->{line_prev},
6980     column => $self->{column_prev});
6981     $self->{ca}->{tokens}->[-1] .= ' ' . chr $self->{nc};
6982     $self->{state} = ALLOWED_TOKEN_STATE;
6983    
6984     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6985     $self->{line_prev} = $self->{line};
6986     $self->{column_prev} = $self->{column};
6987     $self->{column}++;
6988     $self->{nc}
6989     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6990     } else {
6991     $self->{set_nc}->($self);
6992     }
6993    
6994     redo A;
6995     }
6996     } elsif ($self->{state} == AFTER_ALLOWED_TOKENS_STATE) {
         ## State entered right after the ")" that closes an allowed-token list.
         ##   space          : switch to BEFORE_ATTR_DEFAULT_STATE.
         ##   "#"            : 'no space before default value' error; switch to
         ##                    DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE.
         ##   '"' or "'"     : same error; reset $self->{ca}->{value} to '' and
         ##                    switch to the matching quoted attribute value state.
         ##   ">"            : 'no attr default' error; set the state to
         ##                    DOCTYPE_INTERNAL_SUBSET_STATE and return $self->{ct}.
         ##   EOF (nc == -1) : 'unclosed md' error; return $self->{ct}.
         ##   anything else  : 'unquoted attr value' error; switch to
         ##                    ATTRIBUTE_VALUE_UNQUOTED_STATE without consuming the
         ##                    character (it is reconsumed there).
         ## All branches except the last fetch the next input character from
         ## char_buffer, or through the set_nc callback when the buffer is used up.
         ## NOTE(review): the "anything else" branch does not reset
         ## $self->{ca}->{value} to '', although the corresponding branch of
         ## DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_AFTER_STATE does -- confirm whether
         ## the difference is intended.
         ## NOTE(review): each "redo A" placed directly after a "return" is
         ## unreachable.
6997     if ($is_space->{$self->{nc}}) {
6998     $self->{state} = BEFORE_ATTR_DEFAULT_STATE;
6999    
7000     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7001     $self->{line_prev} = $self->{line};
7002     $self->{column_prev} = $self->{column};
7003     $self->{column}++;
7004     $self->{nc}
7005     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7006     } else {
7007     $self->{set_nc}->($self);
7008     }
7009    
7010     redo A;
7011     } elsif ($self->{nc} == 0x0023) { # #
7012     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
7013     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE;
7014    
7015     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7016     $self->{line_prev} = $self->{line};
7017     $self->{column_prev} = $self->{column};
7018     $self->{column}++;
7019     $self->{nc}
7020     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7021     } else {
7022     $self->{set_nc}->($self);
7023     }
7024    
7025     redo A;
7026     } elsif ($self->{nc} == 0x0022) { # "
7027     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
7028     $self->{ca}->{value} = '';
7029     $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
7030    
7031     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7032     $self->{line_prev} = $self->{line};
7033     $self->{column_prev} = $self->{column};
7034     $self->{column}++;
7035     $self->{nc}
7036     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7037     } else {
7038     $self->{set_nc}->($self);
7039     }
7040    
7041     redo A;
7042     } elsif ($self->{nc} == 0x0027) { # '
7043     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
7044     $self->{ca}->{value} = '';
7045     $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
7046    
7047     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7048     $self->{line_prev} = $self->{line};
7049     $self->{column_prev} = $self->{column};
7050     $self->{column}++;
7051     $self->{nc}
7052     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7053     } else {
7054     $self->{set_nc}->($self);
7055     }
7056    
7057     redo A;
7058     } elsif ($self->{nc} == 0x003E) { # >
7059     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr default'); ## TODO: type
7060     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7061    
7062     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7063     $self->{line_prev} = $self->{line};
7064     $self->{column_prev} = $self->{column};
7065     $self->{column}++;
7066     $self->{nc}
7067     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7068     } else {
7069     $self->{set_nc}->($self);
7070     }
7071    
7072     return ($self->{ct}); # ATTLIST
7073     redo A;
7074     } elsif ($self->{nc} == -1) {
7075     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7076     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7077    
7078     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7079     $self->{line_prev} = $self->{line};
7080     $self->{column_prev} = $self->{column};
7081     $self->{column}++;
7082     $self->{nc}
7083     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7084     } else {
7085     $self->{set_nc}->($self);
7086     }
7087    
7088     return ($self->{ct});
7089     redo A;
7090     } else {
7091     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unquoted attr value'); ## TODO: type
7092     $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
7093     ## Reconsume.
7094     redo A;
7095     }
7096     } elsif ($self->{state} == BEFORE_ATTR_DEFAULT_STATE) {
         ## State entered after white space that follows the ")" closing an
         ## allowed-token list; expects the attribute default.
         ##   space          : skipped; stay here.
         ##   "#"            : switch to
         ##                    DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE.
         ##   '"' or "'"     : reset $self->{ca}->{value} to '' and switch to the
         ##                    matching quoted attribute value state.
         ##   ">"            : 'no attr default' error; set the state to
         ##                    DOCTYPE_INTERNAL_SUBSET_STATE and return $self->{ct}.
         ##   EOF (nc == -1) : 'unclosed md' error; return $self->{ct}.
         ##   anything else  : 'unquoted attr value' error; switch to
         ##                    ATTRIBUTE_VALUE_UNQUOTED_STATE without consuming the
         ##                    character (it is reconsumed there).
         ## All branches except the last fetch the next input character from
         ## char_buffer, or through the set_nc callback when the buffer is used up.
         ## NOTE(review): the "anything else" branch does not reset
         ## $self->{ca}->{value} to '', although the corresponding branch of
         ## DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_AFTER_STATE does -- confirm whether
         ## the difference is intended.
         ## NOTE(review): each "redo A" placed directly after a "return" is
         ## unreachable.
7097     if ($is_space->{$self->{nc}}) {
7098     ## Stay in the state.
7099    
7100     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7101     $self->{line_prev} = $self->{line};
7102     $self->{column_prev} = $self->{column};
7103     $self->{column}++;
7104     $self->{nc}
7105     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7106     } else {
7107     $self->{set_nc}->($self);
7108     }
7109    
7110     redo A;
7111     } elsif ($self->{nc} == 0x0023) { # #
7112     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE;
7113    
7114     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7115     $self->{line_prev} = $self->{line};
7116     $self->{column_prev} = $self->{column};
7117     $self->{column}++;
7118     $self->{nc}
7119     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7120     } else {
7121     $self->{set_nc}->($self);
7122     }
7123    
7124     redo A;
7125     } elsif ($self->{nc} == 0x0022) { # "
7126     $self->{ca}->{value} = '';
7127     $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
7128    
7129     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7130     $self->{line_prev} = $self->{line};
7131     $self->{column_prev} = $self->{column};
7132     $self->{column}++;
7133     $self->{nc}
7134     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7135     } else {
7136     $self->{set_nc}->($self);
7137     }
7138    
7139     redo A;
7140     } elsif ($self->{nc} == 0x0027) { # '
7141     $self->{ca}->{value} = '';
7142     $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
7143    
7144     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7145     $self->{line_prev} = $self->{line};
7146     $self->{column_prev} = $self->{column};
7147     $self->{column}++;
7148     $self->{nc}
7149     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7150     } else {
7151     $self->{set_nc}->($self);
7152     }
7153    
7154     redo A;
7155     } elsif ($self->{nc} == 0x003E) { # >
7156     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr default'); ## TODO: type
7157     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7158    
7159     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7160     $self->{line_prev} = $self->{line};
7161     $self->{column_prev} = $self->{column};
7162     $self->{column}++;
7163     $self->{nc}
7164     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7165     } else {
7166     $self->{set_nc}->($self);
7167     }
7168    
7169     return ($self->{ct}); # ATTLIST
7170     redo A;
7171     } elsif ($self->{nc} == -1) {
7172     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7173     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7174    
7175     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7176     $self->{line_prev} = $self->{line};
7177     $self->{column_prev} = $self->{column};
7178     $self->{column}++;
7179     $self->{nc}
7180     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7181     } else {
7182     $self->{set_nc}->($self);
7183     }
7184    
7185     return ($self->{ct});
7186     redo A;
7187     } else {
7188     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unquoted attr value'); ## TODO: type
7189     $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
7190     ## Reconsume.
7191     redo A;
7192     }
7193     } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE) {
7194     if ($is_space->{$self->{nc}}) {
7195     ## XML5: No parse error.
7196     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no default type'); ## TODO: type
7197 wakaba 1.16 $self->{state} = BOGUS_MD_STATE;
7198 wakaba 1.15 ## Reconsume.
7199     redo A;
7200     } elsif ($self->{nc} == 0x0022) { # "
7201     ## XML5: Same as "anything else".
7202     $self->{ca}->{value} = '';
7203     $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
7204    
7205     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7206     $self->{line_prev} = $self->{line};
7207     $self->{column_prev} = $self->{column};
7208     $self->{column}++;
7209     $self->{nc}
7210     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7211     } else {
7212     $self->{set_nc}->($self);
7213     }
7214    
7215     redo A;
7216     } elsif ($self->{nc} == 0x0027) { # '
7217     ## XML5: Same as "anything else".
7218     $self->{ca}->{value} = '';
7219     $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
7220    
7221     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7222     $self->{line_prev} = $self->{line};
7223     $self->{column_prev} = $self->{column};
7224     $self->{column}++;
7225     $self->{nc}
7226     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7227     } else {
7228     $self->{set_nc}->($self);
7229     }
7230    
7231     redo A;
7232     } elsif ($self->{nc} == 0x003E) { # >
7233     ## XML5: Same as "anything else".
7234     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr default'); ## TODO: type
7235     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7236    
7237     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7238     $self->{line_prev} = $self->{line};
7239     $self->{column_prev} = $self->{column};
7240     $self->{column}++;
7241     $self->{nc}
7242     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7243     } else {
7244     $self->{set_nc}->($self);
7245     }
7246    
7247     return ($self->{ct}); # ATTLIST
7248     redo A;
7249     } elsif ($self->{nc} == -1) {
7250     ## XML5: No parse error.
7251     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7252     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
7253    
7254     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7255     $self->{line_prev} = $self->{line};
7256     $self->{column_prev} = $self->{column};
7257     $self->{column}++;
7258     $self->{nc}
7259     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7260     } else {
7261     $self->{set_nc}->($self);
7262     }
7263    
7264     return ($self->{ct});
7265     redo A;
7266     } else {
7267     $self->{ca}->{default} = chr $self->{nc};
7268     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_STATE;
7269    
7270     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7271     $self->{line_prev} = $self->{line};
7272     $self->{column_prev} = $self->{column};
7273     $self->{column}++;
7274     $self->{nc}
7275     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7276     } else {
7277     $self->{set_nc}->($self);
7278     }
7279    
7280     redo A;
7281     }
7282     } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_STATE) {
7283     if ($is_space->{$self->{nc}}) {
7284     $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_AFTER_STATE;
7285    
7286     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7287     $self->{line_prev} = $self->{line};
7288     $self->{column_prev} = $self->{column};
7289     $self->{column}++;
7290     $self->{nc}
7291     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7292     } else {
7293     $self->{set_nc}->($self);
7294     }
7295    
7296     redo A;
7297     } elsif ($self->{nc} == 0x0022) { # "
7298     ## XML5: Same as "anything else".
7299     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
7300     $self->{ca}->{value} = '';
7301     $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
7302    
7303     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7304     $self->{line_prev} = $self->{line};
7305     $self->{column_prev} = $self->{column};
7306     $self->{column}++;
7307     $self->{nc}
7308     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7309     } else {
7310     $self->{set_nc}->($self);
7311     }
7312    
7313     redo A;
7314     } elsif ($self->{nc} == 0x0027) { # '
7315     ## XML5: Same as "anything else".
7316     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
7317     $self->{ca}->{value} = '';
7318     $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
7319    
7320     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7321     $self->{line_prev} = $self->{line};
7322     $self->{column_prev} = $self->{column};
7323     $self->{column}++;
7324     $self->{nc}
7325     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7326     } else {
7327     $self->{set_nc}->($self);
7328     }
7329    
7330     redo A;
7331     } elsif ($self->{nc} == 0x003E) { # >
7332     ## XML5: Same as "anything else".
7333     push @{$self->{ct}->{attrdefs}}, $self->{ca};
7334     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7335    
7336     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7337     $self->{line_prev} = $self->{line};
7338     $self->{column_prev} = $self->{column};
7339     $self->{column}++;
7340     $self->{nc}
7341     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7342     } else {
7343     $self->{set_nc}->($self);
7344     }
7345    
7346     return ($self->{ct}); # ATTLIST
7347     redo A;
7348     } elsif ($self->{nc} == -1) {
7349     ## XML5: No parse error.
7350     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7351     push @{$self->{ct}->{attrdefs}}, $self->{ca};
7352     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
7353    
7354     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7355     $self->{line_prev} = $self->{line};
7356     $self->{column_prev} = $self->{column};
7357     $self->{column}++;
7358     $self->{nc}
7359     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7360     } else {
7361     $self->{set_nc}->($self);
7362     }
7363    
7364     return ($self->{ct});
7365     redo A;
7366     } else {
7367     $self->{ca}->{default} .= chr $self->{nc};
7368     ## Stay in the state.
7369    
7370     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7371     $self->{line_prev} = $self->{line};
7372     $self->{column_prev} = $self->{column};
7373     $self->{column}++;
7374     $self->{nc}
7375     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7376     } else {
7377     $self->{set_nc}->($self);
7378     }
7379    
7380     redo A;
7381     }
7382     } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_AFTER_STATE) {
7383     if ($is_space->{$self->{nc}}) {
7384     ## Stay in the state.
7385    
7386     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7387     $self->{line_prev} = $self->{line};
7388     $self->{column_prev} = $self->{column};
7389     $self->{column}++;
7390     $self->{nc}
7391     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7392     } else {
7393     $self->{set_nc}->($self);
7394     }
7395    
7396     redo A;
7397     } elsif ($self->{nc} == 0x0022) { # "
7398     $self->{ca}->{value} = '';
7399     $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
7400    
7401     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7402     $self->{line_prev} = $self->{line};
7403     $self->{column_prev} = $self->{column};
7404     $self->{column}++;
7405     $self->{nc}
7406     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7407     } else {
7408     $self->{set_nc}->($self);
7409     }
7410    
7411     redo A;
7412     } elsif ($self->{nc} == 0x0027) { # '
7413     $self->{ca}->{value} = '';
7414     $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
7415    
7416     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7417     $self->{line_prev} = $self->{line};
7418     $self->{column_prev} = $self->{column};
7419     $self->{column}++;
7420     $self->{nc}
7421     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7422     } else {
7423     $self->{set_nc}->($self);
7424     }
7425    
7426     redo A;
7427     } elsif ($self->{nc} == 0x003E) { # >
7428     push @{$self->{ct}->{attrdefs}}, $self->{ca};
7429     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7430    
7431     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7432     $self->{line_prev} = $self->{line};
7433     $self->{column_prev} = $self->{column};
7434     $self->{column}++;
7435     $self->{nc}
7436     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7437     } else {
7438     $self->{set_nc}->($self);
7439     }
7440    
7441     return ($self->{ct}); # ATTLIST
7442     redo A;
7443     } elsif ($self->{nc} == -1) {
7444     ## XML5: No parse error.
7445     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7446     push @{$self->{ct}->{attrdefs}}, $self->{ca};
7447     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
7448    
7449     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7450     $self->{line_prev} = $self->{line};
7451     $self->{column_prev} = $self->{column};
7452     $self->{column}++;
7453     $self->{nc}
7454     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7455     } else {
7456     $self->{set_nc}->($self);
7457     }
7458    
7459     return ($self->{ct});
7460     redo A;
7461     } else {
7462     ## XML5: Not defined yet.
7463     if ($self->{ca}->{default} eq 'FIXED') {
7464     $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
7465     } else {
7466     push @{$self->{ct}->{attrdefs}}, $self->{ca};
7467     $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
7468     }
7469     ## Reconsume.
7470     redo A;
7471     }
7472     } elsif ($self->{state} == AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE) {
7473     if ($is_space->{$self->{nc}} or
7474     $self->{nc} == -1 or
7475     $self->{nc} == 0x003E) { # >
7476     $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
7477     ## Reconsume.
7478     redo A;
7479     } else {
7480     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before attr name'); ## TODO: type
7481     $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
7482     ## Reconsume.
7483     redo A;
7484 wakaba 1.16 }
7485 wakaba 1.18 } elsif ($self->{state} == NDATA_STATE) {
7486     ## ASCII case-insensitive
7487     if ($self->{nc} == [
7488     undef,
7489     0x0044, # D
7490     0x0041, # A
7491     0x0054, # T
7492     ]->[length $self->{kwd}] or
7493     $self->{nc} == [
7494     undef,
7495     0x0064, # d
7496     0x0061, # a
7497     0x0074, # t
7498     ]->[length $self->{kwd}]) {
7499    
7500     ## Stay in the state.
7501     $self->{kwd} .= chr $self->{nc};
7502    
7503     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7504     $self->{line_prev} = $self->{line};
7505     $self->{column_prev} = $self->{column};
7506     $self->{column}++;
7507     $self->{nc}
7508     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7509     } else {
7510     $self->{set_nc}->($self);
7511     }
7512    
7513     redo A;
7514     } elsif ((length $self->{kwd}) == 4 and
7515     ($self->{nc} == 0x0041 or # A
7516     $self->{nc} == 0x0061)) { # a
7517     if ($self->{kwd} ne 'NDAT' or $self->{nc} == 0x0061) { # a
7518    
7519     $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
7520     text => 'NDATA',
7521     line => $self->{line_prev},
7522     column => $self->{column_prev} - 4);
7523     } else {
7524    
7525     }
7526     $self->{state} = AFTER_NDATA_STATE;
7527    
7528     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7529     $self->{line_prev} = $self->{line};
7530     $self->{column_prev} = $self->{column};
7531     $self->{column}++;
7532     $self->{nc}
7533     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7534     } else {
7535     $self->{set_nc}->($self);
7536     }
7537    
7538     redo A;
7539     } else {
7540     $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after literal', ## TODO: type
7541     line => $self->{line_prev},
7542     column => $self->{column_prev} + 1
7543     - length $self->{kwd});
7544    
7545     $self->{state} = BOGUS_MD_STATE;
7546     ## Reconsume.
7547     redo A;
7548     }
7549     } elsif ($self->{state} == AFTER_NDATA_STATE) {
7550     if ($is_space->{$self->{nc}}) {
7551     $self->{state} = BEFORE_NOTATION_NAME_STATE;
7552    
7553     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7554     $self->{line_prev} = $self->{line};
7555     $self->{column_prev} = $self->{column};
7556     $self->{column}++;
7557     $self->{nc}
7558     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7559     } else {
7560     $self->{set_nc}->($self);
7561     }
7562    
7563     redo A;
7564     } elsif ($self->{nc} == 0x003E) { # >
7565     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no notation name'); ## TODO: type
7566     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7567    
7568     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7569     $self->{line_prev} = $self->{line};
7570     $self->{column_prev} = $self->{column};
7571     $self->{column}++;
7572     $self->{nc}
7573     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7574     } else {
7575     $self->{set_nc}->($self);
7576     }
7577    
7578     return ($self->{ct}); # ENTITY
7579     redo A;
7580     } elsif ($self->{nc} == -1) {
7581     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7582     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7583    
7584     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7585     $self->{line_prev} = $self->{line};
7586     $self->{column_prev} = $self->{column};
7587     $self->{column}++;
7588     $self->{nc}
7589     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7590     } else {
7591     $self->{set_nc}->($self);
7592     }
7593    
7594     return ($self->{ct}); # ENTITY
7595     redo A;
7596     } else {
7597     $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after literal', ## TODO: type
7598     line => $self->{line_prev},
7599     column => $self->{column_prev} + 1
7600     - length $self->{kwd});
7601     $self->{state} = BOGUS_MD_STATE;
7602     ## Reconsume.
7603     redo A;
7604     }
7605     } elsif ($self->{state} == BEFORE_NOTATION_NAME_STATE) {
7606     if ($is_space->{$self->{nc}}) {
7607     ## Stay in the state.
7608    
7609     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7610     $self->{line_prev} = $self->{line};
7611     $self->{column_prev} = $self->{column};
7612     $self->{column}++;
7613     $self->{nc}
7614     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7615     } else {
7616     $self->{set_nc}->($self);
7617     }
7618    
7619     redo A;
7620     } elsif ($self->{nc} == 0x003E) { # >
7621     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no notation name'); ## TODO: type
7622     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7623    
7624     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7625     $self->{line_prev} = $self->{line};
7626     $self->{column_prev} = $self->{column};
7627     $self->{column}++;
7628     $self->{nc}
7629     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7630     } else {
7631     $self->{set_nc}->($self);
7632     }
7633    
7634     return ($self->{ct}); # ENTITY
7635     redo A;
7636     } elsif ($self->{nc} == -1) {
7637     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7638     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7639    
7640     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7641     $self->{line_prev} = $self->{line};
7642     $self->{column_prev} = $self->{column};
7643     $self->{column}++;
7644     $self->{nc}
7645     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7646     } else {
7647     $self->{set_nc}->($self);
7648     }
7649    
7650     return ($self->{ct}); # ENTITY
7651     redo A;
7652     } else {
7653     $self->{ct}->{notation} = chr $self->{nc}; # ENTITY
7654     $self->{state} = NOTATION_NAME_STATE;
7655    
7656     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7657     $self->{line_prev} = $self->{line};
7658     $self->{column_prev} = $self->{column};
7659     $self->{column}++;
7660     $self->{nc}
7661     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7662     } else {
7663     $self->{set_nc}->($self);
7664     }
7665    
7666     redo A;
7667     }
7668     } elsif ($self->{state} == NOTATION_NAME_STATE) {
7669     if ($is_space->{$self->{nc}}) {
7670     $self->{state} = AFTER_NOTATION_NAME_STATE;
7671    
7672     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7673     $self->{line_prev} = $self->{line};
7674     $self->{column_prev} = $self->{column};
7675     $self->{column}++;
7676     $self->{nc}
7677     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7678     } else {
7679     $self->{set_nc}->($self);
7680     }
7681    
7682     redo A;
7683     } elsif ($self->{nc} == 0x003E) { # >
7684     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7685    
7686     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7687     $self->{line_prev} = $self->{line};
7688     $self->{column_prev} = $self->{column};
7689     $self->{column}++;
7690     $self->{nc}
7691     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7692     } else {
7693     $self->{set_nc}->($self);
7694     }
7695    
7696     return ($self->{ct}); # ENTITY
7697     redo A;
7698     } elsif ($self->{nc} == -1) {
7699     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7700     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7701    
7702     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7703     $self->{line_prev} = $self->{line};
7704     $self->{column_prev} = $self->{column};
7705     $self->{column}++;
7706     $self->{nc}
7707     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7708     } else {
7709     $self->{set_nc}->($self);
7710     }
7711    
7712     return ($self->{ct}); # ENTITY
7713     redo A;
7714     } else {
7715     $self->{ct}->{notation} .= chr $self->{nc}; # ENTITY
7716     ## Stay in the state.
7717    
7718     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7719     $self->{line_prev} = $self->{line};
7720     $self->{column_prev} = $self->{column};
7721     $self->{column}++;
7722     $self->{nc}
7723     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7724     } else {
7725     $self->{set_nc}->($self);
7726     }
7727    
7728     redo A;
7729     }
7730 wakaba 1.19 } elsif ($self->{state} == DOCTYPE_ENTITY_VALUE_DOUBLE_QUOTED_STATE) {
7731     if ($self->{nc} == 0x0022) { # "
7732     $self->{state} = AFTER_NOTATION_NAME_STATE;
7733    
7734     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7735     $self->{line_prev} = $self->{line};
7736     $self->{column_prev} = $self->{column};
7737     $self->{column}++;
7738     $self->{nc}
7739     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7740     } else {
7741     $self->{set_nc}->($self);
7742     }
7743    
7744     redo A;
7745     } elsif ($self->{nc} == 0x0026) { # &
7746     $self->{prev_state} = $self->{state};
7747     $self->{state} = ENTITY_VALUE_ENTITY_STATE;
7748     $self->{entity_add} = 0x0022; # "
7749    
7750     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7751     $self->{line_prev} = $self->{line};
7752     $self->{column_prev} = $self->{column};
7753     $self->{column}++;
7754     $self->{nc}
7755     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7756     } else {
7757     $self->{set_nc}->($self);
7758     }
7759    
7760     redo A;
7761     ## TODO: %
7762     } elsif ($self->{nc} == -1) {
7763     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed entity value'); ## TODO: type
7764     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7765     ## Reconsume.
7766     return ($self->{ct}); # ENTITY
7767     redo A;
7768     } else {
7769     $self->{ct}->{value} .= chr $self->{nc}; # ENTITY
7770    
7771     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7772     $self->{line_prev} = $self->{line};
7773     $self->{column_prev} = $self->{column};
7774     $self->{column}++;
7775     $self->{nc}
7776     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7777     } else {
7778     $self->{set_nc}->($self);
7779     }
7780    
7781     redo A;
7782     }
7783     } elsif ($self->{state} == DOCTYPE_ENTITY_VALUE_SINGLE_QUOTED_STATE) {
7784     if ($self->{nc} == 0x0027) { # '
7785     $self->{state} = AFTER_NOTATION_NAME_STATE;
7786    
7787     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7788     $self->{line_prev} = $self->{line};
7789     $self->{column_prev} = $self->{column};
7790     $self->{column}++;
7791     $self->{nc}
7792     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7793     } else {
7794     $self->{set_nc}->($self);
7795     }
7796    
7797     redo A;
7798     } elsif ($self->{nc} == 0x0026) { # &
7799     $self->{prev_state} = $self->{state};
7800     $self->{state} = ENTITY_VALUE_ENTITY_STATE;
7801     $self->{entity_add} = 0x0027; # '
7802    
7803     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7804     $self->{line_prev} = $self->{line};
7805     $self->{column_prev} = $self->{column};
7806     $self->{column}++;
7807     $self->{nc}
7808     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7809     } else {
7810     $self->{set_nc}->($self);
7811     }
7812    
7813     redo A;
7814     ## TODO: %
7815     } elsif ($self->{nc} == -1) {
7816     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed entity value'); ## TODO: type
7817     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7818     ## Reconsume.
7819     return ($self->{ct}); # ENTITY
7820     redo A;
7821     } else {
7822     $self->{ct}->{value} .= chr $self->{nc}; # ENTITY
7823    
7824     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7825     $self->{line_prev} = $self->{line};
7826     $self->{column_prev} = $self->{column};
7827     $self->{column}++;
7828     $self->{nc}
7829     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7830     } else {
7831     $self->{set_nc}->($self);
7832     }
7833    
7834     redo A;
7835     }
7836     } elsif ($self->{state} == ENTITY_VALUE_ENTITY_STATE) {
7837     ## TODO: XMLize
7838    
7839     if ($is_space->{$self->{nc}} or
7840     {
7841     0x003C => 1, 0x0026 => 1, -1 => 1, # <, &
7842     $self->{entity_add} => 1,
7843     }->{$self->{nc}}) {
7844     ## Don't consume
7845     ## No error
7846     ## Return nothing.
7847     #
7848     } elsif ($self->{nc} == 0x0023) { # #
7849     $self->{ca} = $self->{ct};
7850     $self->{state} = ENTITY_HASH_STATE;
7851     $self->{kwd} = '#';
7852    
7853     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7854     $self->{line_prev} = $self->{line};
7855     $self->{column_prev} = $self->{column};
7856     $self->{column}++;
7857     $self->{nc}
7858     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7859     } else {
7860     $self->{set_nc}->($self);
7861     }
7862    
7863     redo A;
7864     } elsif ((0x0041 <= $self->{nc} and
7865     $self->{nc} <= 0x005A) or # A..Z
7866     (0x0061 <= $self->{nc} and
7867     $self->{nc} <= 0x007A)) { # a..z
7868     #
7869     } else {
7870     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare ero');
7871     ## Return nothing.
7872     #
7873     }
7874    
7875     $self->{ct}->{value} .= '&';
7876     $self->{state} = $self->{prev_state};
7877     ## Reconsume.
7878     redo A;
7879 wakaba 1.18 } elsif ($self->{state} == AFTER_NOTATION_NAME_STATE) {
7880     if ($is_space->{$self->{nc}}) {
7881     ## Stay in the state.
7882    
7883     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7884     $self->{line_prev} = $self->{line};
7885     $self->{column_prev} = $self->{column};
7886     $self->{column}++;
7887     $self->{nc}
7888     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7889     } else {
7890     $self->{set_nc}->($self);
7891     }
7892    
7893     redo A;
7894     } elsif ($self->{nc} == 0x003E) { # >
7895     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7896    
7897     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7898     $self->{line_prev} = $self->{line};
7899     $self->{column_prev} = $self->{column};
7900     $self->{column}++;
7901     $self->{nc}
7902     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7903     } else {
7904     $self->{set_nc}->($self);
7905     }
7906    
7907     return ($self->{ct}); # ENTITY
7908     redo A;
7909     } elsif ($self->{nc} == -1) {
7910     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7911     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7912    
7913     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7914     $self->{line_prev} = $self->{line};
7915     $self->{column_prev} = $self->{column};
7916     $self->{column}++;
7917     $self->{nc}
7918     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7919     } else {
7920     $self->{set_nc}->($self);
7921     }
7922    
7923     return ($self->{ct}); # ENTITY
7924     redo A;
7925     } else {
7926     $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after notation name'); ## TODO: type
7927     $self->{state} = BOGUS_MD_STATE;
7928     ## Reconsume.
7929     redo A;
7930     }
7931 wakaba 1.16 } elsif ($self->{state} == BOGUS_MD_STATE) {
7932     if ($self->{nc} == 0x003E) { # >
7933     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7934    
7935     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7936     $self->{line_prev} = $self->{line};
7937     $self->{column_prev} = $self->{column};
7938     $self->{column}++;
7939     $self->{nc}
7940     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7941     } else {
7942     $self->{set_nc}->($self);
7943     }
7944    
7945     return ($self->{ct}); # ATTLIST/ENTITY/NOTATION
7946     redo A;
7947     } elsif ($self->{nc} == -1) {
7948     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7949     ## Reconsume.
7950     return ($self->{ct}); # ATTLIST/ENTITY/NOTATION
7951     redo A;
7952     } else {
7953     ## Stay in the state.
7954    
7955     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7956     $self->{line_prev} = $self->{line};
7957     $self->{column_prev} = $self->{column};
7958     $self->{column}++;
7959     $self->{nc}
7960     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7961     } else {
7962     $self->{set_nc}->($self);
7963     }
7964    
7965     redo A;
7966     }
7967 wakaba 1.1 } else {
7968     die "$0: $self->{state}: Unknown state";
7969     }
7970     } # A
7971    
7972     die "$0: _get_next_token: unexpected case";
7973     } # _get_next_token
7974    
7975     1;
7976 wakaba 1.19 ## $Date: 2008/10/19 06:14:57 $
7977 wakaba 1.15

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24