/[suikacvs]/markup/html/whatpm/Whatpm/HTML/Tokenizer.pm
Suika

Contents of /markup/html/whatpm/Whatpm/HTML/Tokenizer.pm

Parent Directory | Revision Log


Revision 1.21 - (show annotations) (download)
Sun Oct 19 09:25:21 2008 UTC (17 years, 5 months ago) by wakaba
Branch: MAIN
Changes since 1.20: +63 -16 lines
++ whatpm/t/ChangeLog	19 Oct 2008 09:24:46 -0000
	* XML-Parser.t: "xml/entrefs-1.dat" added.

2008-10-19  Wakaba  <wakaba@suika.fam.cx>

++ whatpm/t/xml/ChangeLog	19 Oct 2008 09:25:15 -0000
	* charrefs-1.dat: New test data added.

	* entrefs-1.dat: New test data file.

	* attlists-1.dat: Test results updated.

2008-10-19  Wakaba  <wakaba@suika.fam.cx>

++ whatpm/Whatpm/HTML/ChangeLog	19 Oct 2008 09:23:24 -0000
	* Tokenizer.pm.src: Make uppercase "&#X" in XML a parse error.
	Remove the limitation of entity name length.  Enable replacement
	of text-only general entities.  Raise a parse error for an
	unparsed entity reference.  Raise a parse error for a general
	entity reference to an undefined entity.

2008-10-19  Wakaba  <wakaba@suika.fam.cx>

++ whatpm/Whatpm/XML/ChangeLog	19 Oct 2008 09:24:32 -0000
	* Parser.pm.src: Define predefined general entities for the
	control of "undeclared entity" error raised by the tokenizer.  Set
	text-only flag to general entities appropriately.

2008-10-19  Wakaba  <wakaba@suika.fam.cx>

package Whatpm::HTML::Tokenizer;
use strict;
our $VERSION=do{my @r=(q$Revision: 1.20 $=~/\d+/g);sprintf "%d."."%02d" x $#r,@r};

## Export set-up.  Nothing is exported by default; the token type
## constants can be imported one by one, or all together through the
## |:token| tag.
BEGIN {
  require Exporter;
  our @ISA;
  push @ISA, 'Exporter';

  ## Names of the token type constants.  The same list feeds both
  ## |@EXPORT_OK| and the |:token| tag.
  my @token_names = qw(
    DOCTYPE_TOKEN
    COMMENT_TOKEN
    START_TAG_TOKEN
    END_TAG_TOKEN
    END_OF_FILE_TOKEN
    CHARACTER_TOKEN
    PI_TOKEN
    ABORT_TOKEN
    END_OF_DOCTYPE_TOKEN
    ATTLIST_TOKEN
    ELEMENT_TOKEN
    GENERAL_ENTITY_TOKEN
    PARAMETER_ENTITY_TOKEN
    NOTATION_TOKEN
  );

  our @EXPORT_OK = @token_names;
  our %EXPORT_TAGS = (token => [@token_names]);
}
45
46 ## NOTE: Differences from the XML5 draft are marked as "XML5:".
47
48 ## Token types
49
## Values of the |type| field of a token.
use constant {
  DOCTYPE_TOKEN          => 1,  ## XML5: No DOCTYPE token.
  COMMENT_TOKEN          => 2,
  START_TAG_TOKEN        => 3,
  END_TAG_TOKEN          => 4,
  END_OF_FILE_TOKEN      => 5,
  CHARACTER_TOKEN        => 6,
  PI_TOKEN               => 7,  ## NOTE: XML only.
  ABORT_TOKEN            => 8,  ## NOTE: For internal processing.
  END_OF_DOCTYPE_TOKEN   => 9,  ## NOTE: XML only.
  ATTLIST_TOKEN          => 10, ## NOTE: XML only.
  ELEMENT_TOKEN          => 11, ## NOTE: XML only.
  GENERAL_ENTITY_TOKEN   => 12, ## NOTE: XML only.
  PARAMETER_ENTITY_TOKEN => 13, ## NOTE: XML only.
  NOTATION_TOKEN         => 14, ## NOTE: XML only.
};
64
65 ## XML5: XML5 has "empty tag token". In this implementation, it is
66 ## represented as a start tag token with $self->{self_closing} flag
67 ## set to true.
68
69 ## XML5: XML5 has "short end tag token". In this implementation, it
70 ## is represented as an end tag token with $token->{tag_name} flag set
71 ## to an empty string.
72
73 package Whatpm::HTML;
74
75 BEGIN { Whatpm::HTML::Tokenizer->import (':token') }
76
77 ## Content model flags
78
## Individual content model flag bits.
use constant {
  CM_ENTITY         => 0b001, # & markup in data
  CM_LIMITED_MARKUP => 0b010, # < markup in data (limited)
  CM_FULL_MARKUP    => 0b100, # < markup in data (any)
};

## Content models, each a combination of the flag bits above.
use constant {
  PLAINTEXT_CONTENT_MODEL => 0,
  CDATA_CONTENT_MODEL     => CM_LIMITED_MARKUP (),
  RCDATA_CONTENT_MODEL    => CM_ENTITY () | CM_LIMITED_MARKUP (),
  PCDATA_CONTENT_MODEL    => CM_ENTITY () | CM_FULL_MARKUP (),
};
87
88 ## Tokenizer states
89
## States used for both HTML and XML.  The numbers 1 and 12 are not
## assigned to any state; the names they belonged to are kept below as
## comments only.
use constant {
  DATA_STATE => 0,
  # ENTITY_DATA_STATE => 1 (commented out; not defined)
  TAG_OPEN_STATE => 2,
  CLOSE_TAG_OPEN_STATE => 3,
  TAG_NAME_STATE => 4,
  BEFORE_ATTRIBUTE_NAME_STATE => 5,
  ATTRIBUTE_NAME_STATE => 6,
  AFTER_ATTRIBUTE_NAME_STATE => 7,
  BEFORE_ATTRIBUTE_VALUE_STATE => 8,
  ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE => 9,
  ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE => 10,
  ATTRIBUTE_VALUE_UNQUOTED_STATE => 11,
  # ENTITY_IN_ATTRIBUTE_VALUE_STATE => 12 (commented out; not defined)
  MARKUP_DECLARATION_OPEN_STATE => 13,
  COMMENT_START_STATE => 14,
  COMMENT_START_DASH_STATE => 15,
  COMMENT_STATE => 16,
  COMMENT_END_STATE => 17,
  COMMENT_END_DASH_STATE => 18,
  BOGUS_COMMENT_STATE => 19,
  DOCTYPE_STATE => 20,
  BEFORE_DOCTYPE_NAME_STATE => 21,
  DOCTYPE_NAME_STATE => 22,
  AFTER_DOCTYPE_NAME_STATE => 23,
  BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE => 24,
  DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE => 25,
  DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE => 26,
  AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE => 27,
  BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE => 28,
  DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE => 29,
  DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE => 30,
  AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE => 31,
  BOGUS_DOCTYPE_STATE => 32,
  AFTER_ATTRIBUTE_VALUE_QUOTED_STATE => 33,
  SELF_CLOSING_START_TAG_STATE => 34,
  CDATA_SECTION_STATE => 35,
  MD_HYPHEN_STATE => 36, # "markup declaration open state" in the spec
  MD_DOCTYPE_STATE => 37, # "markup declaration open state" in the spec
  MD_CDATA_STATE => 38, # "markup declaration open state" in the spec
  CDATA_RCDATA_CLOSE_TAG_STATE => 39, # "close tag open state" in the spec
  CDATA_SECTION_MSE1_STATE => 40, # "CDATA section state" in the spec
  CDATA_SECTION_MSE2_STATE => 41, # "CDATA section state" in the spec
  PUBLIC_STATE => 42, # "after DOCTYPE name state" in the spec
  SYSTEM_STATE => 43, # "after DOCTYPE name state" in the spec
  ## NOTE: "Entity data state", "entity in attribute value state", and
  ## "consume a character reference" algorithm are jointly implemented
  ## using the following six states:
  ENTITY_STATE => 44,
  ENTITY_HASH_STATE => 45,
  NCR_NUM_STATE => 46,
  HEXREF_X_STATE => 47,
  HEXREF_HEX_STATE => 48,
  ENTITY_NAME_STATE => 49,
  PCDATA_STATE => 50, # "data state" in the spec
};

## XML-only states
use constant {
  PI_STATE => 51,
  PI_TARGET_STATE => 52,
  PI_TARGET_AFTER_STATE => 53,
  PI_DATA_STATE => 54,
  PI_AFTER_STATE => 55,
  PI_DATA_AFTER_STATE => 56,
  DOCTYPE_INTERNAL_SUBSET_STATE => 57,
  DOCTYPE_INTERNAL_SUBSET_AFTER_STATE => 58,
  BOGUS_DOCTYPE_INTERNAL_SUBSET_AFTER_STATE => 59,
  DOCTYPE_TAG_STATE => 60,
  DOCTYPE_MARKUP_DECLARATION_OPEN_STATE => 61,
  MD_ATTLIST_STATE => 62,
  MD_E_STATE => 63,
  MD_ELEMENT_STATE => 64,
  MD_ENTITY_STATE => 65,
  MD_NOTATION_STATE => 66,
  DOCTYPE_MD_STATE => 67,
  BEFORE_MD_NAME_STATE => 68,
  MD_NAME_STATE => 69,
  DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE => 70,
  DOCTYPE_ATTLIST_NAME_AFTER_STATE => 71,
  DOCTYPE_ATTLIST_ATTRIBUTE_NAME_STATE => 72,
  DOCTYPE_ATTLIST_ATTRIBUTE_NAME_AFTER_STATE => 73,
  DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_STATE => 74,
  DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_AFTER_STATE => 75,
  BEFORE_ALLOWED_TOKEN_STATE => 76,
  ALLOWED_TOKEN_STATE => 77,
  AFTER_ALLOWED_TOKEN_STATE => 78,
  AFTER_ALLOWED_TOKENS_STATE => 79,
  BEFORE_ATTR_DEFAULT_STATE => 80,
  DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE => 81,
  DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_STATE => 82,
  DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_AFTER_STATE => 83,
  AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE => 84,
  BEFORE_NDATA_STATE => 85,
  NDATA_STATE => 86,
  AFTER_NDATA_STATE => 87,
  BEFORE_NOTATION_NAME_STATE => 88,
  NOTATION_NAME_STATE => 89,
  DOCTYPE_ENTITY_VALUE_DOUBLE_QUOTED_STATE => 90,
  DOCTYPE_ENTITY_VALUE_SINGLE_QUOTED_STATE => 91,
  ENTITY_VALUE_ENTITY_STATE => 92,
  AFTER_ELEMENT_NAME_STATE => 93,
  BEFORE_ELEMENT_CONTENT_STATE => 94,
  CONTENT_KEYWORD_STATE => 95,
  AFTER_CM_GROUP_OPEN_STATE => 96,
  CM_ELEMENT_NAME_STATE => 97,
  AFTER_CM_ELEMENT_NAME_STATE => 98,
  AFTER_CM_GROUP_CLOSE_STATE => 99,
  AFTER_MD_DEF_STATE => 100,
  BOGUS_MD_STATE => 101,
};
197
198 ## Tree constructor state constants (see Whatpm::HTML for the full
199 ## list and descriptions)
200
## Both constants have bit 11 set (value 2048).
sub IN_FOREIGN_CONTENT_IM () { 1 << 11 } # 0b100000000000
sub FOREIGN_EL () { 1 << 11 }            # 0b1_00000000000
203
204 ## Character reference mappings
205
## Character reference mappings: code point => replacement code point.
my $charref_map = {
  0x0D => 0x000A,
};

## 0x80..0x9F, one replacement per code point, in ascending order.
@{$charref_map}{0x80 .. 0x9F} = (
  0x20AC, 0xFFFD, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021, # 80..87
  0x02C6, 0x2030, 0x0160, 0x2039, 0x0152, 0xFFFD, 0x017D, 0xFFFD, # 88..8F
  0xFFFD, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014, # 90..97
  0x02DC, 0x2122, 0x0161, 0x203A, 0x0153, 0xFFFD, 0x017E, 0x0178, # 98..9F
);

## Code points that are all mapped to U+FFFD.
$charref_map->{$_} = 0xFFFD
    for 0x0000 .. 0x0008, 0x000B, 0x000E .. 0x001F, 0x007F,
        0xD800 .. 0xDFFF,
        0xFDD0 .. 0xFDDF, ## ISSUE: 0xFDEF
        ## U+xxFFFE and U+xxFFFF of each of the planes 0..16.
        (map { (($_ << 16) | 0xFFFE, ($_ << 16) | 0xFFFF) } 0x00 .. 0x10);
249
250 ## Implementations MUST act as if they used the state machine described in the spec.
251
## Resets the tokenizer fields of |$self| to their initial values and
## then asks the |set_nc| callback for the first input character.
##
## Argument: |$self|, the object holding the tokenizer state as hash
## fields.
##
## Returns the new (empty) array reference stored in |$self->{token}|,
## since that assignment is the last statement.
##
## NOTE: Fields set by |new| constructor:
##   $self->{level}
##   $self->{set_nc}
##   $self->{parse_error}
##   $self->{is_xml} (if XML)
sub _initialize_tokenizer ($) {
  my $self = shift;

  $self->{state} = DATA_STATE; # MUST
  $self->{s_kwd} = ''; # Data state keyword
  #$self->{kwd} = ''; # State-dependent keyword; initialized when used
  #$self->{entity__value}; # initialized when used
  #$self->{entity__match}; # initialized when used
  $self->{content_model} = PCDATA_CONTENT_MODEL; # initial content model
  undef $self->{ct}; # current token
  undef $self->{ca}; # current attribute
  undef $self->{last_stag_name}; # last emitted start tag name
  #$self->{prev_state}; # initialized when used
  delete $self->{self_closing};
  $self->{char_buffer} = '';
  $self->{char_buffer_pos} = 0;
  $self->{nc} = -1; # next input character
  #$self->{next_nc}

  ## The character buffer has just been emptied (position 0, length 0),
  ## so there is never a buffered character to take here; the first
  ## input character always has to be requested from the |set_nc|
  ## callback.  (Presumably the callback stores it in |$self->{nc}|;
  ## confirm against the code that installs |set_nc|.)
  $self->{set_nc}->($self);

  $self->{token} = [];
  # $self->{escape}
} # _initialize_tokenizer
290
291 ## A token has:
292 ## ->{type} == DOCTYPE_TOKEN, START_TAG_TOKEN, END_TAG_TOKEN, COMMENT_TOKEN,
293 ## CHARACTER_TOKEN, END_OF_FILE_TOKEN, PI_TOKEN, or ABORT_TOKEN
294 ## ->{name} (DOCTYPE_TOKEN)
295 ## ->{tag_name} (START_TAG_TOKEN, END_TAG_TOKEN)
296 ## ->{target} (PI_TOKEN)
297 ## ->{pubid} (DOCTYPE_TOKEN)
298 ## ->{sysid} (DOCTYPE_TOKEN)
299 ## ->{quirks} == 1 or 0 (DOCTYPE_TOKEN): "force-quirks" flag
300 ## ->{attributes} isa HASH (START_TAG_TOKEN, END_TAG_TOKEN)
301 ## ->{name}
302 ## ->{value}
303 ## ->{has_reference} == 1 or 0
304 ## ->{index}: Index of the attribute in a tag.
305 ## ->{data} (COMMENT_TOKEN, CHARACTER_TOKEN, PI_TOKEN)
306 ## ->{has_reference} == 1 or 0 (CHARACTER_TOKEN)
307 ## ->{last_index} (ELEMENT_TOKEN): Next attribute's index - 1.
308 ## ->{has_internal_subset} = 1 or 0 (DOCTYPE_TOKEN)
309
310 ## NOTE: The "self-closing flag" is held as |$self->{self_closing}|.
311 ## A token's own |->{self_closing}| field is used to save the value of
312 ## |$self->{self_closing}| while the token is pushed back to the stack.
313
314 ## Emitted token MUST immediately be handled by the tree construction state.
315
316 ## Before each step, UA MAY check to see if either one of the scripts in
317 ## "list of scripts that will execute as soon as possible" or the first
318 ## script in the "list of scripts that will execute asynchronously",
319 ## has completed loading. If one has, then it MUST be executed
320 ## and removed from the list.
321
322 ## TODO: Polytheistic slash SHOULD NOT be used. (Applied only to atheists.)
323 ## (This requirement was dropped from HTML5 spec, unfortunately.)
324
## Lookup table of the code points treated as space characters: the
## listed code points map to 1, everything else is absent.
my $is_space = {
  map { ($_ => 1) }
      0x0009, # CHARACTER TABULATION (HT)
      0x000A, # LINE FEED (LF)
      ## 0x000B LINE TABULATION (VT) is not included.
      0x000C, # FORM FEED (FF) ## XML5: Not a space character.
      ## 0x000D CARRIAGE RETURN (CR) is not included.
      0x0020, # SPACE (SP)
};
333
334 sub _get_next_token ($) {
335 my $self = shift;
336
337 if ($self->{self_closing}) {
338 $self->{parse_error}->(level => $self->{level}->{must}, type => 'nestc', token => $self->{ct});
339 ## NOTE: The |self_closing| flag is only set by start tag token.
340 ## In addition, when a start tag token is emitted, it is always set to
341 ## |ct|.
342 delete $self->{self_closing};
343 }
344
345 if (@{$self->{token}}) {
346 $self->{self_closing} = $self->{token}->[0]->{self_closing};
347 return shift @{$self->{token}};
348 }
349
350 A: {
351 if ($self->{state} == PCDATA_STATE) {
352 ## NOTE: Same as |DATA_STATE|, but only for |PCDATA| content model.
353
354 if ($self->{nc} == 0x0026) { # &
355
356 ## NOTE: In the spec, the tokenizer is switched to the
357 ## "entity data state". In this implementation, the tokenizer
358 ## is switched to the |ENTITY_STATE|, which is an implementation
359 ## of the "consume a character reference" algorithm.
360 $self->{entity_add} = -1;
361 $self->{prev_state} = DATA_STATE;
362 $self->{state} = ENTITY_STATE;
363
364 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
365 $self->{line_prev} = $self->{line};
366 $self->{column_prev} = $self->{column};
367 $self->{column}++;
368 $self->{nc}
369 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
370 } else {
371 $self->{set_nc}->($self);
372 }
373
374 redo A;
375 } elsif ($self->{nc} == 0x003C) { # <
376
377 $self->{state} = TAG_OPEN_STATE;
378
379 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
380 $self->{line_prev} = $self->{line};
381 $self->{column_prev} = $self->{column};
382 $self->{column}++;
383 $self->{nc}
384 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
385 } else {
386 $self->{set_nc}->($self);
387 }
388
389 redo A;
390 } elsif ($self->{nc} == -1) {
391
392 return ({type => END_OF_FILE_TOKEN,
393 line => $self->{line}, column => $self->{column}});
394 last A; ## TODO: ok?
395 } else {
396
397 #
398 }
399
400 # Anything else
401 my $token = {type => CHARACTER_TOKEN,
402 data => chr $self->{nc},
403 line => $self->{line}, column => $self->{column},
404 };
405 $self->{read_until}->($token->{data}, q[<&], length $token->{data});
406
407 ## Stay in the state.
408
409 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
410 $self->{line_prev} = $self->{line};
411 $self->{column_prev} = $self->{column};
412 $self->{column}++;
413 $self->{nc}
414 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
415 } else {
416 $self->{set_nc}->($self);
417 }
418
419 return ($token);
420 redo A;
421 } elsif ($self->{state} == DATA_STATE) {
422 $self->{s_kwd} = '' unless defined $self->{s_kwd};
423 if ($self->{nc} == 0x0026) { # &
424 $self->{s_kwd} = '';
425 if ($self->{content_model} & CM_ENTITY and # PCDATA | RCDATA
426 not $self->{escape}) {
427
428 ## NOTE: In the spec, the tokenizer is switched to the
429 ## "entity data state". In this implementation, the tokenizer
430 ## is switched to the |ENTITY_STATE|, which is an implementation
431 ## of the "consume a character reference" algorithm.
432 $self->{entity_add} = -1;
433 $self->{prev_state} = DATA_STATE;
434 $self->{state} = ENTITY_STATE;
435
436 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
437 $self->{line_prev} = $self->{line};
438 $self->{column_prev} = $self->{column};
439 $self->{column}++;
440 $self->{nc}
441 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
442 } else {
443 $self->{set_nc}->($self);
444 }
445
446 redo A;
447 } else {
448
449 #
450 }
451 } elsif ($self->{nc} == 0x002D) { # -
452 if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
453 if ($self->{s_kwd} eq '<!-') {
454
455 $self->{escape} = 1; # unless $self->{escape};
456 $self->{s_kwd} = '--';
457 #
458 } elsif ($self->{s_kwd} eq '-') {
459
460 $self->{s_kwd} = '--';
461 #
462 } elsif ($self->{s_kwd} eq '<!' or $self->{s_kwd} eq '-') {
463
464 $self->{s_kwd} .= '-';
465 #
466 } else {
467
468 $self->{s_kwd} = '-';
469 #
470 }
471 }
472
473 #
474 } elsif ($self->{nc} == 0x0021) { # !
475 if (length $self->{s_kwd}) {
476
477 $self->{s_kwd} .= '!';
478 #
479 } else {
480
481 #$self->{s_kwd} = '';
482 #
483 }
484 #
485 } elsif ($self->{nc} == 0x003C) { # <
486 if ($self->{content_model} & CM_FULL_MARKUP or # PCDATA
487 (($self->{content_model} & CM_LIMITED_MARKUP) and # CDATA | RCDATA
488 not $self->{escape})) {
489
490 $self->{state} = TAG_OPEN_STATE;
491
492 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
493 $self->{line_prev} = $self->{line};
494 $self->{column_prev} = $self->{column};
495 $self->{column}++;
496 $self->{nc}
497 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
498 } else {
499 $self->{set_nc}->($self);
500 }
501
502 redo A;
503 } else {
504
505 $self->{s_kwd} = '';
506 #
507 }
508 } elsif ($self->{nc} == 0x003E) { # >
509 if ($self->{escape} and
510 ($self->{content_model} & CM_LIMITED_MARKUP)) { # RCDATA | CDATA
511 if ($self->{s_kwd} eq '--') {
512
513 delete $self->{escape};
514 #
515 } else {
516
517 #
518 }
519 } elsif ($self->{is_xml} and $self->{s_kwd} eq ']]') {
520
521 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unmatched mse', ## TODO: type
522 line => $self->{line_prev},
523 column => $self->{column_prev} - 1);
524 #
525 } else {
526
527 #
528 }
529
530 $self->{s_kwd} = '';
531 #
532 } elsif ($self->{nc} == 0x005D) { # ]
533 if ($self->{s_kwd} eq ']' or $self->{s_kwd} eq '') {
534
535 $self->{s_kwd} .= ']';
536 } elsif ($self->{s_kwd} eq ']]') {
537
538 #
539 } else {
540
541 $self->{s_kwd} = '';
542 }
543 #
544 } elsif ($self->{nc} == -1) {
545
546 $self->{s_kwd} = '';
547 return ({type => END_OF_FILE_TOKEN,
548 line => $self->{line}, column => $self->{column}});
549 last A; ## TODO: ok?
550 } else {
551
552 $self->{s_kwd} = '';
553 #
554 }
555
556 # Anything else
557 my $token = {type => CHARACTER_TOKEN,
558 data => chr $self->{nc},
559 line => $self->{line}, column => $self->{column},
560 };
561 if ($self->{read_until}->($token->{data}, q{-!<>&\]},
562 length $token->{data})) {
563 $self->{s_kwd} = '';
564 }
565
566 ## Stay in the data state.
567 if (not $self->{is_xml} and
568 $self->{content_model} == PCDATA_CONTENT_MODEL) {
569
570 $self->{state} = PCDATA_STATE;
571 } else {
572
573 ## Stay in the state.
574 }
575
576 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
577 $self->{line_prev} = $self->{line};
578 $self->{column_prev} = $self->{column};
579 $self->{column}++;
580 $self->{nc}
581 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
582 } else {
583 $self->{set_nc}->($self);
584 }
585
586 return ($token);
587 redo A;
588 } elsif ($self->{state} == TAG_OPEN_STATE) {
589 ## XML5: "tag state".
590
591 if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
592 if ($self->{nc} == 0x002F) { # /
593
594
595 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
596 $self->{line_prev} = $self->{line};
597 $self->{column_prev} = $self->{column};
598 $self->{column}++;
599 $self->{nc}
600 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
601 } else {
602 $self->{set_nc}->($self);
603 }
604
605 $self->{state} = CLOSE_TAG_OPEN_STATE;
606 redo A;
607 } elsif ($self->{nc} == 0x0021) { # !
608
609 $self->{s_kwd} = $self->{escaped} ? '' : '<';
610 #
611 } else {
612
613 $self->{s_kwd} = '';
614 #
615 }
616
617 ## reconsume
618 $self->{state} = DATA_STATE;
619 return ({type => CHARACTER_TOKEN, data => '<',
620 line => $self->{line_prev},
621 column => $self->{column_prev},
622 });
623 redo A;
624 } elsif ($self->{content_model} & CM_FULL_MARKUP) { # PCDATA
625 if ($self->{nc} == 0x0021) { # !
626
627 $self->{state} = MARKUP_DECLARATION_OPEN_STATE;
628
629 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
630 $self->{line_prev} = $self->{line};
631 $self->{column_prev} = $self->{column};
632 $self->{column}++;
633 $self->{nc}
634 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
635 } else {
636 $self->{set_nc}->($self);
637 }
638
639 redo A;
640 } elsif ($self->{nc} == 0x002F) { # /
641
642 $self->{state} = CLOSE_TAG_OPEN_STATE;
643
644 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
645 $self->{line_prev} = $self->{line};
646 $self->{column_prev} = $self->{column};
647 $self->{column}++;
648 $self->{nc}
649 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
650 } else {
651 $self->{set_nc}->($self);
652 }
653
654 redo A;
655 } elsif (0x0041 <= $self->{nc} and
656 $self->{nc} <= 0x005A) { # A..Z
657
658 $self->{ct}
659 = {type => START_TAG_TOKEN,
660 tag_name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
661 line => $self->{line_prev},
662 column => $self->{column_prev}};
663 $self->{state} = TAG_NAME_STATE;
664
665 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
666 $self->{line_prev} = $self->{line};
667 $self->{column_prev} = $self->{column};
668 $self->{column}++;
669 $self->{nc}
670 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
671 } else {
672 $self->{set_nc}->($self);
673 }
674
675 redo A;
676 } elsif (0x0061 <= $self->{nc} and
677 $self->{nc} <= 0x007A) { # a..z
678
679 $self->{ct} = {type => START_TAG_TOKEN,
680 tag_name => chr ($self->{nc}),
681 line => $self->{line_prev},
682 column => $self->{column_prev}};
683 $self->{state} = TAG_NAME_STATE;
684
685 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
686 $self->{line_prev} = $self->{line};
687 $self->{column_prev} = $self->{column};
688 $self->{column}++;
689 $self->{nc}
690 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
691 } else {
692 $self->{set_nc}->($self);
693 }
694
695 redo A;
696 } elsif ($self->{nc} == 0x003E) { # >
697
698 $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty start tag',
699 line => $self->{line_prev},
700 column => $self->{column_prev});
701 $self->{state} = DATA_STATE;
702 $self->{s_kwd} = '';
703
704 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
705 $self->{line_prev} = $self->{line};
706 $self->{column_prev} = $self->{column};
707 $self->{column}++;
708 $self->{nc}
709 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
710 } else {
711 $self->{set_nc}->($self);
712 }
713
714
715 return ({type => CHARACTER_TOKEN, data => '<>',
716 line => $self->{line_prev},
717 column => $self->{column_prev},
718 });
719
720 redo A;
721 } elsif ($self->{nc} == 0x003F) { # ?
722 if ($self->{is_xml}) {
723
724 $self->{state} = PI_STATE;
725
726 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
727 $self->{line_prev} = $self->{line};
728 $self->{column_prev} = $self->{column};
729 $self->{column}++;
730 $self->{nc}
731 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
732 } else {
733 $self->{set_nc}->($self);
734 }
735
736 redo A;
737 } else {
738
739 $self->{parse_error}->(level => $self->{level}->{must}, type => 'pio',
740 line => $self->{line_prev},
741 column => $self->{column_prev});
742 $self->{state} = BOGUS_COMMENT_STATE;
743 $self->{ct} = {type => COMMENT_TOKEN, data => '',
744 line => $self->{line_prev},
745 column => $self->{column_prev},
746 };
747 ## $self->{nc} is intentionally left as is
748 redo A;
749 }
750 } elsif (not $self->{is_xml} or $is_space->{$self->{nc}}) {
751
752 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare stago',
753 line => $self->{line_prev},
754 column => $self->{column_prev});
755 $self->{state} = DATA_STATE;
756 $self->{s_kwd} = '';
757 ## reconsume
758
759 return ({type => CHARACTER_TOKEN, data => '<',
760 line => $self->{line_prev},
761 column => $self->{column_prev},
762 });
763
764 redo A;
765 } else {
766 ## XML5: "<:" is a parse error.
767
768 $self->{ct} = {type => START_TAG_TOKEN,
769 tag_name => chr ($self->{nc}),
770 line => $self->{line_prev},
771 column => $self->{column_prev}};
772 $self->{state} = TAG_NAME_STATE;
773
774 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
775 $self->{line_prev} = $self->{line};
776 $self->{column_prev} = $self->{column};
777 $self->{column}++;
778 $self->{nc}
779 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
780 } else {
781 $self->{set_nc}->($self);
782 }
783
784 redo A;
785 }
786 } else {
787 die "$0: $self->{content_model} in tag open";
788 }
789 } elsif ($self->{state} == CLOSE_TAG_OPEN_STATE) {
790 ## NOTE: The "close tag open state" in the spec is implemented as
791 ## |CLOSE_TAG_OPEN_STATE| and |CDATA_RCDATA_CLOSE_TAG_STATE|.
792
793 ## XML5: "end tag state".
794
795 my ($l, $c) = ($self->{line_prev}, $self->{column_prev} - 1); # "<"of"</"
796 if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
797 if (defined $self->{last_stag_name}) {
798 $self->{state} = CDATA_RCDATA_CLOSE_TAG_STATE;
799 $self->{kwd} = '';
800 ## Reconsume.
801 redo A;
802 } else {
803 ## No start tag token has ever been emitted
804 ## NOTE: See <http://krijnhoetmer.nl/irc-logs/whatwg/20070626#l-564>.
805
806 $self->{state} = DATA_STATE;
807 $self->{s_kwd} = '';
808 ## Reconsume.
809 return ({type => CHARACTER_TOKEN, data => '</',
810 line => $l, column => $c,
811 });
812 redo A;
813 }
814 }
815
816 if (0x0041 <= $self->{nc} and
817 $self->{nc} <= 0x005A) { # A..Z
818
819 $self->{ct}
820 = {type => END_TAG_TOKEN,
821 tag_name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
822 line => $l, column => $c};
823 $self->{state} = TAG_NAME_STATE;
824
825 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
826 $self->{line_prev} = $self->{line};
827 $self->{column_prev} = $self->{column};
828 $self->{column}++;
829 $self->{nc}
830 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
831 } else {
832 $self->{set_nc}->($self);
833 }
834
835 redo A;
836 } elsif (0x0061 <= $self->{nc} and
837 $self->{nc} <= 0x007A) { # a..z
838
839 $self->{ct} = {type => END_TAG_TOKEN,
840 tag_name => chr ($self->{nc}),
841 line => $l, column => $c};
842 $self->{state} = TAG_NAME_STATE;
843
844 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
845 $self->{line_prev} = $self->{line};
846 $self->{column_prev} = $self->{column};
847 $self->{column}++;
848 $self->{nc}
849 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
850 } else {
851 $self->{set_nc}->($self);
852 }
853
854 redo A;
855 } elsif ($self->{nc} == 0x003E) { # >
856 $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty end tag',
857 line => $self->{line_prev}, ## "<" in "</>"
858 column => $self->{column_prev} - 1);
859 $self->{state} = DATA_STATE;
860 $self->{s_kwd} = '';
861 if ($self->{is_xml}) {
862
863 ## XML5: No parse error.
864
865 ## NOTE: This parser raises a parse error, since it supports
866 ## XML1, not XML5.
867
868 ## NOTE: A short end tag token.
869 my $ct = {type => END_TAG_TOKEN,
870 tag_name => '',
871 line => $self->{line_prev},
872 column => $self->{column_prev} - 1,
873 };
874
875 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
876 $self->{line_prev} = $self->{line};
877 $self->{column_prev} = $self->{column};
878 $self->{column}++;
879 $self->{nc}
880 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
881 } else {
882 $self->{set_nc}->($self);
883 }
884
885 return ($ct);
886 } else {
887
888
889 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
890 $self->{line_prev} = $self->{line};
891 $self->{column_prev} = $self->{column};
892 $self->{column}++;
893 $self->{nc}
894 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
895 } else {
896 $self->{set_nc}->($self);
897 }
898
899 }
900 redo A;
901 } elsif ($self->{nc} == -1) {
902
903 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare etago');
904 $self->{s_kwd} = '';
905 $self->{state} = DATA_STATE;
906 # reconsume
907
908 return ({type => CHARACTER_TOKEN, data => '</',
909 line => $l, column => $c,
910 });
911
912 redo A;
913 } elsif (not $self->{is_xml} or
914 $is_space->{$self->{nc}}) {
915
916 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus end tag',
917 line => $self->{line_prev}, # "<" of "</"
918 column => $self->{column_prev} - 1);
919 $self->{state} = BOGUS_COMMENT_STATE;
920 $self->{ct} = {type => COMMENT_TOKEN, data => '',
921 line => $self->{line_prev}, # "<" of "</"
922 column => $self->{column_prev} - 1,
923 };
924 ## NOTE: $self->{nc} is intentionally left as is.
925 ## Although the "anything else" case of the spec not explicitly
926 ## states that the next input character is to be reconsumed,
927 ## it will be included to the |data| of the comment token
928 ## generated from the bogus end tag, as defined in the
929 ## "bogus comment state" entry.
930 redo A;
931 } else {
932 ## XML5: "</:" is a parse error.
933
934 $self->{ct} = {type => END_TAG_TOKEN,
935 tag_name => chr ($self->{nc}),
936 line => $l, column => $c};
937 $self->{state} = TAG_NAME_STATE; ## XML5: "end tag name state".
938
939 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
940 $self->{line_prev} = $self->{line};
941 $self->{column_prev} = $self->{column};
942 $self->{column}++;
943 $self->{nc}
944 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
945 } else {
946 $self->{set_nc}->($self);
947 }
948
949 redo A;
950 }
951 } elsif ($self->{state} == CDATA_RCDATA_CLOSE_TAG_STATE) {
952 my $ch = substr $self->{last_stag_name}, length $self->{kwd}, 1;
953 if (length $ch) {
954 my $CH = $ch;
955 $ch =~ tr/a-z/A-Z/;
956 my $nch = chr $self->{nc};
957 if ($nch eq $ch or $nch eq $CH) {
958
959 ## Stay in the state.
960 $self->{kwd} .= $nch;
961
962 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
963 $self->{line_prev} = $self->{line};
964 $self->{column_prev} = $self->{column};
965 $self->{column}++;
966 $self->{nc}
967 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
968 } else {
969 $self->{set_nc}->($self);
970 }
971
972 redo A;
973 } else {
974
975 $self->{state} = DATA_STATE;
976 $self->{s_kwd} = '';
977 ## Reconsume.
978 return ({type => CHARACTER_TOKEN,
979 data => '</' . $self->{kwd},
980 line => $self->{line_prev},
981 column => $self->{column_prev} - 1 - length $self->{kwd},
982 });
983 redo A;
984 }
985 } else { # after "<{tag-name}"
986 unless ($is_space->{$self->{nc}} or
987 {
988 0x003E => 1, # >
989 0x002F => 1, # /
990 -1 => 1, # EOF
991 }->{$self->{nc}}) {
992
993 ## Reconsume.
994 $self->{state} = DATA_STATE;
995 $self->{s_kwd} = '';
996 return ({type => CHARACTER_TOKEN,
997 data => '</' . $self->{kwd},
998 line => $self->{line_prev},
999 column => $self->{column_prev} - 1 - length $self->{kwd},
1000 });
1001 redo A;
1002 } else {
1003
1004 $self->{ct}
1005 = {type => END_TAG_TOKEN,
1006 tag_name => $self->{last_stag_name},
1007 line => $self->{line_prev},
1008 column => $self->{column_prev} - 1 - length $self->{kwd}};
1009 $self->{state} = TAG_NAME_STATE;
1010 ## Reconsume.
1011 redo A;
1012 }
1013 }
1014 } elsif ($self->{state} == TAG_NAME_STATE) {
1015 if ($is_space->{$self->{nc}}) {
1016
1017 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1018
1019 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1020 $self->{line_prev} = $self->{line};
1021 $self->{column_prev} = $self->{column};
1022 $self->{column}++;
1023 $self->{nc}
1024 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1025 } else {
1026 $self->{set_nc}->($self);
1027 }
1028
1029 redo A;
1030 } elsif ($self->{nc} == 0x003E) { # >
1031 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1032
1033 $self->{last_stag_name} = $self->{ct}->{tag_name};
1034 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1035 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1036 #if ($self->{ct}->{attributes}) {
1037 # ## NOTE: This should never be reached.
1038 # !!! cp (36);
1039 # !!! parse-error (type => 'end tag attribute');
1040 #} else {
1041
1042 #}
1043 } else {
1044 die "$0: $self->{ct}->{type}: Unknown token type";
1045 }
1046 $self->{state} = DATA_STATE;
1047 $self->{s_kwd} = '';
1048
1049 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1050 $self->{line_prev} = $self->{line};
1051 $self->{column_prev} = $self->{column};
1052 $self->{column}++;
1053 $self->{nc}
1054 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1055 } else {
1056 $self->{set_nc}->($self);
1057 }
1058
1059
1060 return ($self->{ct}); # start tag or end tag
1061
1062 redo A;
1063 } elsif (0x0041 <= $self->{nc} and
1064 $self->{nc} <= 0x005A) { # A..Z
1065
1066 $self->{ct}->{tag_name}
1067 .= chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020));
1068 # start tag or end tag
1069 ## Stay in this state
1070
1071 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1072 $self->{line_prev} = $self->{line};
1073 $self->{column_prev} = $self->{column};
1074 $self->{column}++;
1075 $self->{nc}
1076 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1077 } else {
1078 $self->{set_nc}->($self);
1079 }
1080
1081 redo A;
1082 } elsif ($self->{nc} == -1) {
1083 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1084 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1085
1086 $self->{last_stag_name} = $self->{ct}->{tag_name};
1087 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1088 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1089 #if ($self->{ct}->{attributes}) {
1090 # ## NOTE: This state should never be reached.
1091 # !!! cp (40);
1092 # !!! parse-error (type => 'end tag attribute');
1093 #} else {
1094
1095 #}
1096 } else {
1097 die "$0: $self->{ct}->{type}: Unknown token type";
1098 }
1099 $self->{state} = DATA_STATE;
1100 $self->{s_kwd} = '';
1101 # reconsume
1102
1103 return ($self->{ct}); # start tag or end tag
1104
1105 redo A;
1106 } elsif ($self->{nc} == 0x002F) { # /
1107
1108 $self->{state} = SELF_CLOSING_START_TAG_STATE;
1109
1110 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1111 $self->{line_prev} = $self->{line};
1112 $self->{column_prev} = $self->{column};
1113 $self->{column}++;
1114 $self->{nc}
1115 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1116 } else {
1117 $self->{set_nc}->($self);
1118 }
1119
1120 redo A;
1121 } else {
1122
1123 $self->{ct}->{tag_name} .= chr $self->{nc};
1124 # start tag or end tag
1125 ## Stay in the state
1126
1127 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1128 $self->{line_prev} = $self->{line};
1129 $self->{column_prev} = $self->{column};
1130 $self->{column}++;
1131 $self->{nc}
1132 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1133 } else {
1134 $self->{set_nc}->($self);
1135 }
1136
1137 redo A;
1138 }
1139 } elsif ($self->{state} == BEFORE_ATTRIBUTE_NAME_STATE) {
1140 ## XML5: "Tag attribute name before state".
1141
1142 if ($is_space->{$self->{nc}}) {
1143
1144 ## Stay in the state
1145
1146 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1147 $self->{line_prev} = $self->{line};
1148 $self->{column_prev} = $self->{column};
1149 $self->{column}++;
1150 $self->{nc}
1151 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1152 } else {
1153 $self->{set_nc}->($self);
1154 }
1155
1156 redo A;
1157 } elsif ($self->{nc} == 0x003E) { # >
1158 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1159
1160 $self->{last_stag_name} = $self->{ct}->{tag_name};
1161 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1162 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1163 if ($self->{ct}->{attributes}) {
1164
1165 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1166 } else {
1167
1168 }
1169 } else {
1170 die "$0: $self->{ct}->{type}: Unknown token type";
1171 }
1172 $self->{state} = DATA_STATE;
1173 $self->{s_kwd} = '';
1174
1175 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1176 $self->{line_prev} = $self->{line};
1177 $self->{column_prev} = $self->{column};
1178 $self->{column}++;
1179 $self->{nc}
1180 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1181 } else {
1182 $self->{set_nc}->($self);
1183 }
1184
1185
1186 return ($self->{ct}); # start tag or end tag
1187
1188 redo A;
1189 } elsif (0x0041 <= $self->{nc} and
1190 $self->{nc} <= 0x005A) { # A..Z
1191
1192 $self->{ca}
1193 = {name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
1194 value => '',
1195 line => $self->{line}, column => $self->{column}};
1196 $self->{state} = ATTRIBUTE_NAME_STATE;
1197
1198 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1199 $self->{line_prev} = $self->{line};
1200 $self->{column_prev} = $self->{column};
1201 $self->{column}++;
1202 $self->{nc}
1203 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1204 } else {
1205 $self->{set_nc}->($self);
1206 }
1207
1208 redo A;
1209 } elsif ($self->{nc} == 0x002F) { # /
1210
1211 $self->{state} = SELF_CLOSING_START_TAG_STATE;
1212
1213 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1214 $self->{line_prev} = $self->{line};
1215 $self->{column_prev} = $self->{column};
1216 $self->{column}++;
1217 $self->{nc}
1218 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1219 } else {
1220 $self->{set_nc}->($self);
1221 }
1222
1223 redo A;
1224 } elsif ($self->{nc} == -1) {
1225 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1226 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1227
1228 $self->{last_stag_name} = $self->{ct}->{tag_name};
1229 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1230 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1231 if ($self->{ct}->{attributes}) {
1232
1233 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1234 } else {
1235
1236 }
1237 } else {
1238 die "$0: $self->{ct}->{type}: Unknown token type";
1239 }
1240 $self->{state} = DATA_STATE;
1241 $self->{s_kwd} = '';
1242 # reconsume
1243
1244 return ($self->{ct}); # start tag or end tag
1245
1246 redo A;
1247 } else {
1248 if ({
1249 0x0022 => 1, # "
1250 0x0027 => 1, # '
1251 0x003D => 1, # =
1252 }->{$self->{nc}}) {
1253
1254 ## XML5: Not a parse error.
1255 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name');
1256 } else {
1257
1258 ## XML5: ":" raises a parse error and is ignored.
1259 }
1260 $self->{ca}
1261 = {name => chr ($self->{nc}),
1262 value => '',
1263 line => $self->{line}, column => $self->{column}};
1264 $self->{state} = ATTRIBUTE_NAME_STATE;
1265
1266 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1267 $self->{line_prev} = $self->{line};
1268 $self->{column_prev} = $self->{column};
1269 $self->{column}++;
1270 $self->{nc}
1271 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1272 } else {
1273 $self->{set_nc}->($self);
1274 }
1275
1276 redo A;
1277 }
1278 } elsif ($self->{state} == ATTRIBUTE_NAME_STATE) {
1279 ## XML5: "Tag attribute name state".
1280
1281 my $before_leave = sub { ## Commits the attribute being built ($self->{ca}) to the current tag token; called by each branch below that leaves this state.
1282 if (exists $self->{ct}->{attributes} # start tag or end tag
1283 ->{$self->{ca}->{name}}) { # MUST
1284 ## Duplicate name: report the error at the line/column recorded in $self->{ca}; the attribute stored earlier is kept as is.
1285 $self->{parse_error}->(level => $self->{level}->{must}, type => 'duplicate attribute', text => $self->{ca}->{name}, line => $self->{ca}->{line}, column => $self->{ca}->{column});
1286 ## Discard $self->{ca} # MUST
1287 } else {
1288 ## New name: store the attribute under its name and number it with the token's running |last_index| counter.
1289 $self->{ct}->{attributes}->{$self->{ca}->{name}}
1290 = $self->{ca};
1291 $self->{ca}->{index} = ++$self->{ct}->{last_index};
1292 }
1293 }; # $before_leave -- NOTE: the nested |exists| test autovivifies $self->{ct}->{attributes}, so that hash is defined after any call.
1294
1295 if ($is_space->{$self->{nc}}) {
1296
1297 $before_leave->();
1298 $self->{state} = AFTER_ATTRIBUTE_NAME_STATE;
1299
1300 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1301 $self->{line_prev} = $self->{line};
1302 $self->{column_prev} = $self->{column};
1303 $self->{column}++;
1304 $self->{nc}
1305 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1306 } else {
1307 $self->{set_nc}->($self);
1308 }
1309
1310 redo A;
1311 } elsif ($self->{nc} == 0x003D) { # =
1312
1313 $before_leave->();
1314 $self->{state} = BEFORE_ATTRIBUTE_VALUE_STATE;
1315
1316 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1317 $self->{line_prev} = $self->{line};
1318 $self->{column_prev} = $self->{column};
1319 $self->{column}++;
1320 $self->{nc}
1321 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1322 } else {
1323 $self->{set_nc}->($self);
1324 }
1325
1326 redo A;
1327 } elsif ($self->{nc} == 0x003E) { # >
1328 if ($self->{is_xml}) {
1329
1330 ## XML5: Not a parse error.
1331 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr value'); ## TODO: type
1332 } else {
1333
1334 }
1335
1336 $before_leave->();
1337 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1338
1339 $self->{last_stag_name} = $self->{ct}->{tag_name};
1340 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1341
1342 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1343 if ($self->{ct}->{attributes}) {
1344 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1345 }
1346 } else {
1347 die "$0: $self->{ct}->{type}: Unknown token type";
1348 }
1349 $self->{state} = DATA_STATE;
1350 $self->{s_kwd} = '';
1351
1352 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1353 $self->{line_prev} = $self->{line};
1354 $self->{column_prev} = $self->{column};
1355 $self->{column}++;
1356 $self->{nc}
1357 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1358 } else {
1359 $self->{set_nc}->($self);
1360 }
1361
1362
1363 return ($self->{ct}); # start tag or end tag
1364
1365 redo A;
1366 } elsif (0x0041 <= $self->{nc} and
1367 $self->{nc} <= 0x005A) { # A..Z
1368
1369 $self->{ca}->{name}
1370 .= chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020));
1371 ## Stay in the state
1372
1373 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1374 $self->{line_prev} = $self->{line};
1375 $self->{column_prev} = $self->{column};
1376 $self->{column}++;
1377 $self->{nc}
1378 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1379 } else {
1380 $self->{set_nc}->($self);
1381 }
1382
1383 redo A;
1384 } elsif ($self->{nc} == 0x002F) { # /
1385 if ($self->{is_xml}) {
1386
1387 ## XML5: Not a parse error.
1388 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr value'); ## TODO: type
1389 } else {
1390
1391 }
1392
1393 $before_leave->();
1394 $self->{state} = SELF_CLOSING_START_TAG_STATE;
1395
1396 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1397 $self->{line_prev} = $self->{line};
1398 $self->{column_prev} = $self->{column};
1399 $self->{column}++;
1400 $self->{nc}
1401 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1402 } else {
1403 $self->{set_nc}->($self);
1404 }
1405
1406 redo A;
1407 } elsif ($self->{nc} == -1) {
1408 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1409 $before_leave->();
1410 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1411
1412 $self->{last_stag_name} = $self->{ct}->{tag_name};
1413 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1414 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1415 if ($self->{ct}->{attributes}) {
1416
1417 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1418 } else {
1419 ## NOTE: This state should never be reached.
1420
1421 }
1422 } else {
1423 die "$0: $self->{ct}->{type}: Unknown token type";
1424 }
1425 $self->{state} = DATA_STATE;
1426 $self->{s_kwd} = '';
1427 # reconsume
1428
1429 return ($self->{ct}); # start tag or end tag
1430
1431 redo A;
1432 } else {
1433 if ($self->{nc} == 0x0022 or # "
1434 $self->{nc} == 0x0027) { # '
1435
1436 ## XML5: Not a parse error.
1437 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name');
1438 } else {
1439
1440 }
1441 $self->{ca}->{name} .= chr ($self->{nc});
1442 ## Stay in the state
1443
1444 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1445 $self->{line_prev} = $self->{line};
1446 $self->{column_prev} = $self->{column};
1447 $self->{column}++;
1448 $self->{nc}
1449 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1450 } else {
1451 $self->{set_nc}->($self);
1452 }
1453
1454 redo A;
1455 }
1456 } elsif ($self->{state} == AFTER_ATTRIBUTE_NAME_STATE) {
1457 ## XML5: "Tag attribute name after state".
1458
1459 if ($is_space->{$self->{nc}}) {
1460
1461 ## Stay in the state
1462
1463 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1464 $self->{line_prev} = $self->{line};
1465 $self->{column_prev} = $self->{column};
1466 $self->{column}++;
1467 $self->{nc}
1468 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1469 } else {
1470 $self->{set_nc}->($self);
1471 }
1472
1473 redo A;
1474 } elsif ($self->{nc} == 0x003D) { # =
1475
1476 $self->{state} = BEFORE_ATTRIBUTE_VALUE_STATE;
1477
1478 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1479 $self->{line_prev} = $self->{line};
1480 $self->{column_prev} = $self->{column};
1481 $self->{column}++;
1482 $self->{nc}
1483 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1484 } else {
1485 $self->{set_nc}->($self);
1486 }
1487
1488 redo A;
1489 } elsif ($self->{nc} == 0x003E) { # >
1490 if ($self->{is_xml}) {
1491
1492 ## XML5: Not a parse error.
1493 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr value'); ## TODO: type
1494 } else {
1495
1496 }
1497
1498 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1499
1500 $self->{last_stag_name} = $self->{ct}->{tag_name};
1501 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1502 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1503 if ($self->{ct}->{attributes}) {
1504
1505 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1506 } else {
1507 ## NOTE: This state should never be reached.
1508
1509 }
1510 } else {
1511 die "$0: $self->{ct}->{type}: Unknown token type";
1512 }
1513 $self->{state} = DATA_STATE;
1514 $self->{s_kwd} = '';
1515
1516 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1517 $self->{line_prev} = $self->{line};
1518 $self->{column_prev} = $self->{column};
1519 $self->{column}++;
1520 $self->{nc}
1521 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1522 } else {
1523 $self->{set_nc}->($self);
1524 }
1525
1526
1527 return ($self->{ct}); # start tag or end tag
1528
1529 redo A;
1530 } elsif (0x0041 <= $self->{nc} and
1531 $self->{nc} <= 0x005A) { # A..Z
1532
1533 $self->{ca}
1534 = {name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
1535 value => '',
1536 line => $self->{line}, column => $self->{column}};
1537 $self->{state} = ATTRIBUTE_NAME_STATE;
1538
1539 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1540 $self->{line_prev} = $self->{line};
1541 $self->{column_prev} = $self->{column};
1542 $self->{column}++;
1543 $self->{nc}
1544 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1545 } else {
1546 $self->{set_nc}->($self);
1547 }
1548
1549 redo A;
1550 } elsif ($self->{nc} == 0x002F) { # /
1551 if ($self->{is_xml}) {
1552
1553 ## XML5: Not a parse error.
1554 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr value'); ## TODO: type
1555 } else {
1556
1557 }
1558
1559 $self->{state} = SELF_CLOSING_START_TAG_STATE;
1560
1561 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1562 $self->{line_prev} = $self->{line};
1563 $self->{column_prev} = $self->{column};
1564 $self->{column}++;
1565 $self->{nc}
1566 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1567 } else {
1568 $self->{set_nc}->($self);
1569 }
1570
1571 redo A;
1572 } elsif ($self->{nc} == -1) {
1573 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1574 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1575
1576 $self->{last_stag_name} = $self->{ct}->{tag_name};
1577 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1578 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1579 if ($self->{ct}->{attributes}) {
1580
1581 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1582 } else {
1583 ## NOTE: This state should never be reached.
1584
1585 }
1586 } else {
1587 die "$0: $self->{ct}->{type}: Unknown token type";
1588 }
1589 $self->{s_kwd} = '';
1590 $self->{state} = DATA_STATE;
1591 # reconsume
1592
1593 return ($self->{ct}); # start tag or end tag
1594
1595 redo A;
1596 } else {
1597 if ($self->{is_xml}) {
1598
1599 ## XML5: Not a parse error.
1600 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr value'); ## TODO: type
1601 } else {
1602
1603 }
1604
1605 if ($self->{nc} == 0x0022 or # "
1606 $self->{nc} == 0x0027) { # '
1607
1608 ## XML5: Not a parse error.
1609 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name');
1610 } else {
1611
1612 }
1613 $self->{ca}
1614 = {name => chr ($self->{nc}),
1615 value => '',
1616 line => $self->{line}, column => $self->{column}};
1617 $self->{state} = ATTRIBUTE_NAME_STATE;
1618
1619 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1620 $self->{line_prev} = $self->{line};
1621 $self->{column_prev} = $self->{column};
1622 $self->{column}++;
1623 $self->{nc}
1624 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1625 } else {
1626 $self->{set_nc}->($self);
1627 }
1628
1629 redo A;
1630 }
1631 } elsif ($self->{state} == BEFORE_ATTRIBUTE_VALUE_STATE) {
1632 ## XML5: "Tag attribute value before state".
1633
1634 if ($is_space->{$self->{nc}}) {
1635
1636 ## Stay in the state
1637
1638 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1639 $self->{line_prev} = $self->{line};
1640 $self->{column_prev} = $self->{column};
1641 $self->{column}++;
1642 $self->{nc}
1643 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1644 } else {
1645 $self->{set_nc}->($self);
1646 }
1647
1648 redo A;
1649 } elsif ($self->{nc} == 0x0022) { # "
1650
1651 $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
1652
1653 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1654 $self->{line_prev} = $self->{line};
1655 $self->{column_prev} = $self->{column};
1656 $self->{column}++;
1657 $self->{nc}
1658 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1659 } else {
1660 $self->{set_nc}->($self);
1661 }
1662
1663 redo A;
1664 } elsif ($self->{nc} == 0x0026) { # &
1665
1666 $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
1667 ## reconsume
1668 redo A;
1669 } elsif ($self->{nc} == 0x0027) { # '
1670
1671 $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
1672
1673 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1674 $self->{line_prev} = $self->{line};
1675 $self->{column_prev} = $self->{column};
1676 $self->{column}++;
1677 $self->{nc}
1678 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1679 } else {
1680 $self->{set_nc}->($self);
1681 }
1682
1683 redo A;
1684 } elsif ($self->{nc} == 0x003E) { # >
1685 $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty unquoted attribute value');
1686 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1687
1688 $self->{last_stag_name} = $self->{ct}->{tag_name};
1689 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1690 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1691 if ($self->{ct}->{attributes}) {
1692
1693 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1694 } else {
1695 ## NOTE: This state should never be reached.
1696
1697 }
1698 } else {
1699 die "$0: $self->{ct}->{type}: Unknown token type";
1700 }
1701 $self->{state} = DATA_STATE;
1702 $self->{s_kwd} = '';
1703
1704 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1705 $self->{line_prev} = $self->{line};
1706 $self->{column_prev} = $self->{column};
1707 $self->{column}++;
1708 $self->{nc}
1709 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1710 } else {
1711 $self->{set_nc}->($self);
1712 }
1713
1714
1715 return ($self->{ct}); # start tag or end tag
1716
1717 redo A;
1718 } elsif ($self->{nc} == -1) {
1719 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1720 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1721
1722 $self->{last_stag_name} = $self->{ct}->{tag_name};
1723 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1724 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1725 if ($self->{ct}->{attributes}) {
1726
1727 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1728 } else {
1729 ## NOTE: This state should never be reached.
1730
1731 }
1732 } else {
1733 die "$0: $self->{ct}->{type}: Unknown token type";
1734 }
1735 $self->{state} = DATA_STATE;
1736 $self->{s_kwd} = '';
1737 ## reconsume
1738
1739 return ($self->{ct}); # start tag or end tag
1740
1741 redo A;
1742 } else {
1743 if ($self->{nc} == 0x003D) { # =
1744
1745 ## XML5: Not a parse error.
1746 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute value');
1747 } elsif ($self->{is_xml}) {
1748
1749 ## XML5: No parse error.
1750 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unquoted attr value'); ## TODO
1751 } else {
1752
1753 }
1754 $self->{ca}->{value} .= chr ($self->{nc});
1755 $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
1756
1757 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1758 $self->{line_prev} = $self->{line};
1759 $self->{column_prev} = $self->{column};
1760 $self->{column}++;
1761 $self->{nc}
1762 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1763 } else {
1764 $self->{set_nc}->($self);
1765 }
1766
1767 redo A;
1768 }
1769 } elsif ($self->{state} == ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE) {
1770 ## XML5: "Tag attribute value double quoted state" and "DOCTYPE
1771 ## ATTLIST attribute value double quoted state".
1772
1773 if ($self->{nc} == 0x0022) { # "
1774 if ($self->{ct}->{type} == ATTLIST_TOKEN) {
1775
1776 ## XML5: "DOCTYPE ATTLIST name after state".
1777 push @{$self->{ct}->{attrdefs}}, $self->{ca};
1778 $self->{state} = AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE;
1779 } else {
1780
1781 ## XML5: "Tag attribute name before state".
1782 $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1783 }
1784
1785 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1786 $self->{line_prev} = $self->{line};
1787 $self->{column_prev} = $self->{column};
1788 $self->{column}++;
1789 $self->{nc}
1790 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1791 } else {
1792 $self->{set_nc}->($self);
1793 }
1794
1795 redo A;
1796 } elsif ($self->{nc} == 0x0026) { # &
1797
1798 ## XML5: Not defined yet.
1799
1800 ## NOTE: In the spec, the tokenizer is switched to the
1801 ## "entity in attribute value state". In this implementation, the
1802 ## tokenizer is switched to the |ENTITY_STATE|, which is an
1803 ## implementation of the "consume a character reference" algorithm.
1804 $self->{prev_state} = $self->{state};
1805 $self->{entity_add} = 0x0022; # "
1806 $self->{state} = ENTITY_STATE;
1807
1808 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1809 $self->{line_prev} = $self->{line};
1810 $self->{column_prev} = $self->{column};
1811 $self->{column}++;
1812 $self->{nc}
1813 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1814 } else {
1815 $self->{set_nc}->($self);
1816 }
1817
1818 redo A;
1819 } elsif ($self->{nc} == -1) {
1820 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed attribute value');
1821 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1822
1823 $self->{last_stag_name} = $self->{ct}->{tag_name};
1824
1825 $self->{state} = DATA_STATE;
1826 $self->{s_kwd} = '';
1827 ## reconsume
1828 return ($self->{ct}); # start tag
1829 redo A;
1830 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1831 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1832 if ($self->{ct}->{attributes}) {
1833
1834 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1835 } else {
1836 ## NOTE: This state should never be reached.
1837
1838 }
1839
1840 $self->{state} = DATA_STATE;
1841 $self->{s_kwd} = '';
1842 ## reconsume
1843 return ($self->{ct}); # end tag
1844 redo A;
1845 } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
1846 ## XML5: No parse error above; not defined yet.
1847 push @{$self->{ct}->{attrdefs}}, $self->{ca};
1848 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1849 ## Reconsume.
1850 return ($self->{ct}); # ATTLIST
1851 redo A;
1852 } else {
1853 die "$0: $self->{ct}->{type}: Unknown token type";
1854 }
1855 } else {
1856 ## XML5 [ATTLIST]: Not defined yet.
1857 if ($self->{is_xml} and $self->{nc} == 0x003C) { # <
1858
1859 ## XML5: Not a parse error.
1860 $self->{parse_error}->(level => $self->{level}->{must}, type => 'lt in attr value'); ## TODO: type
1861 } else {
1862
1863 }
1864 $self->{ca}->{value} .= chr ($self->{nc});
1865 $self->{read_until}->($self->{ca}->{value},
1866 q["&<],
1867 length $self->{ca}->{value});
1868
1869 ## Stay in the state
1870
1871 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1872 $self->{line_prev} = $self->{line};
1873 $self->{column_prev} = $self->{column};
1874 $self->{column}++;
1875 $self->{nc}
1876 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1877 } else {
1878 $self->{set_nc}->($self);
1879 }
1880
1881 redo A;
1882 }
1883 } elsif ($self->{state} == ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE) {
1884 ## XML5: "Tag attribute value single quoted state" and "DOCTYPE
1885 ## ATTLIST attribute value single quoted state".
1886
1887 if ($self->{nc} == 0x0027) { # '
1888 if ($self->{ct}->{type} == ATTLIST_TOKEN) {
1889
1890 ## XML5: "DOCTYPE ATTLIST name after state".
1891 push @{$self->{ct}->{attrdefs}}, $self->{ca};
1892 $self->{state} = AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE;
1893 } else {
1894
1895 ## XML5: "Before attribute name state" (sic).
1896 $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1897 }
1898
1899 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1900 $self->{line_prev} = $self->{line};
1901 $self->{column_prev} = $self->{column};
1902 $self->{column}++;
1903 $self->{nc}
1904 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1905 } else {
1906 $self->{set_nc}->($self);
1907 }
1908
1909 redo A;
1910 } elsif ($self->{nc} == 0x0026) { # &
1911
1912 ## XML5: Not defined yet.
1913
1914 ## NOTE: In the spec, the tokenizer is switched to the
1915 ## "entity in attribute value state". In this implementation, the
1916 ## tokenizer is switched to the |ENTITY_STATE|, which is an
1917 ## implementation of the "consume a character reference" algorithm.
1918 $self->{entity_add} = 0x0027; # '
1919 $self->{prev_state} = $self->{state};
1920 $self->{state} = ENTITY_STATE;
1921
1922 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1923 $self->{line_prev} = $self->{line};
1924 $self->{column_prev} = $self->{column};
1925 $self->{column}++;
1926 $self->{nc}
1927 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1928 } else {
1929 $self->{set_nc}->($self);
1930 }
1931
1932 redo A;
1933 } elsif ($self->{nc} == -1) {
1934 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed attribute value');
1935 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1936
1937 $self->{last_stag_name} = $self->{ct}->{tag_name};
1938
1939 $self->{state} = DATA_STATE;
1940 $self->{s_kwd} = '';
1941 ## reconsume
1942 return ($self->{ct}); # start tag
1943 redo A;
1944 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1945 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1946 if ($self->{ct}->{attributes}) {
1947
1948 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1949 } else {
1950 ## NOTE: This state should never be reached.
1951
1952 }
1953
1954 $self->{state} = DATA_STATE;
1955 $self->{s_kwd} = '';
1956 ## reconsume
1957 return ($self->{ct}); # end tag
1958 redo A;
1959 } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
1960 ## XML5: No parse error above; not defined yet.
1961 push @{$self->{ct}->{attrdefs}}, $self->{ca};
1962 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1963 ## Reconsume.
1964 return ($self->{ct}); # ATTLIST
1965 redo A;
1966 } else {
1967 die "$0: $self->{ct}->{type}: Unknown token type";
1968 }
1969 } else {
1970 ## XML5 [ATTLIST]: Not defined yet.
1971 if ($self->{is_xml} and $self->{nc} == 0x003C) { # <
1972
1973 ## XML5: Not a parse error.
1974 $self->{parse_error}->(level => $self->{level}->{must}, type => 'lt in attr value'); ## TODO: type
1975 } else {
1976
1977 }
1978 $self->{ca}->{value} .= chr ($self->{nc});
1979 $self->{read_until}->($self->{ca}->{value},
1980 q['&<],
1981 length $self->{ca}->{value});
1982
1983 ## Stay in the state
1984
1985 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1986 $self->{line_prev} = $self->{line};
1987 $self->{column_prev} = $self->{column};
1988 $self->{column}++;
1989 $self->{nc}
1990 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1991 } else {
1992 $self->{set_nc}->($self);
1993 }
1994
1995 redo A;
1996 }
1997 } elsif ($self->{state} == ATTRIBUTE_VALUE_UNQUOTED_STATE) {
1998 ## XML5: "Tag attribute value unquoted state".
1999
2000 if ($is_space->{$self->{nc}}) {
2001 if ($self->{ct}->{type} == ATTLIST_TOKEN) {
2002
2003 push @{$self->{ct}->{attrdefs}}, $self->{ca};
2004 $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
2005 } else {
2006
2007 ## XML5: "Tag attribute name before state".
2008 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
2009 }
2010
2011 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2012 $self->{line_prev} = $self->{line};
2013 $self->{column_prev} = $self->{column};
2014 $self->{column}++;
2015 $self->{nc}
2016 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2017 } else {
2018 $self->{set_nc}->($self);
2019 }
2020
2021 redo A;
2022 } elsif ($self->{nc} == 0x0026) { # &
2023
2024
2025 ## XML5: Not defined yet.
2026
2027 ## NOTE: In the spec, the tokenizer is switched to the
2028 ## "entity in attribute value state". In this implementation, the
2029 ## tokenizer is switched to the |ENTITY_STATE|, which is an
2030 ## implementation of the "consume a character reference" algorithm.
2031 $self->{entity_add} = -1;
2032 $self->{prev_state} = $self->{state};
2033 $self->{state} = ENTITY_STATE;
2034
2035 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2036 $self->{line_prev} = $self->{line};
2037 $self->{column_prev} = $self->{column};
2038 $self->{column}++;
2039 $self->{nc}
2040 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2041 } else {
2042 $self->{set_nc}->($self);
2043 }
2044
2045 redo A;
2046 } elsif ($self->{nc} == 0x003E) { # >
2047 if ($self->{ct}->{type} == START_TAG_TOKEN) {
2048
2049 $self->{last_stag_name} = $self->{ct}->{tag_name};
2050
2051 $self->{state} = DATA_STATE;
2052 $self->{s_kwd} = '';
2053
2054 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2055 $self->{line_prev} = $self->{line};
2056 $self->{column_prev} = $self->{column};
2057 $self->{column}++;
2058 $self->{nc}
2059 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2060 } else {
2061 $self->{set_nc}->($self);
2062 }
2063
2064 return ($self->{ct}); # start tag
2065 redo A;
2066 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
2067 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
2068 if ($self->{ct}->{attributes}) {
2069
2070 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
2071 } else {
2072 ## NOTE: This state should never be reached.
2073
2074 }
2075
2076 $self->{state} = DATA_STATE;
2077 $self->{s_kwd} = '';
2078
2079 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2080 $self->{line_prev} = $self->{line};
2081 $self->{column_prev} = $self->{column};
2082 $self->{column}++;
2083 $self->{nc}
2084 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2085 } else {
2086 $self->{set_nc}->($self);
2087 }
2088
2089 return ($self->{ct}); # end tag
2090 redo A;
2091 } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
2092 push @{$self->{ct}->{attrdefs}}, $self->{ca};
2093 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2094
2095 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2096 $self->{line_prev} = $self->{line};
2097 $self->{column_prev} = $self->{column};
2098 $self->{column}++;
2099 $self->{nc}
2100 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2101 } else {
2102 $self->{set_nc}->($self);
2103 }
2104
2105 return ($self->{ct}); # ATTLIST
2106 redo A;
2107 } else {
2108 die "$0: $self->{ct}->{type}: Unknown token type";
2109 }
2110 } elsif ($self->{nc} == -1) {
2111 if ($self->{ct}->{type} == START_TAG_TOKEN) {
2112
2113 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
2114 $self->{last_stag_name} = $self->{ct}->{tag_name};
2115
2116 $self->{state} = DATA_STATE;
2117 $self->{s_kwd} = '';
2118 ## reconsume
2119 return ($self->{ct}); # start tag
2120 redo A;
2121 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
2122 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
2123 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
2124 if ($self->{ct}->{attributes}) {
2125
2126 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
2127 } else {
2128 ## NOTE: This state should never be reached.
2129
2130 }
2131
2132 $self->{state} = DATA_STATE;
2133 $self->{s_kwd} = '';
2134 ## reconsume
2135 return ($self->{ct}); # end tag
2136 redo A;
2137 } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
2138 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
2139 push @{$self->{ct}->{attrdefs}}, $self->{ca};
2140 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2141 ## Reconsume.
2142 return ($self->{ct}); # ATTLIST
2143 redo A;
2144 } else {
2145 die "$0: $self->{ct}->{type}: Unknown token type";
2146 }
2147 } else {
2148 if ({
2149 0x0022 => 1, # "
2150 0x0027 => 1, # '
2151 0x003D => 1, # =
2152 }->{$self->{nc}}) {
2153
2154 ## XML5: Not a parse error.
2155 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute value');
2156 } else {
2157
2158 }
2159 $self->{ca}->{value} .= chr ($self->{nc});
2160 $self->{read_until}->($self->{ca}->{value},
2161 q["'=& >],
2162 length $self->{ca}->{value});
2163
2164 ## Stay in the state
2165
2166 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2167 $self->{line_prev} = $self->{line};
2168 $self->{column_prev} = $self->{column};
2169 $self->{column}++;
2170 $self->{nc}
2171 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2172 } else {
2173 $self->{set_nc}->($self);
2174 }
2175
2176 redo A;
2177 }
2178 } elsif ($self->{state} == AFTER_ATTRIBUTE_VALUE_QUOTED_STATE) {
2179 if ($is_space->{$self->{nc}}) {
2180
2181 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
2182
2183 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2184 $self->{line_prev} = $self->{line};
2185 $self->{column_prev} = $self->{column};
2186 $self->{column}++;
2187 $self->{nc}
2188 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2189 } else {
2190 $self->{set_nc}->($self);
2191 }
2192
2193 redo A;
2194 } elsif ($self->{nc} == 0x003E) { # >
2195 if ($self->{ct}->{type} == START_TAG_TOKEN) {
2196
2197 $self->{last_stag_name} = $self->{ct}->{tag_name};
2198 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
2199 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
2200 if ($self->{ct}->{attributes}) {
2201
2202 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
2203 } else {
2204 ## NOTE: This state should never be reached.
2205
2206 }
2207 } else {
2208 die "$0: $self->{ct}->{type}: Unknown token type";
2209 }
2210 $self->{state} = DATA_STATE;
2211 $self->{s_kwd} = '';
2212
2213 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2214 $self->{line_prev} = $self->{line};
2215 $self->{column_prev} = $self->{column};
2216 $self->{column}++;
2217 $self->{nc}
2218 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2219 } else {
2220 $self->{set_nc}->($self);
2221 }
2222
2223
2224 return ($self->{ct}); # start tag or end tag
2225
2226 redo A;
2227 } elsif ($self->{nc} == 0x002F) { # /
2228
2229 $self->{state} = SELF_CLOSING_START_TAG_STATE;
2230
2231 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2232 $self->{line_prev} = $self->{line};
2233 $self->{column_prev} = $self->{column};
2234 $self->{column}++;
2235 $self->{nc}
2236 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2237 } else {
2238 $self->{set_nc}->($self);
2239 }
2240
2241 redo A;
2242 } elsif ($self->{nc} == -1) {
2243 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
2244 if ($self->{ct}->{type} == START_TAG_TOKEN) {
2245
2246 $self->{last_stag_name} = $self->{ct}->{tag_name};
2247 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
2248 if ($self->{ct}->{attributes}) {
2249
2250 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
2251 } else {
2252 ## NOTE: This state should never be reached.
2253
2254 }
2255 } else {
2256 die "$0: $self->{ct}->{type}: Unknown token type";
2257 }
2258 $self->{state} = DATA_STATE;
2259 $self->{s_kwd} = '';
2260 ## Reconsume.
2261 return ($self->{ct}); # start tag or end tag
2262 redo A;
2263 } else {
2264
2265 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space between attributes');
2266 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
2267 ## reconsume
2268 redo A;
2269 }
2270 } elsif ($self->{state} == SELF_CLOSING_START_TAG_STATE) {
2271 ## XML5: "Empty tag state".
2272
2273 if ($self->{nc} == 0x003E) { # >
2274 if ($self->{ct}->{type} == END_TAG_TOKEN) {
2275
2276 $self->{parse_error}->(level => $self->{level}->{must}, type => 'nestc', token => $self->{ct});
2277 ## TODO: Different type than slash in start tag
2278 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
2279 if ($self->{ct}->{attributes}) {
2280
2281 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
2282 } else {
2283
2284 }
2285 ## TODO: Test |<title></title/>|
2286 } else {
2287
2288 $self->{self_closing} = 1;
2289 }
2290
2291 $self->{state} = DATA_STATE;
2292 $self->{s_kwd} = '';
2293
2294 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2295 $self->{line_prev} = $self->{line};
2296 $self->{column_prev} = $self->{column};
2297 $self->{column}++;
2298 $self->{nc}
2299 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2300 } else {
2301 $self->{set_nc}->($self);
2302 }
2303
2304
2305 return ($self->{ct}); # start tag or end tag
2306
2307 redo A;
2308 } elsif ($self->{nc} == -1) {
2309 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
2310 if ($self->{ct}->{type} == START_TAG_TOKEN) {
2311
2312 $self->{last_stag_name} = $self->{ct}->{tag_name};
2313 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
2314 if ($self->{ct}->{attributes}) {
2315
2316 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
2317 } else {
2318 ## NOTE: This state should never be reached.
2319
2320 }
2321 } else {
2322 die "$0: $self->{ct}->{type}: Unknown token type";
2323 }
2324 ## XML5: "Tag attribute name before state".
2325 $self->{state} = DATA_STATE;
2326 $self->{s_kwd} = '';
2327 ## Reconsume.
2328 return ($self->{ct}); # start tag or end tag
2329 redo A;
2330 } else {
2331
2332 $self->{parse_error}->(level => $self->{level}->{must}, type => 'nestc');
2333 ## TODO: This error type is wrong.
2334 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
2335 ## Reconsume.
2336 redo A;
2337 }
2338 } elsif ($self->{state} == BOGUS_COMMENT_STATE) {
2339 ## XML5: "Bogus comment state" and "DOCTYPE bogus comment state".
2340
2341 ## NOTE: Unlike spec's "bogus comment state", this implementation
2342 ## consumes characters on a one-by-one basis.
2343
2344 if ($self->{nc} == 0x003E) { # >
2345 if ($self->{in_subset}) {
2346
2347 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2348 } else {
2349
2350 $self->{state} = DATA_STATE;
2351 $self->{s_kwd} = '';
2352 }
2353
2354 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2355 $self->{line_prev} = $self->{line};
2356 $self->{column_prev} = $self->{column};
2357 $self->{column}++;
2358 $self->{nc}
2359 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2360 } else {
2361 $self->{set_nc}->($self);
2362 }
2363
2364
2365 return ($self->{ct}); # comment
2366 redo A;
2367 } elsif ($self->{nc} == -1) {
2368 if ($self->{in_subset}) {
2369
2370 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2371 } else {
2372
2373 $self->{state} = DATA_STATE;
2374 $self->{s_kwd} = '';
2375 }
2376 ## reconsume
2377
2378 return ($self->{ct}); # comment
2379 redo A;
2380 } else {
2381
2382 $self->{ct}->{data} .= chr ($self->{nc}); # comment
2383 $self->{read_until}->($self->{ct}->{data},
2384 q[>],
2385 length $self->{ct}->{data});
2386
2387 ## Stay in the state.
2388
2389 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2390 $self->{line_prev} = $self->{line};
2391 $self->{column_prev} = $self->{column};
2392 $self->{column}++;
2393 $self->{nc}
2394 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2395 } else {
2396 $self->{set_nc}->($self);
2397 }
2398
2399 redo A;
2400 }
2401 } elsif ($self->{state} == MARKUP_DECLARATION_OPEN_STATE) {
2402 ## XML5: "Markup declaration state".
2403
2404 if ($self->{nc} == 0x002D) { # -
2405
2406 $self->{state} = MD_HYPHEN_STATE;
2407
2408 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2409 $self->{line_prev} = $self->{line};
2410 $self->{column_prev} = $self->{column};
2411 $self->{column}++;
2412 $self->{nc}
2413 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2414 } else {
2415 $self->{set_nc}->($self);
2416 }
2417
2418 redo A;
2419 } elsif ($self->{nc} == 0x0044 or # D
2420 $self->{nc} == 0x0064) { # d
2421 ## ASCII case-insensitive.
2422
2423 $self->{state} = MD_DOCTYPE_STATE;
2424 $self->{kwd} = chr $self->{nc};
2425
2426 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2427 $self->{line_prev} = $self->{line};
2428 $self->{column_prev} = $self->{column};
2429 $self->{column}++;
2430 $self->{nc}
2431 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2432 } else {
2433 $self->{set_nc}->($self);
2434 }
2435
2436 redo A;
2437 } elsif ((($self->{insertion_mode} & IN_FOREIGN_CONTENT_IM and
2438 $self->{open_elements}->[-1]->[1] & FOREIGN_EL) or
2439 $self->{is_xml}) and
2440 $self->{nc} == 0x005B) { # [
2441
2442 $self->{state} = MD_CDATA_STATE;
2443 $self->{kwd} = '[';
2444
2445 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2446 $self->{line_prev} = $self->{line};
2447 $self->{column_prev} = $self->{column};
2448 $self->{column}++;
2449 $self->{nc}
2450 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2451 } else {
2452 $self->{set_nc}->($self);
2453 }
2454
2455 redo A;
2456 } else {
2457
2458 }
2459
2460 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
2461 line => $self->{line_prev},
2462 column => $self->{column_prev} - 1);
2463 ## Reconsume.
2464 $self->{state} = BOGUS_COMMENT_STATE;
2465 $self->{ct} = {type => COMMENT_TOKEN, data => '',
2466 line => $self->{line_prev},
2467 column => $self->{column_prev} - 1,
2468 };
2469 redo A;
2470 } elsif ($self->{state} == MD_HYPHEN_STATE) {
2471 if ($self->{nc} == 0x002D) { # -
2472
2473 $self->{ct} = {type => COMMENT_TOKEN, data => '',
2474 line => $self->{line_prev},
2475 column => $self->{column_prev} - 2,
2476 };
2477 $self->{state} = COMMENT_START_STATE; ## XML5: "comment state".
2478
2479 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2480 $self->{line_prev} = $self->{line};
2481 $self->{column_prev} = $self->{column};
2482 $self->{column}++;
2483 $self->{nc}
2484 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2485 } else {
2486 $self->{set_nc}->($self);
2487 }
2488
2489 redo A;
2490 } else {
2491
2492 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
2493 line => $self->{line_prev},
2494 column => $self->{column_prev} - 2);
2495 $self->{state} = BOGUS_COMMENT_STATE;
2496 ## Reconsume.
2497 $self->{ct} = {type => COMMENT_TOKEN,
2498 data => '-',
2499 line => $self->{line_prev},
2500 column => $self->{column_prev} - 2,
2501 };
2502 redo A;
2503 }
2504 } elsif ($self->{state} == MD_DOCTYPE_STATE) {
2505 ## ASCII case-insensitive.
2506 if ($self->{nc} == [
2507 undef,
2508 0x004F, # O
2509 0x0043, # C
2510 0x0054, # T
2511 0x0059, # Y
2512 0x0050, # P
2513 ]->[length $self->{kwd}] or
2514 $self->{nc} == [
2515 undef,
2516 0x006F, # o
2517 0x0063, # c
2518 0x0074, # t
2519 0x0079, # y
2520 0x0070, # p
2521 ]->[length $self->{kwd}]) {
2522
2523 ## Stay in the state.
2524 $self->{kwd} .= chr $self->{nc};
2525
2526 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2527 $self->{line_prev} = $self->{line};
2528 $self->{column_prev} = $self->{column};
2529 $self->{column}++;
2530 $self->{nc}
2531 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2532 } else {
2533 $self->{set_nc}->($self);
2534 }
2535
2536 redo A;
2537 } elsif ((length $self->{kwd}) == 6 and
2538 ($self->{nc} == 0x0045 or # E
2539 $self->{nc} == 0x0065)) { # e
2540 if ($self->{is_xml} and
2541 ($self->{kwd} ne 'DOCTYP' or $self->{nc} == 0x0065)) {
2542
2543 ## XML5: case-sensitive.
2544 $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO
2545 text => 'DOCTYPE',
2546 line => $self->{line_prev},
2547 column => $self->{column_prev} - 5);
2548 } else {
2549
2550 }
2551 $self->{state} = DOCTYPE_STATE;
2552 $self->{ct} = {type => DOCTYPE_TOKEN,
2553 quirks => 1,
2554 line => $self->{line_prev},
2555 column => $self->{column_prev} - 7,
2556 };
2557
2558 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2559 $self->{line_prev} = $self->{line};
2560 $self->{column_prev} = $self->{column};
2561 $self->{column}++;
2562 $self->{nc}
2563 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2564 } else {
2565 $self->{set_nc}->($self);
2566 }
2567
2568 redo A;
2569 } else {
2570
2571 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
2572 line => $self->{line_prev},
2573 column => $self->{column_prev} - 1 - length $self->{kwd});
2574 $self->{state} = BOGUS_COMMENT_STATE;
2575 ## Reconsume.
2576 $self->{ct} = {type => COMMENT_TOKEN,
2577 data => $self->{kwd},
2578 line => $self->{line_prev},
2579 column => $self->{column_prev} - 1 - length $self->{kwd},
2580 };
2581 redo A;
2582 }
2583 } elsif ($self->{state} == MD_CDATA_STATE) {
2584 if ($self->{nc} == {
2585 '[' => 0x0043, # C
2586 '[C' => 0x0044, # D
2587 '[CD' => 0x0041, # A
2588 '[CDA' => 0x0054, # T
2589 '[CDAT' => 0x0041, # A
2590 }->{$self->{kwd}}) {
2591
2592 ## Stay in the state.
2593 $self->{kwd} .= chr $self->{nc};
2594
2595 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2596 $self->{line_prev} = $self->{line};
2597 $self->{column_prev} = $self->{column};
2598 $self->{column}++;
2599 $self->{nc}
2600 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2601 } else {
2602 $self->{set_nc}->($self);
2603 }
2604
2605 redo A;
2606 } elsif ($self->{kwd} eq '[CDATA' and
2607 $self->{nc} == 0x005B) { # [
2608 if ($self->{is_xml} and
2609 not $self->{tainted} and
2610 @{$self->{open_elements} or []} == 0) {
2611
2612 $self->{parse_error}->(level => $self->{level}->{must}, type => 'cdata outside of root element',
2613 line => $self->{line_prev},
2614 column => $self->{column_prev} - 7);
2615 $self->{tainted} = 1;
2616 } else {
2617
2618 }
2619
2620 $self->{ct} = {type => CHARACTER_TOKEN,
2621 data => '',
2622 line => $self->{line_prev},
2623 column => $self->{column_prev} - 7};
2624 $self->{state} = CDATA_SECTION_STATE;
2625
2626 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2627 $self->{line_prev} = $self->{line};
2628 $self->{column_prev} = $self->{column};
2629 $self->{column}++;
2630 $self->{nc}
2631 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2632 } else {
2633 $self->{set_nc}->($self);
2634 }
2635
2636 redo A;
2637 } else {
2638
2639 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
2640 line => $self->{line_prev},
2641 column => $self->{column_prev} - 1 - length $self->{kwd});
2642 $self->{state} = BOGUS_COMMENT_STATE;
2643 ## Reconsume.
2644 $self->{ct} = {type => COMMENT_TOKEN,
2645 data => $self->{kwd},
2646 line => $self->{line_prev},
2647 column => $self->{column_prev} - 1 - length $self->{kwd},
2648 };
2649 redo A;
2650 }
2651 } elsif ($self->{state} == COMMENT_START_STATE) {
2652 if ($self->{nc} == 0x002D) { # -
2653
2654 $self->{state} = COMMENT_START_DASH_STATE;
2655
2656 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2657 $self->{line_prev} = $self->{line};
2658 $self->{column_prev} = $self->{column};
2659 $self->{column}++;
2660 $self->{nc}
2661 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2662 } else {
2663 $self->{set_nc}->($self);
2664 }
2665
2666 redo A;
2667 } elsif ($self->{nc} == 0x003E) { # >
2668 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment');
2669 if ($self->{in_subset}) {
2670
2671 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2672 } else {
2673
2674 $self->{state} = DATA_STATE;
2675 $self->{s_kwd} = '';
2676 }
2677
2678 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2679 $self->{line_prev} = $self->{line};
2680 $self->{column_prev} = $self->{column};
2681 $self->{column}++;
2682 $self->{nc}
2683 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2684 } else {
2685 $self->{set_nc}->($self);
2686 }
2687
2688
2689 return ($self->{ct}); # comment
2690
2691 redo A;
2692 } elsif ($self->{nc} == -1) {
2693 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2694 if ($self->{in_subset}) {
2695
2696 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2697 } else {
2698
2699 $self->{state} = DATA_STATE;
2700 $self->{s_kwd} = '';
2701 }
2702 ## reconsume
2703
2704 return ($self->{ct}); # comment
2705
2706 redo A;
2707 } else {
2708
2709 $self->{ct}->{data} # comment
2710 .= chr ($self->{nc});
2711 $self->{state} = COMMENT_STATE;
2712
2713 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2714 $self->{line_prev} = $self->{line};
2715 $self->{column_prev} = $self->{column};
2716 $self->{column}++;
2717 $self->{nc}
2718 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2719 } else {
2720 $self->{set_nc}->($self);
2721 }
2722
2723 redo A;
2724 }
2725 } elsif ($self->{state} == COMMENT_START_DASH_STATE) {
2726 if ($self->{nc} == 0x002D) { # -
2727
2728 $self->{state} = COMMENT_END_STATE;
2729
2730 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2731 $self->{line_prev} = $self->{line};
2732 $self->{column_prev} = $self->{column};
2733 $self->{column}++;
2734 $self->{nc}
2735 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2736 } else {
2737 $self->{set_nc}->($self);
2738 }
2739
2740 redo A;
2741 } elsif ($self->{nc} == 0x003E) { # >
2742 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment');
2743 if ($self->{in_subset}) {
2744
2745 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2746 } else {
2747
2748 $self->{state} = DATA_STATE;
2749 $self->{s_kwd} = '';
2750 }
2751
2752 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2753 $self->{line_prev} = $self->{line};
2754 $self->{column_prev} = $self->{column};
2755 $self->{column}++;
2756 $self->{nc}
2757 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2758 } else {
2759 $self->{set_nc}->($self);
2760 }
2761
2762
2763 return ($self->{ct}); # comment
2764
2765 redo A;
2766 } elsif ($self->{nc} == -1) {
2767 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2768 if ($self->{in_subset}) {
2769
2770 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2771 } else {
2772
2773 $self->{state} = DATA_STATE;
2774 $self->{s_kwd} = '';
2775 }
2776 ## reconsume
2777
2778 return ($self->{ct}); # comment
2779
2780 redo A;
2781 } else {
2782
2783 $self->{ct}->{data} # comment
2784 .= '-' . chr ($self->{nc});
2785 $self->{state} = COMMENT_STATE;
2786
2787 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2788 $self->{line_prev} = $self->{line};
2789 $self->{column_prev} = $self->{column};
2790 $self->{column}++;
2791 $self->{nc}
2792 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2793 } else {
2794 $self->{set_nc}->($self);
2795 }
2796
2797 redo A;
2798 }
2799 } elsif ($self->{state} == COMMENT_STATE) {
2800 ## XML5: "Comment state" and "DOCTYPE comment state".
2801
2802 if ($self->{nc} == 0x002D) { # -
2803
2804 $self->{state} = COMMENT_END_DASH_STATE;
2805
2806 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2807 $self->{line_prev} = $self->{line};
2808 $self->{column_prev} = $self->{column};
2809 $self->{column}++;
2810 $self->{nc}
2811 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2812 } else {
2813 $self->{set_nc}->($self);
2814 }
2815
2816 redo A;
2817 } elsif ($self->{nc} == -1) {
2818 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2819 if ($self->{in_subset}) {
2820
2821 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2822 } else {
2823
2824 $self->{state} = DATA_STATE;
2825 $self->{s_kwd} = '';
2826 }
2827 ## reconsume
2828
2829 return ($self->{ct}); # comment
2830
2831 redo A;
2832 } else {
2833
2834 $self->{ct}->{data} .= chr ($self->{nc}); # comment
2835 $self->{read_until}->($self->{ct}->{data},
2836 q[-],
2837 length $self->{ct}->{data});
2838
2839 ## Stay in the state
2840
2841 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2842 $self->{line_prev} = $self->{line};
2843 $self->{column_prev} = $self->{column};
2844 $self->{column}++;
2845 $self->{nc}
2846 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2847 } else {
2848 $self->{set_nc}->($self);
2849 }
2850
2851 redo A;
2852 }
2853 } elsif ($self->{state} == COMMENT_END_DASH_STATE) {
2854 ## XML5: "Comment dash state" and "DOCTYPE comment dash state".
2855
2856 if ($self->{nc} == 0x002D) { # -
2857
2858 $self->{state} = COMMENT_END_STATE;
2859
2860 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2861 $self->{line_prev} = $self->{line};
2862 $self->{column_prev} = $self->{column};
2863 $self->{column}++;
2864 $self->{nc}
2865 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2866 } else {
2867 $self->{set_nc}->($self);
2868 }
2869
2870 redo A;
2871 } elsif ($self->{nc} == -1) {
2872 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2873 if ($self->{in_subset}) {
2874
2875 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2876 } else {
2877
2878 $self->{state} = DATA_STATE;
2879 $self->{s_kwd} = '';
2880 }
2881 ## reconsume
2882
2883 return ($self->{ct}); # comment
2884
2885 redo A;
2886 } else {
2887
2888 $self->{ct}->{data} .= '-' . chr ($self->{nc}); # comment
2889 $self->{state} = COMMENT_STATE;
2890
2891 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2892 $self->{line_prev} = $self->{line};
2893 $self->{column_prev} = $self->{column};
2894 $self->{column}++;
2895 $self->{nc}
2896 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2897 } else {
2898 $self->{set_nc}->($self);
2899 }
2900
2901 redo A;
2902 }
2903 } elsif ($self->{state} == COMMENT_END_STATE) {
2904 ## XML5: "Comment end state" and "DOCTYPE comment end state".
2905
2906 if ($self->{nc} == 0x003E) { # >
2907 if ($self->{in_subset}) {
2908
2909 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2910 } else {
2911
2912 $self->{state} = DATA_STATE;
2913 $self->{s_kwd} = '';
2914 }
2915
2916 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2917 $self->{line_prev} = $self->{line};
2918 $self->{column_prev} = $self->{column};
2919 $self->{column}++;
2920 $self->{nc}
2921 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2922 } else {
2923 $self->{set_nc}->($self);
2924 }
2925
2926
2927 return ($self->{ct}); # comment
2928
2929 redo A;
2930 } elsif ($self->{nc} == 0x002D) { # -
2931
2932 ## XML5: Not a parse error.
2933 $self->{parse_error}->(level => $self->{level}->{must}, type => 'dash in comment',
2934 line => $self->{line_prev},
2935 column => $self->{column_prev});
2936 $self->{ct}->{data} .= '-'; # comment
2937 ## Stay in the state
2938
2939 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2940 $self->{line_prev} = $self->{line};
2941 $self->{column_prev} = $self->{column};
2942 $self->{column}++;
2943 $self->{nc}
2944 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2945 } else {
2946 $self->{set_nc}->($self);
2947 }
2948
2949 redo A;
2950 } elsif ($self->{nc} == -1) {
2951 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2952 if ($self->{in_subset}) {
2953
2954 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2955 } else {
2956
2957 $self->{state} = DATA_STATE;
2958 $self->{s_kwd} = '';
2959 }
2960 ## reconsume
2961
2962 return ($self->{ct}); # comment
2963
2964 redo A;
2965 } else {
2966
2967 ## XML5: Not a parse error.
2968 $self->{parse_error}->(level => $self->{level}->{must}, type => 'dash in comment',
2969 line => $self->{line_prev},
2970 column => $self->{column_prev});
2971 $self->{ct}->{data} .= '--' . chr ($self->{nc}); # comment
2972 $self->{state} = COMMENT_STATE;
2973
2974 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2975 $self->{line_prev} = $self->{line};
2976 $self->{column_prev} = $self->{column};
2977 $self->{column}++;
2978 $self->{nc}
2979 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2980 } else {
2981 $self->{set_nc}->($self);
2982 }
2983
2984 redo A;
2985 }
2986 } elsif ($self->{state} == DOCTYPE_STATE) {
2987 if ($is_space->{$self->{nc}}) {
2988
2989 $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
2990
2991 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2992 $self->{line_prev} = $self->{line};
2993 $self->{column_prev} = $self->{column};
2994 $self->{column}++;
2995 $self->{nc}
2996 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2997 } else {
2998 $self->{set_nc}->($self);
2999 }
3000
3001 redo A;
3002 } else {
3003
3004 ## XML5: Unless EOF, switch to the bogus comment state.
3005 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before DOCTYPE name');
3006 $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
3007 ## reconsume
3008 redo A;
3009 }
3010 } elsif ($self->{state} == BEFORE_DOCTYPE_NAME_STATE) {
3011 ## XML5: "DOCTYPE root name before state".
3012
3013 if ($is_space->{$self->{nc}}) {
3014
3015 ## Stay in the state
3016
3017 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3018 $self->{line_prev} = $self->{line};
3019 $self->{column_prev} = $self->{column};
3020 $self->{column}++;
3021 $self->{nc}
3022 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3023 } else {
3024 $self->{set_nc}->($self);
3025 }
3026
3027 redo A;
3028 } elsif ($self->{nc} == 0x003E) { # >
3029
3030 ## XML5: No parse error.
3031 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no DOCTYPE name');
3032 $self->{state} = DATA_STATE;
3033 $self->{s_kwd} = '';
3034
3035 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3036 $self->{line_prev} = $self->{line};
3037 $self->{column_prev} = $self->{column};
3038 $self->{column}++;
3039 $self->{nc}
3040 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3041 } else {
3042 $self->{set_nc}->($self);
3043 }
3044
3045
3046 return ($self->{ct}); # DOCTYPE (quirks)
3047
3048 redo A;
3049 } elsif ($self->{nc} == -1) {
3050
3051 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no DOCTYPE name');
3052 $self->{state} = DATA_STATE;
3053 $self->{s_kwd} = '';
3054 ## reconsume
3055
3056 return ($self->{ct}); # DOCTYPE (quirks)
3057
3058 redo A;
3059 } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
3060
3061 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no DOCTYPE name');
3062 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3063 $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
3064 $self->{in_subset} = 1;
3065
3066 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3067 $self->{line_prev} = $self->{line};
3068 $self->{column_prev} = $self->{column};
3069 $self->{column}++;
3070 $self->{nc}
3071 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3072 } else {
3073 $self->{set_nc}->($self);
3074 }
3075
3076 return ($self->{ct}); # DOCTYPE
3077 redo A;
3078 } else {
3079
3080 $self->{ct}->{name} = chr $self->{nc};
3081 delete $self->{ct}->{quirks};
3082 $self->{state} = DOCTYPE_NAME_STATE;
3083
3084 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3085 $self->{line_prev} = $self->{line};
3086 $self->{column_prev} = $self->{column};
3087 $self->{column}++;
3088 $self->{nc}
3089 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3090 } else {
3091 $self->{set_nc}->($self);
3092 }
3093
3094 redo A;
3095 }
3096 } elsif ($self->{state} == DOCTYPE_NAME_STATE) {
3097 ## XML5: "DOCTYPE root name state".
3098 ## Builds the DOCTYPE name in $self->{ct}->{name}, one character per pass.
3099 ## ISSUE: Redundant "First," in the spec.
3100
3101 if ($is_space->{$self->{nc}}) {
3102 ## Whitespace ends the name.
3103 $self->{state} = AFTER_DOCTYPE_NAME_STATE;
3104 ## Consume the next character: from char_buffer if any is left, else via the set_nc callback.
3105 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3106 $self->{line_prev} = $self->{line};
3107 $self->{column_prev} = $self->{column};
3108 $self->{column}++;
3109 $self->{nc}
3110 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3111 } else {
3112 $self->{set_nc}->($self);
3113 }
3114
3115 redo A;
3116 } elsif ($self->{nc} == 0x003E) { # >
3117 ## ">" ends the DOCTYPE: switch to the data state, consume it, then emit the token.
3118 $self->{state} = DATA_STATE;
3119 $self->{s_kwd} = '';
3120
3121 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3122 $self->{line_prev} = $self->{line};
3123 $self->{column_prev} = $self->{column};
3124 $self->{column}++;
3125 $self->{nc}
3126 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3127 } else {
3128 $self->{set_nc}->($self);
3129 }
3130
3131
3132 return ($self->{ct}); # DOCTYPE
3133 ## Not reached: the return above has already handed the token back.
3134 redo A;
3135 } elsif ($self->{nc} == -1) {
3136 ## nc == -1 marks end of input: report it, flag quirks, and emit what was collected.
3137 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
3138 $self->{state} = DATA_STATE;
3139 $self->{s_kwd} = '';
3140 ## reconsume
3141
3142 $self->{ct}->{quirks} = 1;
3143 return ($self->{ct}); # DOCTYPE
3144
3145 redo A;
3146 } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
3147 ## XML only: "[" opens the internal subset; the DOCTYPE token is emitted right away.
3148 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3149 $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
3150 $self->{in_subset} = 1;
3151
3152 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3153 $self->{line_prev} = $self->{line};
3154 $self->{column_prev} = $self->{column};
3155 $self->{column}++;
3156 $self->{nc}
3157 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3158 } else {
3159 $self->{set_nc}->($self);
3160 }
3161
3162 return ($self->{ct}); # DOCTYPE
3163 redo A;
3164 } else {
3165 ## Any other character is appended to the name.
3166 $self->{ct}->{name}
3167 .= chr ($self->{nc}); # DOCTYPE
3168 ## Stay in the state
3169
3170 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3171 $self->{line_prev} = $self->{line};
3172 $self->{column_prev} = $self->{column};
3173 $self->{column}++;
3174 $self->{nc}
3175 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3176 } else {
3177 $self->{set_nc}->($self);
3178 }
3179
3180 redo A;
3181 }
3182 } elsif ($self->{state} == AFTER_DOCTYPE_NAME_STATE) {
3183 ## XML5: Corresponding to XML5's "DOCTYPE root name after
3184 ## state", but implemented differently.
3185 ## $self->{ct} is either a DOCTYPE token or another markup declaration token; branches test ct->{type}.
3186 if ($is_space->{$self->{nc}}) {
3187 ## Skip whitespace after the name.
3188 ## Stay in the state
3189 ## Consume the next character: from char_buffer if any is left, else via the set_nc callback.
3190 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3191 $self->{line_prev} = $self->{line};
3192 $self->{column_prev} = $self->{column};
3193 $self->{column}++;
3194 $self->{nc}
3195 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3196 } else {
3197 $self->{set_nc}->($self);
3198 }
3199
3200 redo A;
3201 } elsif ($self->{nc} == 0x003E) { # >
3202 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3203 ## A DOCTYPE may end right after its name.
3204 $self->{state} = DATA_STATE;
3205 $self->{s_kwd} = '';
3206 } else {
3207 ## Other declarations ending here are reported ('no md def'); tokenizing resumes in the internal subset.
3208 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no md def'); ## TODO: type
3209 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3210 }
3211
3212
3213 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3214 $self->{line_prev} = $self->{line};
3215 $self->{column_prev} = $self->{column};
3216 $self->{column}++;
3217 $self->{nc}
3218 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3219 } else {
3220 $self->{set_nc}->($self);
3221 }
3222
3223 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
3224 redo A;
3225 } elsif ($self->{nc} == -1) {
3226 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3227 ## End of input inside a DOCTYPE: report it and flag quirks.
3228 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
3229 $self->{state} = DATA_STATE;
3230 $self->{s_kwd} = '';
3231 $self->{ct}->{quirks} = 1;
3232 } else {
3233 ## End of input inside another declaration.
3234 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
3235 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3236 }
3237
3238 ## Reconsume.
3239 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
3240 redo A;
3241 } elsif ($self->{nc} == 0x0050 or # P
3242 $self->{nc} == 0x0070) { # p
3243 ## Possible PUBLIC keyword: kwd records the letters seen so far, starting with this one.
3244 $self->{state} = PUBLIC_STATE;
3245 $self->{kwd} = chr $self->{nc};
3246
3247 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3248 $self->{line_prev} = $self->{line};
3249 $self->{column_prev} = $self->{column};
3250 $self->{column}++;
3251 $self->{nc}
3252 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3253 } else {
3254 $self->{set_nc}->($self);
3255 }
3256
3257 redo A;
3258 } elsif ($self->{nc} == 0x0053 or # S
3259 $self->{nc} == 0x0073) { # s
3260 ## Possible SYSTEM keyword: kwd records the letters seen so far, starting with this one.
3261 $self->{state} = SYSTEM_STATE;
3262 $self->{kwd} = chr $self->{nc};
3263
3264 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3265 $self->{line_prev} = $self->{line};
3266 $self->{column_prev} = $self->{column};
3267 $self->{column}++;
3268 $self->{nc}
3269 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3270 } else {
3271 $self->{set_nc}->($self);
3272 }
3273
3274 redo A;
3275 } elsif ($self->{nc} == 0x0022 and # "
3276 ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN or
3277 $self->{ct}->{type} == PARAMETER_ENTITY_TOKEN)) {
3278 ## Entity declarations only: a double quote starts the entity value, which begins empty.
3279 $self->{state} = DOCTYPE_ENTITY_VALUE_DOUBLE_QUOTED_STATE;
3280 $self->{ct}->{value} = ''; # ENTITY
3281
3282 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3283 $self->{line_prev} = $self->{line};
3284 $self->{column_prev} = $self->{column};
3285 $self->{column}++;
3286 $self->{nc}
3287 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3288 } else {
3289 $self->{set_nc}->($self);
3290 }
3291
3292 redo A;
3293 } elsif ($self->{nc} == 0x0027 and # '
3294 ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN or
3295 $self->{ct}->{type} == PARAMETER_ENTITY_TOKEN)) {
3296 ## Entity declarations only: a single quote starts the entity value, which begins empty.
3297 $self->{state} = DOCTYPE_ENTITY_VALUE_SINGLE_QUOTED_STATE;
3298 $self->{ct}->{value} = ''; # ENTITY
3299
3300 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3301 $self->{line_prev} = $self->{line};
3302 $self->{column_prev} = $self->{column};
3303 $self->{column}++;
3304 $self->{nc}
3305 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3306 } else {
3307 $self->{set_nc}->($self);
3308 }
3309
3310 redo A;
3311 } elsif ($self->{is_xml} and
3312 $self->{ct}->{type} == DOCTYPE_TOKEN and
3313 $self->{nc} == 0x005B) { # [
3314 ## XML DOCTYPE only: "[" opens the internal subset; the DOCTYPE token is emitted right away.
3315 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3316 $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
3317 $self->{in_subset} = 1;
3318
3319 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3320 $self->{line_prev} = $self->{line};
3321 $self->{column_prev} = $self->{column};
3322 $self->{column}++;
3323 $self->{nc}
3324 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3325 } else {
3326 $self->{set_nc}->($self);
3327 }
3328
3329 return ($self->{ct}); # DOCTYPE
3330 redo A;
3331 } else {
3332 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after DOCTYPE name'); ## TODO: type
3333 ## Unexpected character: continue in the bogus state that matches the token type.
3334 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3335
3336 $self->{ct}->{quirks} = 1;
3337 $self->{state} = BOGUS_DOCTYPE_STATE;
3338 } else {
3339
3340 $self->{state} = BOGUS_MD_STATE;
3341 }
3342
3343
3344 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3345 $self->{line_prev} = $self->{line};
3346 $self->{column_prev} = $self->{column};
3347 $self->{column}++;
3348 $self->{nc}
3349 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3350 } else {
3351 $self->{set_nc}->($self);
3352 }
3353
3354 redo A;
3355 }
3356 } elsif ($self->{state} == PUBLIC_STATE) {
3357 ## ASCII case-insensitive match of the rest of "PUBLIC"; $self->{kwd} holds the letters matched so far.
3358 if ((length $self->{kwd}) < 5 and ($self->{nc} == [
3359 undef, # slot 0 is a placeholder: kwd already holds the first letter (see AFTER_DOCTYPE_NAME_STATE)
3360 0x0055, # U
3361 0x0042, # B
3362 0x004C, # L
3363 0x0049, # I
3364 ]->[length $self->{kwd}] or
3365 $self->{nc} == [
3366 undef,
3367 0x0075, # u
3368 0x0062, # b
3369 0x006C, # l
3370 0x0069, # i
3371 ]->[length $self->{kwd}])) {
3372 ## The length guard keeps the lookup inside the tables: past their end it yields undef, which "==" treats as 0 and would accept U+0000.
3373 ## Stay in the state.
3374 $self->{kwd} .= chr $self->{nc};
3375 ## Consume the next character: from char_buffer if any is left, else via the set_nc callback.
3376 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3377 $self->{line_prev} = $self->{line};
3378 $self->{column_prev} = $self->{column};
3379 $self->{column}++;
3380 $self->{nc}
3381 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3382 } else {
3383 $self->{set_nc}->($self);
3384 }
3385
3386 redo A;
3387 } elsif ((length $self->{kwd}) == 5 and
3388 ($self->{nc} == 0x0043 or # C
3389 $self->{nc} == 0x0063)) { # c
3390 if ($self->{is_xml} and
3391 ($self->{kwd} ne 'PUBLI' or $self->{nc} == 0x0063)) { # c
3392 ## In XML anything but the exact uppercase spelling is reported; the keyword is still accepted.
3393 $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
3394 text => 'PUBLIC',
3395 line => $self->{line_prev},
3396 column => $self->{column_prev} - 4);
3397 } else {
3398
3399 }
3400 $self->{state} = BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
3401
3402 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3403 $self->{line_prev} = $self->{line};
3404 $self->{column_prev} = $self->{column};
3405 $self->{column}++;
3406 $self->{nc}
3407 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3408 } else {
3409 $self->{set_nc}->($self);
3410 }
3411
3412 redo A;
3413 } else {
3414 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after DOCTYPE name', ## TODO: type
3415 line => $self->{line_prev},
3416 column => $self->{column_prev} + 1 - length $self->{kwd});
3417 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3418 ## Not the keyword after all: a DOCTYPE becomes bogus and quirky.
3419 $self->{ct}->{quirks} = 1;
3420 $self->{state} = BOGUS_DOCTYPE_STATE;
3421 } else {
3422
3423 $self->{state} = BOGUS_MD_STATE;
3424 }
3425 ## Reconsume.
3426 redo A;
3427 }
3428 } elsif ($self->{state} == SYSTEM_STATE) {
3429 ## ASCII case-insensitive match of the rest of "SYSTEM"; $self->{kwd} holds the letters matched so far.
3430 if ((length $self->{kwd}) < 5 and ($self->{nc} == [
3431 undef, # slot 0 is a placeholder: kwd already holds the first letter (see AFTER_DOCTYPE_NAME_STATE)
3432 0x0059, # Y
3433 0x0053, # S
3434 0x0054, # T
3435 0x0045, # E
3436 ]->[length $self->{kwd}] or
3437 $self->{nc} == [
3438 undef,
3439 0x0079, # y
3440 0x0073, # s
3441 0x0074, # t
3442 0x0065, # e
3443 ]->[length $self->{kwd}])) {
3444 ## The length guard keeps the lookup inside the tables: past their end it yields undef, which "==" treats as 0 and would accept U+0000.
3445 ## Stay in the state.
3446 $self->{kwd} .= chr $self->{nc};
3447 ## Consume the next character: from char_buffer if any is left, else via the set_nc callback.
3448 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3449 $self->{line_prev} = $self->{line};
3450 $self->{column_prev} = $self->{column};
3451 $self->{column}++;
3452 $self->{nc}
3453 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3454 } else {
3455 $self->{set_nc}->($self);
3456 }
3457
3458 redo A;
3459 } elsif ((length $self->{kwd}) == 5 and
3460 ($self->{nc} == 0x004D or # M
3461 $self->{nc} == 0x006D)) { # m
3462 if ($self->{is_xml} and
3463 ($self->{kwd} ne 'SYSTE' or $self->{nc} == 0x006D)) { # m
3464 ## In XML anything but the exact uppercase spelling is reported; the keyword is still accepted.
3465 $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
3466 text => 'SYSTEM',
3467 line => $self->{line_prev},
3468 column => $self->{column_prev} - 4);
3469 } else {
3470
3471 }
3472 $self->{state} = BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
3473
3474 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3475 $self->{line_prev} = $self->{line};
3476 $self->{column_prev} = $self->{column};
3477 $self->{column}++;
3478 $self->{nc}
3479 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3480 } else {
3481 $self->{set_nc}->($self);
3482 }
3483
3484 redo A;
3485 } else {
3486 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after DOCTYPE name', ## TODO: type
3487 line => $self->{line_prev},
3488 column => $self->{column_prev} + 1 - length $self->{kwd});
3489 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3490 ## Not the keyword after all: a DOCTYPE becomes bogus and quirky.
3491 $self->{ct}->{quirks} = 1;
3492 $self->{state} = BOGUS_DOCTYPE_STATE;
3493 } else {
3494
3495 $self->{state} = BOGUS_MD_STATE;
3496 }
3497 ## Reconsume.
3498 redo A;
3499 }
3500 } elsif ($self->{state} == BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE) {
3501 if ($is_space->{$self->{nc}}) {
3502 ## Skip whitespace between the PUBLIC keyword and the quoted identifier.
3503 ## Stay in the state
3504 ## Consume the next character: from char_buffer if any is left, else via the set_nc callback.
3505 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3506 $self->{line_prev} = $self->{line};
3507 $self->{column_prev} = $self->{column};
3508 $self->{column}++;
3509 $self->{nc}
3510 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3511 } else {
3512 $self->{set_nc}->($self);
3513 }
3514
3515 redo A;
3516 } elsif ($self->{nc} == 0x0022) { # "
3517 ## nc is a code point, so compare numerically like every other state does; the identifier starts empty.
3518 $self->{ct}->{pubid} = ''; # DOCTYPE
3519 $self->{state} = DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE;
3520
3521 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3522 $self->{line_prev} = $self->{line};
3523 $self->{column_prev} = $self->{column};
3524 $self->{column}++;
3525 $self->{nc}
3526 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3527 } else {
3528 $self->{set_nc}->($self);
3529 }
3530
3531 redo A;
3532 } elsif ($self->{nc} == 0x0027) { # '
3533
3534 $self->{ct}->{pubid} = ''; # DOCTYPE
3535 $self->{state} = DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE;
3536
3537 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3538 $self->{line_prev} = $self->{line};
3539 $self->{column_prev} = $self->{column};
3540 $self->{column}++;
3541 $self->{nc}
3542 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3543 } else {
3544 $self->{set_nc}->($self);
3545 }
3546
3547 redo A;
3548 } elsif ($self->{nc} == 0x003E) { # >
3549 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no PUBLIC literal');
3550 ## The declaration ended before any identifier: emit it, resuming in the state that fits the token type.
3551 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3552
3553 $self->{state} = DATA_STATE;
3554 $self->{s_kwd} = '';
3555 $self->{ct}->{quirks} = 1;
3556 } else {
3557
3558 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3559 }
3560
3561
3562 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3563 $self->{line_prev} = $self->{line};
3564 $self->{column_prev} = $self->{column};
3565 $self->{column}++;
3566 $self->{nc}
3567 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3568 } else {
3569 $self->{set_nc}->($self);
3570 }
3571
3572 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
3573 redo A;
3574 } elsif ($self->{nc} == -1) {
3575 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3576 ## End of input inside a DOCTYPE: report it and flag quirks.
3577 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
3578 $self->{state} = DATA_STATE;
3579 $self->{s_kwd} = '';
3580 $self->{ct}->{quirks} = 1;
3581 } else {
3582 ## End of input inside another declaration.
3583 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
3584 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3585 }
3586
3587 ## reconsume
3588 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
3589 redo A;
3590 } elsif ($self->{is_xml} and
3591 $self->{ct}->{type} == DOCTYPE_TOKEN and
3592 $self->{nc} == 0x005B) { # [
3593 ## XML DOCTYPE only: "[" opens the internal subset although the identifier is missing.
3594 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no PUBLIC literal');
3595 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3596 $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
3597 $self->{in_subset} = 1;
3598
3599 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3600 $self->{line_prev} = $self->{line};
3601 $self->{column_prev} = $self->{column};
3602 $self->{column}++;
3603 $self->{nc}
3604 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3605 } else {
3606 $self->{set_nc}->($self);
3607 }
3608
3609 return ($self->{ct}); # DOCTYPE
3610 redo A;
3611 } else {
3612 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after PUBLIC');
3613 ## Unexpected character: continue in the bogus state that matches the token type.
3614 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3615
3616 $self->{ct}->{quirks} = 1;
3617 $self->{state} = BOGUS_DOCTYPE_STATE;
3618 } else {
3619
3620 $self->{state} = BOGUS_MD_STATE;
3621 }
3622
3623
3624 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3625 $self->{line_prev} = $self->{line};
3626 $self->{column_prev} = $self->{column};
3627 $self->{column}++;
3628 $self->{nc}
3629 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3630 } else {
3631 $self->{set_nc}->($self);
3632 }
3633
3634 redo A;
3635 }
3636 } elsif ($self->{state} == DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE) {
3637 if ($self->{nc} == 0x0022) { # "
3638 ## The closing quote ends the public identifier.
3639 $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
3640 ## Consume the next character: from char_buffer if any is left, else via the set_nc callback.
3641 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3642 $self->{line_prev} = $self->{line};
3643 $self->{column_prev} = $self->{column};
3644 $self->{column}++;
3645 $self->{nc}
3646 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3647 } else {
3648 $self->{set_nc}->($self);
3649 }
3650
3651 redo A;
3652 } elsif ($self->{nc} == 0x003E) { # >
3653 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');
3654 ## ">" inside the literal ends the declaration early; the token is emitted as it stands.
3655 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3656
3657 $self->{state} = DATA_STATE;
3658 $self->{s_kwd} = '';
3659 $self->{ct}->{quirks} = 1;
3660 } else {
3661
3662 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3663 }
3664
3665
3666 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3667 $self->{line_prev} = $self->{line};
3668 $self->{column_prev} = $self->{column};
3669 $self->{column}++;
3670 $self->{nc}
3671 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3672 } else {
3673 $self->{set_nc}->($self);
3674 }
3675
3676 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
3677 redo A;
3678 } elsif ($self->{nc} == -1) {
3679 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');
3680 ## End of input inside the literal.
3681 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3682
3683 $self->{state} = DATA_STATE;
3684 $self->{s_kwd} = '';
3685 $self->{ct}->{quirks} = 1;
3686 } else {
3687
3688 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3689 }
3690
3691 ## Reconsume.
3692 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
3693 redo A;
3694 } else {
3695 ## Any other character belongs to the identifier.
3696 $self->{ct}->{pubid} .= chr $self->{nc}; # DOCTYPE/ENTITY/NOTATION
3697 $self->{read_until}->($self->{ct}->{pubid}, q[">],
3698 length $self->{ct}->{pubid});
3699 ## NOTE(review): read_until is defined elsewhere; presumably it appends the run of characters up to a '"' or '>' in one go - confirm.
3700 ## Stay in the state
3701
3702 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3703 $self->{line_prev} = $self->{line};
3704 $self->{column_prev} = $self->{column};
3705 $self->{column}++;
3706 $self->{nc}
3707 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3708 } else {
3709 $self->{set_nc}->($self);
3710 }
3711
3712 redo A;
3713 }
3714 } elsif ($self->{state} == DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE) {
3715 if ($self->{nc} == 0x0027) { # '
3716 ## The closing quote ends the public identifier.
3717 $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
3718 ## Consume the next character: from char_buffer if any is left, else via the set_nc callback.
3719 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3720 $self->{line_prev} = $self->{line};
3721 $self->{column_prev} = $self->{column};
3722 $self->{column}++;
3723 $self->{nc}
3724 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3725 } else {
3726 $self->{set_nc}->($self);
3727 }
3728
3729 redo A;
3730 } elsif ($self->{nc} == 0x003E) { # >
3731 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');
3732 ## ">" inside the literal ends the declaration early; the token is emitted as it stands.
3733 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3734
3735 $self->{state} = DATA_STATE;
3736 $self->{s_kwd} = '';
3737 $self->{ct}->{quirks} = 1;
3738 } else {
3739
3740 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3741 }
3742
3743
3744 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3745 $self->{line_prev} = $self->{line};
3746 $self->{column_prev} = $self->{column};
3747 $self->{column}++;
3748 $self->{nc}
3749 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3750 } else {
3751 $self->{set_nc}->($self);
3752 }
3753
3754 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
3755 redo A;
3756 } elsif ($self->{nc} == -1) {
3757 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');
3758 ## End of input inside the literal.
3759 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3760
3761 $self->{state} = DATA_STATE;
3762 $self->{s_kwd} = '';
3763 $self->{ct}->{quirks} = 1;
3764 } else {
3765
3766 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3767 }
3768
3769 ## reconsume
3770 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
3771 redo A;
3772 } else {
3773 ## Any other character belongs to the identifier.
3774 $self->{ct}->{pubid} .= chr $self->{nc}; # DOCTYPE/ENTITY/NOTATION
3775 $self->{read_until}->($self->{ct}->{pubid}, q['>],
3776 length $self->{ct}->{pubid});
3777 ## NOTE(review): read_until is defined elsewhere; presumably it appends the run of characters up to a "'" or ">" in one go - confirm.
3778 ## Stay in the state
3779
3780 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3781 $self->{line_prev} = $self->{line};
3782 $self->{column_prev} = $self->{column};
3783 $self->{column}++;
3784 $self->{nc}
3785 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3786 } else {
3787 $self->{set_nc}->($self);
3788 }
3789
3790 redo A;
3791 }
3792 } elsif ($self->{state} == AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE) {
3793 if ($is_space->{$self->{nc}}) {
3794 ## Skip whitespace after the public identifier.
3795 ## Stay in the state
3796 ## Consume the next character: from char_buffer if any is left, else via the set_nc callback.
3797 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3798 $self->{line_prev} = $self->{line};
3799 $self->{column_prev} = $self->{column};
3800 $self->{column}++;
3801 $self->{nc}
3802 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3803 } else {
3804 $self->{set_nc}->($self);
3805 }
3806
3807 redo A;
3808 } elsif ($self->{nc} == 0x0022) { # "
3809 ## A double quote starts the system identifier, which begins empty.
3810 $self->{ct}->{sysid} = ''; # DOCTYPE/ENTITY/NOTATION
3811 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
3812
3813 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3814 $self->{line_prev} = $self->{line};
3815 $self->{column_prev} = $self->{column};
3816 $self->{column}++;
3817 $self->{nc}
3818 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3819 } else {
3820 $self->{set_nc}->($self);
3821 }
3822
3823 redo A;
3824 } elsif ($self->{nc} == 0x0027) { # '
3825 ## A single quote starts the system identifier, which begins empty.
3826 $self->{ct}->{sysid} = ''; # DOCTYPE/ENTITY/NOTATION
3827 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
3828
3829 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3830 $self->{line_prev} = $self->{line};
3831 $self->{column_prev} = $self->{column};
3832 $self->{column}++;
3833 $self->{nc}
3834 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3835 } else {
3836 $self->{set_nc}->($self);
3837 }
3838
3839 redo A;
3840 } elsif ($self->{nc} == 0x003E) { # >
3841 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3842 if ($self->{is_xml}) {
3843 ## Only XML reports a DOCTYPE that has a public identifier but no system identifier.
3844 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal');
3845 } else {
3846
3847 }
3848 $self->{state} = DATA_STATE;
3849 $self->{s_kwd} = '';
3850 } else {
3851 if ($self->{ct}->{type} == NOTATION_TOKEN) {
3852 ## A NOTATION token may end here without a system identifier; other declarations are reported.
3853 } else {
3854
3855 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal');
3856 }
3857 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3858 }
3859
3860
3861 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3862 $self->{line_prev} = $self->{line};
3863 $self->{column_prev} = $self->{column};
3864 $self->{column}++;
3865 $self->{nc}
3866 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3867 } else {
3868 $self->{set_nc}->($self);
3869 }
3870
3871 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
3872 redo A;
3873 } elsif ($self->{nc} == -1) {
3874 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3875 ## End of input inside a DOCTYPE: report it and flag quirks.
3876 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
3877
3878 $self->{state} = DATA_STATE;
3879 $self->{s_kwd} = '';
3880 $self->{ct}->{quirks} = 1;
3881 } else {
3882 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
3883 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3884 }
3885
3886 ## reconsume
3887 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
3888 redo A;
3889 } elsif ($self->{is_xml} and
3890 $self->{ct}->{type} == DOCTYPE_TOKEN and
3891 $self->{nc} == 0x005B) { # [
3892 ## XML DOCTYPE only: "[" opens the internal subset although the system identifier is missing.
3893 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal');
3894 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3895 $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
3896 $self->{in_subset} = 1;
3897
3898 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3899 $self->{line_prev} = $self->{line};
3900 $self->{column_prev} = $self->{column};
3901 $self->{column}++;
3902 $self->{nc}
3903 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3904 } else {
3905 $self->{set_nc}->($self);
3906 }
3907
3908 return ($self->{ct}); # DOCTYPE
3909 redo A;
3910 } else {
3911 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after PUBLIC literal');
3912 ## Unexpected character: continue in the bogus state that matches the token type.
3913 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3914
3915 $self->{ct}->{quirks} = 1;
3916 $self->{state} = BOGUS_DOCTYPE_STATE;
3917 } else {
3918
3919 $self->{state} = BOGUS_MD_STATE;
3920 }
3921
3922
3923 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3924 $self->{line_prev} = $self->{line};
3925 $self->{column_prev} = $self->{column};
3926 $self->{column}++;
3927 $self->{nc}
3928 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3929 } else {
3930 $self->{set_nc}->($self);
3931 }
3932
3933 redo A;
3934 }
3935 } elsif ($self->{state} == BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
3936 if ($is_space->{$self->{nc}}) {
3937 ## Skip whitespace between the SYSTEM keyword and the quoted identifier.
3938 ## Stay in the state
3939 ## Consume the next character: from char_buffer if any is left, else via the set_nc callback.
3940 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3941 $self->{line_prev} = $self->{line};
3942 $self->{column_prev} = $self->{column};
3943 $self->{column}++;
3944 $self->{nc}
3945 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3946 } else {
3947 $self->{set_nc}->($self);
3948 }
3949
3950 redo A;
3951 } elsif ($self->{nc} == 0x0022) { # "
3952 ## A double quote starts the system identifier, which begins empty.
3953 $self->{ct}->{sysid} = ''; # DOCTYPE
3954 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
3955
3956 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3957 $self->{line_prev} = $self->{line};
3958 $self->{column_prev} = $self->{column};
3959 $self->{column}++;
3960 $self->{nc}
3961 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3962 } else {
3963 $self->{set_nc}->($self);
3964 }
3965
3966 redo A;
3967 } elsif ($self->{nc} == 0x0027) { # '
3968 ## A single quote starts the system identifier, which begins empty.
3969 $self->{ct}->{sysid} = ''; # DOCTYPE
3970 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
3971
3972 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3973 $self->{line_prev} = $self->{line};
3974 $self->{column_prev} = $self->{column};
3975 $self->{column}++;
3976 $self->{nc}
3977 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3978 } else {
3979 $self->{set_nc}->($self);
3980 }
3981
3982 redo A;
3983 } elsif ($self->{nc} == 0x003E) { # >
3984 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal');
3985 ## Here the ">" is consumed before the next state is chosen; sibling states set the state first.
3986 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3987 $self->{line_prev} = $self->{line};
3988 $self->{column_prev} = $self->{column};
3989 $self->{column}++;
3990 $self->{nc}
3991 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3992 } else {
3993 $self->{set_nc}->($self);
3994 }
3995
3996
3997 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3998
3999 $self->{state} = DATA_STATE;
4000 $self->{s_kwd} = '';
4001 $self->{ct}->{quirks} = 1;
4002 } else {
4003
4004 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4005 }
4006
4007 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
4008 redo A;
4009 } elsif ($self->{nc} == -1) {
4010 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4011 ## End of input inside a DOCTYPE: report it and flag quirks.
4012 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
4013 $self->{state} = DATA_STATE;
4014 $self->{s_kwd} = '';
4015 $self->{ct}->{quirks} = 1;
4016 } else {
4017 ## End of input inside another declaration.
4018 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
4019 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4020 }
4021
4022 ## reconsume
4023 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
4024 redo A;
4025 } elsif ($self->{is_xml} and
4026 $self->{ct}->{type} == DOCTYPE_TOKEN and
4027 $self->{nc} == 0x005B) { # [
4028 ## XML DOCTYPE only: "[" opens the internal subset although the system identifier is missing.
4029 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal');
4030
4031 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4032 $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
4033 $self->{in_subset} = 1;
4034
4035 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4036 $self->{line_prev} = $self->{line};
4037 $self->{column_prev} = $self->{column};
4038 $self->{column}++;
4039 $self->{nc}
4040 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4041 } else {
4042 $self->{set_nc}->($self);
4043 }
4044
4045 return ($self->{ct}); # DOCTYPE
4046 redo A;
4047 } else {
4048 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after SYSTEM');
4049 ## Unexpected character: continue in the bogus state that matches the token type.
4050 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4051
4052 $self->{ct}->{quirks} = 1;
4053 $self->{state} = BOGUS_DOCTYPE_STATE;
4054 } else {
4055
4056 $self->{state} = BOGUS_MD_STATE;
4057 }
4058
4059
4060 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4061 $self->{line_prev} = $self->{line};
4062 $self->{column_prev} = $self->{column};
4063 $self->{column}++;
4064 $self->{nc}
4065 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4066 } else {
4067 $self->{set_nc}->($self);
4068 }
4069
4070 redo A;
4071 }
4072 } elsif ($self->{state} == DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE) {
4073 if ($self->{nc} == 0x0022) { # "
4074 ## The closing quote ends the system identifier.
4075 $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
4076 ## Consume the next character: from char_buffer if any is left, else via the set_nc callback.
4077 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4078 $self->{line_prev} = $self->{line};
4079 $self->{column_prev} = $self->{column};
4080 $self->{column}++;
4081 $self->{nc}
4082 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4083 } else {
4084 $self->{set_nc}->($self);
4085 }
4086
4087 redo A;
4088 } elsif (not $self->{is_xml} and $self->{nc} == 0x003E) { # >
4089 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');
4090 ## Outside XML only, ">" ends the declaration early; in XML it falls through and is kept in the literal.
4091 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4092
4093 $self->{state} = DATA_STATE;
4094 $self->{s_kwd} = '';
4095 $self->{ct}->{quirks} = 1;
4096 } else {
4097
4098 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4099 }
4100
4101
4102 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4103 $self->{line_prev} = $self->{line};
4104 $self->{column_prev} = $self->{column};
4105 $self->{column}++;
4106 $self->{nc}
4107 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4108 } else {
4109 $self->{set_nc}->($self);
4110 }
4111
4112 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
4113 redo A;
4114 } elsif ($self->{nc} == -1) {
4115 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');
4116 ## End of input inside the literal.
4117 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4118
4119 $self->{state} = DATA_STATE;
4120 $self->{s_kwd} = '';
4121 $self->{ct}->{quirks} = 1;
4122 } else {
4123
4124 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4125 }
4126
4127 ## reconsume
4128 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
4129 redo A;
4130 } else {
4131 ## Any other character belongs to the identifier.
4132 $self->{ct}->{sysid} .= chr $self->{nc}; # DOCTYPE/ENTITY/NOTATION
4133 $self->{read_until}->($self->{ct}->{sysid}, q[">],
4134 length $self->{ct}->{sysid});
4135 ## NOTE(review): read_until is defined elsewhere; presumably it appends the run of characters up to a '"' or '>' in one go - confirm.
4136 ## Stay in the state
4137
4138 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4139 $self->{line_prev} = $self->{line};
4140 $self->{column_prev} = $self->{column};
4141 $self->{column}++;
4142 $self->{nc}
4143 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4144 } else {
4145 $self->{set_nc}->($self);
4146 }
4147
4148 redo A;
4149 }
4150 } elsif ($self->{state} == DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE) {
4151 if ($self->{nc} == 0x0027) { # '
4152 ## The closing quote ends the system identifier.
4153 $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
4154 ## Consume the next character: from char_buffer if any is left, else via the set_nc callback.
4155 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4156 $self->{line_prev} = $self->{line};
4157 $self->{column_prev} = $self->{column};
4158 $self->{column}++;
4159 $self->{nc}
4160 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4161 } else {
4162 $self->{set_nc}->($self);
4163 }
4164
4165 redo A;
4166 } elsif (not $self->{is_xml} and $self->{nc} == 0x003E) { # >
4167 ## Outside XML only, ">" ends the declaration early; in XML it falls through and is kept in the literal.
4168 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');
4169 ## NOTE(review): unlike the double-quoted state, the token type is not tested here; presumably only DOCTYPE tokens occur outside XML - confirm.
4170 $self->{state} = DATA_STATE;
4171 $self->{s_kwd} = '';
4172
4173 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4174 $self->{line_prev} = $self->{line};
4175 $self->{column_prev} = $self->{column};
4176 $self->{column}++;
4177 $self->{nc}
4178 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4179 } else {
4180 $self->{set_nc}->($self);
4181 }
4182
4183
4184 $self->{ct}->{quirks} = 1;
4185 return ($self->{ct}); # DOCTYPE
4186
4187 redo A;
4188 } elsif ($self->{nc} == -1) {
4189 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');
4190 ## End of input inside the literal.
4191 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4192
4193 $self->{state} = DATA_STATE;
4194 $self->{s_kwd} = '';
4195 $self->{ct}->{quirks} = 1;
4196 } else {
4197
4198 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4199 }
4200
4201 ## reconsume
4202 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
4203 redo A;
4204 } else {
4205 ## Any other character belongs to the identifier.
4206 $self->{ct}->{sysid} .= chr $self->{nc}; # DOCTYPE/ENTITY/NOTATION
4207 $self->{read_until}->($self->{ct}->{sysid}, q['>],
4208 length $self->{ct}->{sysid});
4209 ## NOTE(review): read_until is defined elsewhere; presumably it appends the run of characters up to a "'" or ">" in one go - confirm.
4210 ## Stay in the state
4211
4212 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4213 $self->{line_prev} = $self->{line};
4214 $self->{column_prev} = $self->{column};
4215 $self->{column}++;
4216 $self->{nc}
4217 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4218 } else {
4219 $self->{set_nc}->($self);
4220 }
4221
4222 redo A;
4223 }
4224 } elsif ($self->{state} == AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
4225 if ($is_space->{$self->{nc}}) {
4226 if ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN) {
4227
4228 $self->{state} = BEFORE_NDATA_STATE;
4229 } else {
4230
4231 ## Stay in the state
4232 }
4233
4234 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4235 $self->{line_prev} = $self->{line};
4236 $self->{column_prev} = $self->{column};
4237 $self->{column}++;
4238 $self->{nc}
4239 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4240 } else {
4241 $self->{set_nc}->($self);
4242 }
4243
4244 redo A;
4245 } elsif ($self->{nc} == 0x003E) { # >
4246 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4247
4248 $self->{state} = DATA_STATE;
4249 $self->{s_kwd} = '';
4250 } else {
4251
4252 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4253 }
4254
4255
4256 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4257 $self->{line_prev} = $self->{line};
4258 $self->{column_prev} = $self->{column};
4259 $self->{column}++;
4260 $self->{nc}
4261 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4262 } else {
4263 $self->{set_nc}->($self);
4264 }
4265
4266 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
4267 redo A;
4268 } elsif ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN and
4269 ($self->{nc} == 0x004E or # N
4270 $self->{nc} == 0x006E)) { # n
4271
4272 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before NDATA'); ## TODO: type
4273 $self->{state} = NDATA_STATE;
4274 $self->{kwd} = chr $self->{nc};
4275
4276 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4277 $self->{line_prev} = $self->{line};
4278 $self->{column_prev} = $self->{column};
4279 $self->{column}++;
4280 $self->{nc}
4281 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4282 } else {
4283 $self->{set_nc}->($self);
4284 }
4285
4286 redo A;
4287 } elsif ($self->{nc} == -1) {
4288 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4289
4290 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
4291 $self->{state} = DATA_STATE;
4292 $self->{s_kwd} = '';
4293 $self->{ct}->{quirks} = 1;
4294 } else {
4295
4296 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
4297 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4298 }
4299
4300 ## reconsume
4301 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
4302 redo A;
4303 } elsif ($self->{is_xml} and
4304 $self->{ct}->{type} == DOCTYPE_TOKEN and
4305 $self->{nc} == 0x005B) { # [
4306
4307 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4308 $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
4309 $self->{in_subset} = 1;
4310
4311 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4312 $self->{line_prev} = $self->{line};
4313 $self->{column_prev} = $self->{column};
4314 $self->{column}++;
4315 $self->{nc}
4316 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4317 } else {
4318 $self->{set_nc}->($self);
4319 }
4320
4321 return ($self->{ct}); # DOCTYPE
4322 redo A;
4323 } else {
4324 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after SYSTEM literal');
4325
4326 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4327
4328 #$self->{ct}->{quirks} = 1;
4329 $self->{state} = BOGUS_DOCTYPE_STATE;
4330 } else {
4331
4332 $self->{state} = BOGUS_MD_STATE;
4333 }
4334
4335
4336 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4337 $self->{line_prev} = $self->{line};
4338 $self->{column_prev} = $self->{column};
4339 $self->{column}++;
4340 $self->{nc}
4341 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4342 } else {
4343 $self->{set_nc}->($self);
4344 }
4345
4346 redo A;
4347 }
4348 } elsif ($self->{state} == BEFORE_NDATA_STATE) {
4349 if ($is_space->{$self->{nc}}) {
4350
4351 ## Stay in the state.
4352
4353 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4354 $self->{line_prev} = $self->{line};
4355 $self->{column_prev} = $self->{column};
4356 $self->{column}++;
4357 $self->{nc}
4358 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4359 } else {
4360 $self->{set_nc}->($self);
4361 }
4362
4363 redo A;
4364 } elsif ($self->{nc} == 0x003E) { # >
4365
4366 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4367
4368 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4369 $self->{line_prev} = $self->{line};
4370 $self->{column_prev} = $self->{column};
4371 $self->{column}++;
4372 $self->{nc}
4373 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4374 } else {
4375 $self->{set_nc}->($self);
4376 }
4377
4378 return ($self->{ct}); # ENTITY
4379 redo A;
4380 } elsif ($self->{nc} == 0x004E or # N
4381 $self->{nc} == 0x006E) { # n
4382
4383 $self->{state} = NDATA_STATE;
4384 $self->{kwd} = chr $self->{nc};
4385
4386 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4387 $self->{line_prev} = $self->{line};
4388 $self->{column_prev} = $self->{column};
4389 $self->{column}++;
4390 $self->{nc}
4391 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4392 } else {
4393 $self->{set_nc}->($self);
4394 }
4395
4396 redo A;
4397 } elsif ($self->{nc} == -1) {
4398
4399 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
4400 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4401 ## reconsume
4402 return ($self->{ct}); # ENTITY
4403 redo A;
4404 } else {
4405
4406 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after SYSTEM literal');
4407 $self->{state} = BOGUS_MD_STATE;
4408
4409 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4410 $self->{line_prev} = $self->{line};
4411 $self->{column_prev} = $self->{column};
4412 $self->{column}++;
4413 $self->{nc}
4414 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4415 } else {
4416 $self->{set_nc}->($self);
4417 }
4418
4419 redo A;
4420 }
4421 } elsif ($self->{state} == BOGUS_DOCTYPE_STATE) {
4422 if ($self->{nc} == 0x003E) { # >
4423
4424 $self->{state} = DATA_STATE;
4425 $self->{s_kwd} = '';
4426
4427 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4428 $self->{line_prev} = $self->{line};
4429 $self->{column_prev} = $self->{column};
4430 $self->{column}++;
4431 $self->{nc}
4432 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4433 } else {
4434 $self->{set_nc}->($self);
4435 }
4436
4437
4438 return ($self->{ct}); # DOCTYPE
4439
4440 redo A;
4441 } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
4442
4443 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4444 $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
4445 $self->{in_subset} = 1;
4446
4447 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4448 $self->{line_prev} = $self->{line};
4449 $self->{column_prev} = $self->{column};
4450 $self->{column}++;
4451 $self->{nc}
4452 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4453 } else {
4454 $self->{set_nc}->($self);
4455 }
4456
4457 return ($self->{ct}); # DOCTYPE
4458 redo A;
4459 } elsif ($self->{nc} == -1) {
4460
4461 $self->{state} = DATA_STATE;
4462 $self->{s_kwd} = '';
4463 ## reconsume
4464
4465 return ($self->{ct}); # DOCTYPE
4466
4467 redo A;
4468 } else {
4469
4470 my $s = '';
4471 $self->{read_until}->($s, q{>[}, 0);
4472
4473 ## Stay in the state
4474
4475 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4476 $self->{line_prev} = $self->{line};
4477 $self->{column_prev} = $self->{column};
4478 $self->{column}++;
4479 $self->{nc}
4480 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4481 } else {
4482 $self->{set_nc}->($self);
4483 }
4484
4485 redo A;
4486 }
4487 } elsif ($self->{state} == CDATA_SECTION_STATE) {
4488 ## NOTE: The "CDATA section state" in the spec is jointly implemented
4489 ## by three states, |CDATA_SECTION_STATE|, |CDATA_SECTION_MSE1_STATE|,
4490 ## and |CDATA_SECTION_MSE2_STATE|.
4491
4492 ## XML5: "CDATA state".
4493
4494 if ($self->{nc} == 0x005D) { # ]
4495
4496 $self->{state} = CDATA_SECTION_MSE1_STATE;
4497
4498 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4499 $self->{line_prev} = $self->{line};
4500 $self->{column_prev} = $self->{column};
4501 $self->{column}++;
4502 $self->{nc}
4503 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4504 } else {
4505 $self->{set_nc}->($self);
4506 }
4507
4508 redo A;
4509 } elsif ($self->{nc} == -1) {
4510 if ($self->{is_xml}) {
4511
4512 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no mse'); ## TODO: type
4513 } else {
4514
4515 }
4516
4517 $self->{state} = DATA_STATE;
4518 $self->{s_kwd} = '';
4519 ## Reconsume.
4520 if (length $self->{ct}->{data}) { # character
4521
4522 return ($self->{ct}); # character
4523 } else {
4524
4525 ## No token to emit. $self->{ct} is discarded.
4526 }
4527 redo A;
4528 } else {
4529
4530 $self->{ct}->{data} .= chr $self->{nc};
4531 $self->{read_until}->($self->{ct}->{data},
4532 q<]>,
4533 length $self->{ct}->{data});
4534
4535 ## Stay in the state.
4536
4537 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4538 $self->{line_prev} = $self->{line};
4539 $self->{column_prev} = $self->{column};
4540 $self->{column}++;
4541 $self->{nc}
4542 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4543 } else {
4544 $self->{set_nc}->($self);
4545 }
4546
4547 redo A;
4548 }
4549
4550 ## ISSUE: "text tokens" in spec.
4551 } elsif ($self->{state} == CDATA_SECTION_MSE1_STATE) {
4552 ## XML5: "CDATA bracket state".
4553
4554 if ($self->{nc} == 0x005D) { # ]
4555
4556 $self->{state} = CDATA_SECTION_MSE2_STATE;
4557
4558 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4559 $self->{line_prev} = $self->{line};
4560 $self->{column_prev} = $self->{column};
4561 $self->{column}++;
4562 $self->{nc}
4563 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4564 } else {
4565 $self->{set_nc}->($self);
4566 }
4567
4568 redo A;
4569 } else {
4570
4571 ## XML5: If EOF, "]" is not appended and the state is changed to the data state.
4572 $self->{ct}->{data} .= ']';
4573 $self->{state} = CDATA_SECTION_STATE; ## XML5: Stay in the state.
4574 ## Reconsume.
4575 redo A;
4576 }
4577 } elsif ($self->{state} == CDATA_SECTION_MSE2_STATE) {
4578 ## XML5: "CDATA end state".
4579
4580 if ($self->{nc} == 0x003E) { # >
4581 $self->{state} = DATA_STATE;
4582 $self->{s_kwd} = '';
4583
4584 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4585 $self->{line_prev} = $self->{line};
4586 $self->{column_prev} = $self->{column};
4587 $self->{column}++;
4588 $self->{nc}
4589 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4590 } else {
4591 $self->{set_nc}->($self);
4592 }
4593
4594 if (length $self->{ct}->{data}) { # character
4595
4596 return ($self->{ct}); # character
4597 } else {
4598
4599 ## No token to emit. $self->{ct} is discarded.
4600 }
4601 redo A;
4602 } elsif ($self->{nc} == 0x005D) { # ]
4603 # character
4604 $self->{ct}->{data} .= ']'; ## Add first "]" of "]]]".
4605 ## Stay in the state.
4606
4607 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4608 $self->{line_prev} = $self->{line};
4609 $self->{column_prev} = $self->{column};
4610 $self->{column}++;
4611 $self->{nc}
4612 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4613 } else {
4614 $self->{set_nc}->($self);
4615 }
4616
4617 redo A;
4618 } else {
4619
4620 $self->{ct}->{data} .= ']]'; # character
4621 $self->{state} = CDATA_SECTION_STATE;
4622 ## Reconsume. ## XML5: Emit.
4623 redo A;
4624 }
4625 } elsif ($self->{state} == ENTITY_STATE) {
4626 if ($is_space->{$self->{nc}} or
4627 {
4628 0x003C => 1, 0x0026 => 1, -1 => 1, # <, &
4629 $self->{entity_add} => 1,
4630 }->{$self->{nc}}) {
4631
4632 ## Don't consume
4633 ## No error
4634 ## Return nothing.
4635 #
4636 } elsif ($self->{nc} == 0x0023) { # #
4637
4638 $self->{state} = ENTITY_HASH_STATE;
4639 $self->{kwd} = '#';
4640
4641 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4642 $self->{line_prev} = $self->{line};
4643 $self->{column_prev} = $self->{column};
4644 $self->{column}++;
4645 $self->{nc}
4646 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4647 } else {
4648 $self->{set_nc}->($self);
4649 }
4650
4651 redo A;
4652 } elsif ((0x0041 <= $self->{nc} and
4653 $self->{nc} <= 0x005A) or # A..Z
4654 (0x0061 <= $self->{nc} and
4655 $self->{nc} <= 0x007A)) { # a..z
4656
4657 require Whatpm::_NamedEntityList;
4658 $self->{state} = ENTITY_NAME_STATE;
4659 $self->{kwd} = chr $self->{nc};
4660 $self->{entity__value} = $self->{kwd};
4661 $self->{entity__match} = 0;
4662
4663 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4664 $self->{line_prev} = $self->{line};
4665 $self->{column_prev} = $self->{column};
4666 $self->{column}++;
4667 $self->{nc}
4668 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4669 } else {
4670 $self->{set_nc}->($self);
4671 }
4672
4673 redo A;
4674 } else {
4675
4676 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare ero');
4677 ## Return nothing.
4678 #
4679 }
4680
4681 ## NOTE: No character is consumed by the "consume a character
4682 ## reference" algorithm. In other words, there is an "&" character
4683 ## that does not introduce a character reference, which would be
4684 ## appended to the parent element or the attribute value in the
4685 ## later processing of the tokenizer.
4686
4687 if ($self->{prev_state} == DATA_STATE) {
4688
4689 $self->{state} = $self->{prev_state};
4690 $self->{s_kwd} = '';
4691 ## Reconsume.
4692 return ({type => CHARACTER_TOKEN, data => '&',
4693 line => $self->{line_prev},
4694 column => $self->{column_prev},
4695 });
4696 redo A;
4697 } else {
4698
4699 $self->{ca}->{value} .= '&';
4700 $self->{state} = $self->{prev_state};
4701 $self->{s_kwd} = '';
4702 ## Reconsume.
4703 redo A;
4704 }
4705 } elsif ($self->{state} == ENTITY_HASH_STATE) {
4706 if ($self->{nc} == 0x0078) { # x
4707
4708 $self->{state} = HEXREF_X_STATE;
4709 $self->{kwd} .= chr $self->{nc};
4710
4711 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4712 $self->{line_prev} = $self->{line};
4713 $self->{column_prev} = $self->{column};
4714 $self->{column}++;
4715 $self->{nc}
4716 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4717 } else {
4718 $self->{set_nc}->($self);
4719 }
4720
4721 redo A;
4722 } elsif ($self->{nc} == 0x0058) { # X
4723
4724 if ($self->{is_xml}) {
4725 $self->{parse_error}->(level => $self->{level}->{must}, type => 'uppercase hcro'); ## TODO: type
4726 }
4727 $self->{state} = HEXREF_X_STATE;
4728 $self->{kwd} .= chr $self->{nc};
4729
4730 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4731 $self->{line_prev} = $self->{line};
4732 $self->{column_prev} = $self->{column};
4733 $self->{column}++;
4734 $self->{nc}
4735 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4736 } else {
4737 $self->{set_nc}->($self);
4738 }
4739
4740 redo A;
4741 } elsif (0x0030 <= $self->{nc} and
4742 $self->{nc} <= 0x0039) { # 0..9
4743
4744 $self->{state} = NCR_NUM_STATE;
4745 $self->{kwd} = $self->{nc} - 0x0030;
4746
4747 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4748 $self->{line_prev} = $self->{line};
4749 $self->{column_prev} = $self->{column};
4750 $self->{column}++;
4751 $self->{nc}
4752 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4753 } else {
4754 $self->{set_nc}->($self);
4755 }
4756
4757 redo A;
4758 } else {
4759 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare nero',
4760 line => $self->{line_prev},
4761 column => $self->{column_prev} - 1);
4762
4763 ## NOTE: According to the spec algorithm, nothing is returned,
4764 ## and then "&#" is appended to the parent element or the attribute
4765 ## value in the later processing.
4766
4767 if ($self->{prev_state} == DATA_STATE) {
4768
4769 $self->{state} = $self->{prev_state};
4770 $self->{s_kwd} = '';
4771 ## Reconsume.
4772 return ({type => CHARACTER_TOKEN,
4773 data => '&#',
4774 line => $self->{line_prev},
4775 column => $self->{column_prev} - 1,
4776 });
4777 redo A;
4778 } else {
4779
4780 $self->{ca}->{value} .= '&#';
4781 $self->{state} = $self->{prev_state};
4782 $self->{s_kwd} = '';
4783 ## Reconsume.
4784 redo A;
4785 }
4786 }
4787 } elsif ($self->{state} == NCR_NUM_STATE) {
4788 if (0x0030 <= $self->{nc} and
4789 $self->{nc} <= 0x0039) { # 0..9
4790
4791 $self->{kwd} *= 10;
4792 $self->{kwd} += $self->{nc} - 0x0030;
4793
4794 ## Stay in the state.
4795
4796 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4797 $self->{line_prev} = $self->{line};
4798 $self->{column_prev} = $self->{column};
4799 $self->{column}++;
4800 $self->{nc}
4801 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4802 } else {
4803 $self->{set_nc}->($self);
4804 }
4805
4806 redo A;
4807 } elsif ($self->{nc} == 0x003B) { # ;
4808
4809
4810 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4811 $self->{line_prev} = $self->{line};
4812 $self->{column_prev} = $self->{column};
4813 $self->{column}++;
4814 $self->{nc}
4815 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4816 } else {
4817 $self->{set_nc}->($self);
4818 }
4819
4820 #
4821 } else {
4822
4823 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no refc');
4824 ## Reconsume.
4825 #
4826 }
4827
4828 my $code = $self->{kwd};
4829 my $l = $self->{line_prev};
4830 my $c = $self->{column_prev};
4831 if ($charref_map->{$code}) {
4832
4833 $self->{parse_error}->(level => $self->{level}->{must}, type => 'invalid character reference',
4834 text => (sprintf 'U+%04X', $code),
4835 line => $l, column => $c);
4836 $code = $charref_map->{$code};
4837 } elsif ($code > 0x10FFFF) {
4838
4839 $self->{parse_error}->(level => $self->{level}->{must}, type => 'invalid character reference',
4840 text => (sprintf 'U-%08X', $code),
4841 line => $l, column => $c);
4842 $code = 0xFFFD;
4843 }
4844
4845 if ($self->{prev_state} == DATA_STATE) {
4846
4847 $self->{state} = $self->{prev_state};
4848 $self->{s_kwd} = '';
4849 ## Reconsume.
4850 return ({type => CHARACTER_TOKEN, data => chr $code,
4851 has_reference => 1,
4852 line => $l, column => $c,
4853 });
4854 redo A;
4855 } else {
4856
4857 $self->{ca}->{value} .= chr $code;
4858 $self->{ca}->{has_reference} = 1;
4859 $self->{state} = $self->{prev_state};
4860 $self->{s_kwd} = '';
4861 ## Reconsume.
4862 redo A;
4863 }
4864 } elsif ($self->{state} == HEXREF_X_STATE) {
4865 if ((0x0030 <= $self->{nc} and $self->{nc} <= 0x0039) or
4866 (0x0041 <= $self->{nc} and $self->{nc} <= 0x0046) or
4867 (0x0061 <= $self->{nc} and $self->{nc} <= 0x0066)) {
4868 # 0..9, A..F, a..f
4869
4870 $self->{state} = HEXREF_HEX_STATE;
4871 $self->{kwd} = 0;
4872 ## Reconsume.
4873 redo A;
4874 } else {
4875 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare hcro',
4876 line => $self->{line_prev},
4877 column => $self->{column_prev} - 2);
4878
4879 ## NOTE: According to the spec algorithm, nothing is returned,
4880 ## and then "&#" followed by "X" or "x" is appended to the parent
4881 ## element or the attribute value in the later processing.
4882
4883 if ($self->{prev_state} == DATA_STATE) {
4884
4885 $self->{state} = $self->{prev_state};
4886 $self->{s_kwd} = '';
4887 ## Reconsume.
4888 return ({type => CHARACTER_TOKEN,
4889 data => '&' . $self->{kwd},
4890 line => $self->{line_prev},
4891 column => $self->{column_prev} - length $self->{kwd},
4892 });
4893 redo A;
4894 } else {
4895
4896 $self->{ca}->{value} .= '&' . $self->{kwd};
4897 $self->{state} = $self->{prev_state};
4898 $self->{s_kwd} = '';
4899 ## Reconsume.
4900 redo A;
4901 }
4902 }
4903 } elsif ($self->{state} == HEXREF_HEX_STATE) {
4904 if (0x0030 <= $self->{nc} and $self->{nc} <= 0x0039) {
4905 # 0..9
4906
4907 $self->{kwd} *= 0x10;
4908 $self->{kwd} += $self->{nc} - 0x0030;
4909 ## Stay in the state.
4910
4911 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4912 $self->{line_prev} = $self->{line};
4913 $self->{column_prev} = $self->{column};
4914 $self->{column}++;
4915 $self->{nc}
4916 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4917 } else {
4918 $self->{set_nc}->($self);
4919 }
4920
4921 redo A;
4922 } elsif (0x0061 <= $self->{nc} and
4923 $self->{nc} <= 0x0066) { # a..f
4924
4925 $self->{kwd} *= 0x10;
4926 $self->{kwd} += $self->{nc} - 0x0060 + 9;
4927 ## Stay in the state.
4928
4929 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4930 $self->{line_prev} = $self->{line};
4931 $self->{column_prev} = $self->{column};
4932 $self->{column}++;
4933 $self->{nc}
4934 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4935 } else {
4936 $self->{set_nc}->($self);
4937 }
4938
4939 redo A;
4940 } elsif (0x0041 <= $self->{nc} and
4941 $self->{nc} <= 0x0046) { # A..F
4942
4943 $self->{kwd} *= 0x10;
4944 $self->{kwd} += $self->{nc} - 0x0040 + 9;
4945 ## Stay in the state.
4946
4947 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4948 $self->{line_prev} = $self->{line};
4949 $self->{column_prev} = $self->{column};
4950 $self->{column}++;
4951 $self->{nc}
4952 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4953 } else {
4954 $self->{set_nc}->($self);
4955 }
4956
4957 redo A;
4958 } elsif ($self->{nc} == 0x003B) { # ;
4959
4960
4961 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4962 $self->{line_prev} = $self->{line};
4963 $self->{column_prev} = $self->{column};
4964 $self->{column}++;
4965 $self->{nc}
4966 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4967 } else {
4968 $self->{set_nc}->($self);
4969 }
4970
4971 #
4972 } else {
4973
4974 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no refc',
4975 line => $self->{line},
4976 column => $self->{column});
4977 ## Reconsume.
4978 #
4979 }
4980
4981 my $code = $self->{kwd};
4982 my $l = $self->{line_prev};
4983 my $c = $self->{column_prev};
4984 if ($charref_map->{$code}) {
4985
4986 $self->{parse_error}->(level => $self->{level}->{must}, type => 'invalid character reference',
4987 text => (sprintf 'U+%04X', $code),
4988 line => $l, column => $c);
4989 $code = $charref_map->{$code};
4990 } elsif ($code > 0x10FFFF) {
4991
4992 $self->{parse_error}->(level => $self->{level}->{must}, type => 'invalid character reference',
4993 text => (sprintf 'U-%08X', $code),
4994 line => $l, column => $c);
4995 $code = 0xFFFD;
4996 }
4997
4998 if ($self->{prev_state} == DATA_STATE) {
4999
5000 $self->{state} = $self->{prev_state};
5001 $self->{s_kwd} = '';
5002 ## Reconsume.
5003 return ({type => CHARACTER_TOKEN, data => chr $code,
5004 has_reference => 1,
5005 line => $l, column => $c,
5006 });
5007 redo A;
5008 } else {
5009
5010 $self->{ca}->{value} .= chr $code;
5011 $self->{ca}->{has_reference} = 1;
5012 $self->{state} = $self->{prev_state};
5013 $self->{s_kwd} = '';
5014 ## Reconsume.
5015 redo A;
5016 }
5017 } elsif ($self->{state} == ENTITY_NAME_STATE) {
5018 if ((0x0041 <= $self->{nc} and # A
5019 $self->{nc} <= 0x005A) or # Z
5020 (0x0061 <= $self->{nc} and # a
5021 $self->{nc} <= 0x007A) or # z
5022 (0x0030 <= $self->{nc} and # 0
5023 $self->{nc} <= 0x0039) or # 9
5024 $self->{nc} == 0x003B) { # ;
5025 our $EntityChar;
5026 $self->{kwd} .= chr $self->{nc};
5027 if (defined $EntityChar->{$self->{kwd}} or
5028 $self->{ge}->{$self->{kwd}}) {
5029 if ($self->{nc} == 0x003B) { # ;
5030 if (defined $self->{ge}->{$self->{kwd}}) {
5031 if ($self->{ge}->{$self->{kwd}}->{only_text}) {
5032
5033 $self->{entity__value} = $self->{ge}->{$self->{kwd}}->{value};
5034 } else {
5035 if (defined $self->{ge}->{$self->{kwd}}->{notation}) {
5036
5037 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unparsed entity', ## TODO: type
5038 value => $self->{kwd});
5039 } else {
5040
5041 }
5042 $self->{entity__value} = '&' . $self->{kwd}; ## TODO: expand
5043 }
5044 } else {
5045 if ($self->{is_xml}) {
5046
5047 $self->{parse_error}->(level => $self->{level}->{must}, type => 'entity not declared', ## TODO: type
5048 value => $self->{kwd},
5049 level => {
5050 'amp;' => $self->{level}->{warn},
5051 'quot;' => $self->{level}->{warn},
5052 'lt;' => $self->{level}->{warn},
5053 'gt;' => $self->{level}->{warn},
5054 'apos;' => $self->{level}->{warn},
5055 }->{$self->{kwd}} ||
5056 $self->{level}->{must});
5057 } else {
5058
5059 }
5060 $self->{entity__value} = $EntityChar->{$self->{kwd}};
5061 }
5062 $self->{entity__match} = 1;
5063
5064 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5065 $self->{line_prev} = $self->{line};
5066 $self->{column_prev} = $self->{column};
5067 $self->{column}++;
5068 $self->{nc}
5069 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5070 } else {
5071 $self->{set_nc}->($self);
5072 }
5073
5074 #
5075 } else {
5076
5077 $self->{entity__value} = $EntityChar->{$self->{kwd}};
5078 $self->{entity__match} = -1;
5079 ## Stay in the state.
5080
5081 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5082 $self->{line_prev} = $self->{line};
5083 $self->{column_prev} = $self->{column};
5084 $self->{column}++;
5085 $self->{nc}
5086 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5087 } else {
5088 $self->{set_nc}->($self);
5089 }
5090
5091 redo A;
5092 }
5093 } else {
5094
5095 $self->{entity__value} .= chr $self->{nc};
5096 $self->{entity__match} *= 2;
5097 ## Stay in the state.
5098
5099 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5100 $self->{line_prev} = $self->{line};
5101 $self->{column_prev} = $self->{column};
5102 $self->{column}++;
5103 $self->{nc}
5104 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5105 } else {
5106 $self->{set_nc}->($self);
5107 }
5108
5109 redo A;
5110 }
5111 }
5112
5113 my $data;
5114 my $has_ref;
5115 if ($self->{entity__match} > 0) {
5116
5117 $data = $self->{entity__value};
5118 $has_ref = 1;
5119 #
5120 } elsif ($self->{entity__match} < 0) {
5121 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no refc');
5122 if ($self->{prev_state} != DATA_STATE and # in attribute
5123 $self->{entity__match} < -1) {
5124
5125 $data = '&' . $self->{kwd};
5126 #
5127 } else {
5128
5129 $data = $self->{entity__value};
5130 $has_ref = 1;
5131 #
5132 }
5133 } else {
5134
5135 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare ero',
5136 line => $self->{line_prev},
5137 column => $self->{column_prev} - length $self->{kwd});
5138 $data = '&' . $self->{kwd};
5139 #
5140 }
5141
5142 ## NOTE: In these cases, when a character reference is found,
5143 ## it is consumed and a character token is returned, or, otherwise,
5144 ## nothing is consumed and returned, according to the spec algorithm.
5145 ## In this implementation, anything that has been examined by the
5146 ## tokenizer is appended to the parent element or the attribute value
5147 ## as string, either literal string when no character reference or
5148 ## entity-replaced string otherwise, in this stage, since any characters
5149 ## that would not be consumed are appended in the data state or in an
5150 ## appropriate attribute value state anyway.
5151
5152 if ($self->{prev_state} == DATA_STATE) {
5153
5154 $self->{state} = $self->{prev_state};
5155 $self->{s_kwd} = '';
5156 ## Reconsume.
5157 return ({type => CHARACTER_TOKEN,
5158 data => $data,
5159 has_reference => $has_ref,
5160 line => $self->{line_prev},
5161 column => $self->{column_prev} + 1 - length $self->{kwd},
5162 });
5163 redo A;
5164 } else {
5165
5166 $self->{ca}->{value} .= $data;
5167 $self->{ca}->{has_reference} = 1 if $has_ref;
5168 $self->{state} = $self->{prev_state};
5169 $self->{s_kwd} = '';
5170 ## Reconsume.
5171 redo A;
5172 }
5173
5174 ## XML-only states
5175
5176 } elsif ($self->{state} == PI_STATE) {
5177 ## XML5: "Pi state" and "DOCTYPE pi state".
5178
5179 if ($is_space->{$self->{nc}} or
5180 $self->{nc} == 0x003F or # ?
5181 $self->{nc} == -1) {
5182 ## XML5: U+003F: "pi state": Same as "Anything else"; "DOCTYPE
5183 ## pi state": Switch to the "DOCTYPE pi after state". EOF:
5184 ## "DOCTYPE pi state": Parse error, switch to the "data
5185 ## state".
5186 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare pio', ## TODO: type
5187 line => $self->{line_prev},
5188 column => $self->{column_prev}
5189 - 1 * ($self->{nc} != -1));
5190 $self->{state} = BOGUS_COMMENT_STATE;
5191 ## Reconsume.
5192 $self->{ct} = {type => COMMENT_TOKEN,
5193 data => '?',
5194 line => $self->{line_prev},
5195 column => $self->{column_prev}
5196 - 1 * ($self->{nc} != -1),
5197 };
5198 redo A;
5199 } else {
5200 ## XML5: "DOCTYPE pi state": Stay in the state.
5201 $self->{ct} = {type => PI_TOKEN,
5202 target => chr $self->{nc},
5203 data => '',
5204 line => $self->{line_prev},
5205 column => $self->{column_prev} - 1,
5206 };
5207 $self->{state} = PI_TARGET_STATE;
5208
5209 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5210 $self->{line_prev} = $self->{line};
5211 $self->{column_prev} = $self->{column};
5212 $self->{column}++;
5213 $self->{nc}
5214 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5215 } else {
5216 $self->{set_nc}->($self);
5217 }
5218
5219 redo A;
5220 }
5221 } elsif ($self->{state} == PI_TARGET_STATE) {
5222 if ($is_space->{$self->{nc}}) {
5223 $self->{state} = PI_TARGET_AFTER_STATE;
5224
5225 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5226 $self->{line_prev} = $self->{line};
5227 $self->{column_prev} = $self->{column};
5228 $self->{column}++;
5229 $self->{nc}
5230 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5231 } else {
5232 $self->{set_nc}->($self);
5233 }
5234
5235 redo A;
5236 } elsif ($self->{nc} == -1) {
5237 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no pic'); ## TODO: type
5238 if ($self->{in_subset}) {
5239 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5240 } else {
5241 $self->{state} = DATA_STATE;
5242 $self->{s_kwd} = '';
5243 }
5244 ## Reconsume.
5245 return ($self->{ct}); # pi
5246 redo A;
5247 } elsif ($self->{nc} == 0x003F) { # ?
5248 $self->{state} = PI_AFTER_STATE;
5249
5250 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5251 $self->{line_prev} = $self->{line};
5252 $self->{column_prev} = $self->{column};
5253 $self->{column}++;
5254 $self->{nc}
5255 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5256 } else {
5257 $self->{set_nc}->($self);
5258 }
5259
5260 redo A;
5261 } else {
5262 ## XML5: typo ("tag name" -> "target")
5263 $self->{ct}->{target} .= chr $self->{nc}; # pi
5264
5265 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5266 $self->{line_prev} = $self->{line};
5267 $self->{column_prev} = $self->{column};
5268 $self->{column}++;
5269 $self->{nc}
5270 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5271 } else {
5272 $self->{set_nc}->($self);
5273 }
5274
5275 redo A;
5276 }
5277 } elsif ($self->{state} == PI_TARGET_AFTER_STATE) {
5278 if ($is_space->{$self->{nc}}) {
5279 ## Stay in the state.
5280
5281 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5282 $self->{line_prev} = $self->{line};
5283 $self->{column_prev} = $self->{column};
5284 $self->{column}++;
5285 $self->{nc}
5286 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5287 } else {
5288 $self->{set_nc}->($self);
5289 }
5290
5291 redo A;
5292 } else {
5293 $self->{state} = PI_DATA_STATE;
5294 ## Reprocess.
5295 redo A;
5296 }
5297 } elsif ($self->{state} == PI_DATA_STATE) {
5298 if ($self->{nc} == 0x003F) { # ?
5299 $self->{state} = PI_DATA_AFTER_STATE;
5300
5301 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5302 $self->{line_prev} = $self->{line};
5303 $self->{column_prev} = $self->{column};
5304 $self->{column}++;
5305 $self->{nc}
5306 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5307 } else {
5308 $self->{set_nc}->($self);
5309 }
5310
5311 redo A;
5312 } elsif ($self->{nc} == -1) {
5313 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no pic'); ## TODO: type
5314 if ($self->{in_subset}) {
5315 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state"
5316 } else {
5317 $self->{state} = DATA_STATE;
5318 $self->{s_kwd} = '';
5319 }
5320 ## Reprocess.
5321 return ($self->{ct}); # pi
5322 redo A;
5323 } else {
5324 $self->{ct}->{data} .= chr $self->{nc}; # pi
5325 $self->{read_until}->($self->{ct}->{data}, q[?],
5326 length $self->{ct}->{data});
5327 ## Stay in the state.
5328
5329 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5330 $self->{line_prev} = $self->{line};
5331 $self->{column_prev} = $self->{column};
5332 $self->{column}++;
5333 $self->{nc}
5334 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5335 } else {
5336 $self->{set_nc}->($self);
5337 }
5338
5339 ## Reprocess.
5340 redo A;
5341 }
5342 } elsif ($self->{state} == PI_AFTER_STATE) {
5343 ## XML5: Part of "Pi after state".
5344
5345 if ($self->{nc} == 0x003E) { # >
5346 if ($self->{in_subset}) {
5347 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5348 } else {
5349 $self->{state} = DATA_STATE;
5350 $self->{s_kwd} = '';
5351 }
5352
5353 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5354 $self->{line_prev} = $self->{line};
5355 $self->{column_prev} = $self->{column};
5356 $self->{column}++;
5357 $self->{nc}
5358 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5359 } else {
5360 $self->{set_nc}->($self);
5361 }
5362
5363 return ($self->{ct}); # pi
5364 redo A;
5365 } elsif ($self->{nc} == 0x003F) { # ?
5366 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no s after target', ## TODO: type
5367 line => $self->{line_prev},
5368 column => $self->{column_prev}); ## XML5: no error
5369 $self->{ct}->{data} .= '?';
5370 $self->{state} = PI_DATA_AFTER_STATE;
5371
5372 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5373 $self->{line_prev} = $self->{line};
5374 $self->{column_prev} = $self->{column};
5375 $self->{column}++;
5376 $self->{nc}
5377 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5378 } else {
5379 $self->{set_nc}->($self);
5380 }
5381
5382 redo A;
5383 } else {
5384 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no s after target', ## TODO: type
5385 line => $self->{line_prev},
5386 column => $self->{column_prev}
5387 + 1 * ($self->{nc} == -1)); ## XML5: no error
5388 $self->{ct}->{data} .= '?'; ## XML5: not appended
5389 $self->{state} = PI_DATA_STATE;
5390 ## Reprocess.
5391 redo A;
5392 }
5393 } elsif ($self->{state} == PI_DATA_AFTER_STATE) {
5394 ## XML5: Same as "pi after state" and "DOCTYPE pi after state".
5395
5396 if ($self->{nc} == 0x003E) { # >
5397 if ($self->{in_subset}) {
5398 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5399 } else {
5400 $self->{state} = DATA_STATE;
5401 $self->{s_kwd} = '';
5402 }
5403
5404 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5405 $self->{line_prev} = $self->{line};
5406 $self->{column_prev} = $self->{column};
5407 $self->{column}++;
5408 $self->{nc}
5409 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5410 } else {
5411 $self->{set_nc}->($self);
5412 }
5413
5414 return ($self->{ct}); # pi
5415 redo A;
5416 } elsif ($self->{nc} == 0x003F) { # ?
5417 $self->{ct}->{data} .= '?';
5418 ## Stay in the state.
5419
5420 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5421 $self->{line_prev} = $self->{line};
5422 $self->{column_prev} = $self->{column};
5423 $self->{column}++;
5424 $self->{nc}
5425 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5426 } else {
5427 $self->{set_nc}->($self);
5428 }
5429
5430 redo A;
5431 } else {
5432 $self->{ct}->{data} .= '?'; ## XML5: not appended
5433 $self->{state} = PI_DATA_STATE;
5434 ## Reprocess.
5435 redo A;
5436 }
5437
5438 } elsif ($self->{state} == DOCTYPE_INTERNAL_SUBSET_STATE) {
5439 if ($self->{nc} == 0x003C) { # <
5440 $self->{state} = DOCTYPE_TAG_STATE;
5441
5442 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5443 $self->{line_prev} = $self->{line};
5444 $self->{column_prev} = $self->{column};
5445 $self->{column}++;
5446 $self->{nc}
5447 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5448 } else {
5449 $self->{set_nc}->($self);
5450 }
5451
5452 redo A;
5453 } elsif ($self->{nc} == 0x0025) { # %
5454 ## XML5: Not defined yet.
5455
5456 ## TODO:
5457
5458 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5459 $self->{line_prev} = $self->{line};
5460 $self->{column_prev} = $self->{column};
5461 $self->{column}++;
5462 $self->{nc}
5463 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5464 } else {
5465 $self->{set_nc}->($self);
5466 }
5467
5468 redo A;
5469 } elsif ($self->{nc} == 0x005D) { # ]
5470 delete $self->{in_subset};
5471 $self->{state} = DOCTYPE_INTERNAL_SUBSET_AFTER_STATE;
5472
5473 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5474 $self->{line_prev} = $self->{line};
5475 $self->{column_prev} = $self->{column};
5476 $self->{column}++;
5477 $self->{nc}
5478 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5479 } else {
5480 $self->{set_nc}->($self);
5481 }
5482
5483 redo A;
5484 } elsif ($is_space->{$self->{nc}}) {
5485 ## Stay in the state.
5486
5487 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5488 $self->{line_prev} = $self->{line};
5489 $self->{column_prev} = $self->{column};
5490 $self->{column}++;
5491 $self->{nc}
5492 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5493 } else {
5494 $self->{set_nc}->($self);
5495 }
5496
5497 redo A;
5498 } elsif ($self->{nc} == -1) {
5499 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed internal subset'); ## TODO: type
5500 delete $self->{in_subset};
5501 $self->{state} = DATA_STATE;
5502 $self->{s_kwd} = '';
5503 ## Reconsume.
5504 return ({type => END_OF_DOCTYPE_TOKEN});
5505 redo A;
5506 } else {
5507 unless ($self->{internal_subset_tainted}) {
5508 ## XML5: No parse error.
5509 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string in internal subset');
5510 $self->{internal_subset_tainted} = 1;
5511 }
5512 ## Stay in the state.
5513
5514 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5515 $self->{line_prev} = $self->{line};
5516 $self->{column_prev} = $self->{column};
5517 $self->{column}++;
5518 $self->{nc}
5519 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5520 } else {
5521 $self->{set_nc}->($self);
5522 }
5523
5524 redo A;
5525 }
5526 } elsif ($self->{state} == DOCTYPE_INTERNAL_SUBSET_AFTER_STATE) {
5527 if ($self->{nc} == 0x003E) { # >
5528 $self->{state} = DATA_STATE;
5529 $self->{s_kwd} = '';
5530
5531 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5532 $self->{line_prev} = $self->{line};
5533 $self->{column_prev} = $self->{column};
5534 $self->{column}++;
5535 $self->{nc}
5536 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5537 } else {
5538 $self->{set_nc}->($self);
5539 }
5540
5541 return ({type => END_OF_DOCTYPE_TOKEN});
5542 redo A;
5543 } elsif ($self->{nc} == -1) {
5544 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
5545 $self->{state} = DATA_STATE;
5546 $self->{s_kwd} = '';
5547 ## Reconsume.
5548 return ({type => END_OF_DOCTYPE_TOKEN});
5549 redo A;
5550 } else {
5551 ## XML5: No parse error and stay in the state.
5552 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after internal subset'); ## TODO: type
5553
5554 $self->{state} = BOGUS_DOCTYPE_INTERNAL_SUBSET_AFTER_STATE;
5555
5556 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5557 $self->{line_prev} = $self->{line};
5558 $self->{column_prev} = $self->{column};
5559 $self->{column}++;
5560 $self->{nc}
5561 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5562 } else {
5563 $self->{set_nc}->($self);
5564 }
5565
5566 redo A;
5567 }
5568 } elsif ($self->{state} == BOGUS_DOCTYPE_INTERNAL_SUBSET_AFTER_STATE) {
5569 if ($self->{nc} == 0x003E) { # >
5570 $self->{state} = DATA_STATE;
5571 $self->{s_kwd} = '';
5572
5573 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5574 $self->{line_prev} = $self->{line};
5575 $self->{column_prev} = $self->{column};
5576 $self->{column}++;
5577 $self->{nc}
5578 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5579 } else {
5580 $self->{set_nc}->($self);
5581 }
5582
5583 return ({type => END_OF_DOCTYPE_TOKEN});
5584 redo A;
5585 } elsif ($self->{nc} == -1) {
5586 $self->{state} = DATA_STATE;
5587 $self->{s_kwd} = '';
5588 ## Reconsume.
5589 return ({type => END_OF_DOCTYPE_TOKEN});
5590 redo A;
5591 } else {
5592 ## Stay in the state.
5593
5594 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5595 $self->{line_prev} = $self->{line};
5596 $self->{column_prev} = $self->{column};
5597 $self->{column}++;
5598 $self->{nc}
5599 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5600 } else {
5601 $self->{set_nc}->($self);
5602 }
5603
5604 redo A;
5605 }
5606 } elsif ($self->{state} == DOCTYPE_TAG_STATE) {
5607 if ($self->{nc} == 0x0021) { # !
5608 $self->{state} = DOCTYPE_MARKUP_DECLARATION_OPEN_STATE;
5609
5610 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5611 $self->{line_prev} = $self->{line};
5612 $self->{column_prev} = $self->{column};
5613 $self->{column}++;
5614 $self->{nc}
5615 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5616 } else {
5617 $self->{set_nc}->($self);
5618 }
5619
5620 redo A;
5621 } elsif ($self->{nc} == 0x003F) { # ?
5622 $self->{state} = PI_STATE;
5623
5624 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5625 $self->{line_prev} = $self->{line};
5626 $self->{column_prev} = $self->{column};
5627 $self->{column}++;
5628 $self->{nc}
5629 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5630 } else {
5631 $self->{set_nc}->($self);
5632 }
5633
5634 redo A;
5635 } elsif ($self->{nc} == -1) {
5636 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare stago');
5637 $self->{state} = DATA_STATE;
5638 $self->{s_kwd} = '';
5639 ## Reconsume.
5640 redo A;
5641 } else {
5642 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare stago', ## XML5: Not a parse error.
5643 line => $self->{line_prev},
5644 column => $self->{column_prev});
5645 $self->{state} = BOGUS_COMMENT_STATE;
5646 $self->{ct} = {type => COMMENT_TOKEN,
5647 data => '',
5648 }; ## NOTE: Will be discarded.
5649
5650 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5651 $self->{line_prev} = $self->{line};
5652 $self->{column_prev} = $self->{column};
5653 $self->{column}++;
5654 $self->{nc}
5655 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5656 } else {
5657 $self->{set_nc}->($self);
5658 }
5659
5660 redo A;
5661 }
5662 } elsif ($self->{state} == DOCTYPE_MARKUP_DECLARATION_OPEN_STATE) {
5663 ## XML5: "DOCTYPE markup declaration state".
5664
5665 if ($self->{nc} == 0x002D) { # -
5666 $self->{state} = MD_HYPHEN_STATE;
5667
5668 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5669 $self->{line_prev} = $self->{line};
5670 $self->{column_prev} = $self->{column};
5671 $self->{column}++;
5672 $self->{nc}
5673 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5674 } else {
5675 $self->{set_nc}->($self);
5676 }
5677
5678 redo A;
5679 } elsif ($self->{nc} == 0x0045 or # E
5680 $self->{nc} == 0x0065) { # e
5681 $self->{state} = MD_E_STATE;
5682 $self->{kwd} = chr $self->{nc};
5683
5684 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5685 $self->{line_prev} = $self->{line};
5686 $self->{column_prev} = $self->{column};
5687 $self->{column}++;
5688 $self->{nc}
5689 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5690 } else {
5691 $self->{set_nc}->($self);
5692 }
5693
5694 redo A;
5695 } elsif ($self->{nc} == 0x0041 or # A
5696 $self->{nc} == 0x0061) { # a
5697 $self->{state} = MD_ATTLIST_STATE;
5698 $self->{kwd} = chr $self->{nc};
5699
5700 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5701 $self->{line_prev} = $self->{line};
5702 $self->{column_prev} = $self->{column};
5703 $self->{column}++;
5704 $self->{nc}
5705 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5706 } else {
5707 $self->{set_nc}->($self);
5708 }
5709
5710 redo A;
5711 } elsif ($self->{nc} == 0x004E or # N
5712 $self->{nc} == 0x006E) { # n
5713 $self->{state} = MD_NOTATION_STATE;
5714 $self->{kwd} = chr $self->{nc};
5715
5716 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5717 $self->{line_prev} = $self->{line};
5718 $self->{column_prev} = $self->{column};
5719 $self->{column}++;
5720 $self->{nc}
5721 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5722 } else {
5723 $self->{set_nc}->($self);
5724 }
5725
5726 redo A;
5727 } else {
5728 #
5729 }
5730
5731 ## XML5: No parse error.
5732 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
5733 line => $self->{line_prev},
5734 column => $self->{column_prev} - 1);
5735 ## Reconsume.
5736 $self->{state} = BOGUS_COMMENT_STATE;
5737 $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded.
5738 redo A;
5739 } elsif ($self->{state} == MD_E_STATE) {
5740 if ($self->{nc} == 0x004E or # N
5741 $self->{nc} == 0x006E) { # n
5742 $self->{state} = MD_ENTITY_STATE;
5743 $self->{kwd} .= chr $self->{nc};
5744
5745 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5746 $self->{line_prev} = $self->{line};
5747 $self->{column_prev} = $self->{column};
5748 $self->{column}++;
5749 $self->{nc}
5750 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5751 } else {
5752 $self->{set_nc}->($self);
5753 }
5754
5755 redo A;
5756 } elsif ($self->{nc} == 0x004C or # L
5757 $self->{nc} == 0x006C) { # l
5758 ## XML5: <!ELEMENT> not supported.
5759 $self->{state} = MD_ELEMENT_STATE;
5760 $self->{kwd} .= chr $self->{nc};
5761
5762 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5763 $self->{line_prev} = $self->{line};
5764 $self->{column_prev} = $self->{column};
5765 $self->{column}++;
5766 $self->{nc}
5767 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5768 } else {
5769 $self->{set_nc}->($self);
5770 }
5771
5772 redo A;
5773 } else {
5774 ## XML5: No parse error.
5775 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
5776 line => $self->{line_prev},
5777 column => $self->{column_prev} - 2
5778 + 1 * ($self->{nc} == -1));
5779 ## Reconsume.
5780 $self->{state} = BOGUS_COMMENT_STATE;
5781 $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
5782 redo A;
5783 }
5784 } elsif ($self->{state} == MD_ENTITY_STATE) {
5785 if ($self->{nc} == [
5786 undef,
5787 undef,
5788 0x0054, # T
5789 0x0049, # I
5790 0x0054, # T
5791 ]->[length $self->{kwd}] or
5792 $self->{nc} == [
5793 undef,
5794 undef,
5795 0x0074, # t
5796 0x0069, # i
5797 0x0074, # t
5798 ]->[length $self->{kwd}]) {
5799 ## Stay in the state.
5800 $self->{kwd} .= chr $self->{nc};
5801
5802 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5803 $self->{line_prev} = $self->{line};
5804 $self->{column_prev} = $self->{column};
5805 $self->{column}++;
5806 $self->{nc}
5807 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5808 } else {
5809 $self->{set_nc}->($self);
5810 }
5811
5812 redo A;
5813 } elsif ((length $self->{kwd}) == 5 and
5814 ($self->{nc} == 0x0059 or # Y
5815 $self->{nc} == 0x0079)) { # y
5816 if ($self->{kwd} ne 'ENTIT' or $self->{nc} == 0x0079) {
5817 $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
5818 text => 'ENTITY',
5819 line => $self->{line_prev},
5820 column => $self->{column_prev} - 4);
5821 }
5822 $self->{ct} = {type => GENERAL_ENTITY_TOKEN, name => '',
5823 line => $self->{line_prev},
5824 column => $self->{column_prev} - 6};
5825 $self->{state} = DOCTYPE_MD_STATE;
5826
5827 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5828 $self->{line_prev} = $self->{line};
5829 $self->{column_prev} = $self->{column};
5830 $self->{column}++;
5831 $self->{nc}
5832 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5833 } else {
5834 $self->{set_nc}->($self);
5835 }
5836
5837 redo A;
5838 } else {
5839 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
5840 line => $self->{line_prev},
5841 column => $self->{column_prev} - 1
5842 - (length $self->{kwd})
5843 + 1 * ($self->{nc} == -1));
5844 $self->{state} = BOGUS_COMMENT_STATE;
5845 ## Reconsume.
5846 $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
5847 redo A;
5848 }
5849 } elsif ($self->{state} == MD_ELEMENT_STATE) {
5850 if ($self->{nc} == [
5851 undef,
5852 undef,
5853 0x0045, # E
5854 0x004D, # M
5855 0x0045, # E
5856 0x004E, # N
5857 ]->[length $self->{kwd}] or
5858 $self->{nc} == [
5859 undef,
5860 undef,
5861 0x0065, # e
5862 0x006D, # m
5863 0x0065, # e
5864 0x006E, # n
5865 ]->[length $self->{kwd}]) {
5866 ## Stay in the state.
5867 $self->{kwd} .= chr $self->{nc};
5868
5869 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5870 $self->{line_prev} = $self->{line};
5871 $self->{column_prev} = $self->{column};
5872 $self->{column}++;
5873 $self->{nc}
5874 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5875 } else {
5876 $self->{set_nc}->($self);
5877 }
5878
5879 redo A;
5880 } elsif ((length $self->{kwd}) == 6 and
5881 ($self->{nc} == 0x0054 or # T
5882 $self->{nc} == 0x0074)) { # t
5883 if ($self->{kwd} ne 'ELEMEN' or $self->{nc} == 0x0074) {
5884 $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
5885 text => 'ELEMENT',
5886 line => $self->{line_prev},
5887 column => $self->{column_prev} - 5);
5888 }
5889 $self->{ct} = {type => ELEMENT_TOKEN, name => '',
5890 line => $self->{line_prev},
5891 column => $self->{column_prev} - 6};
5892 $self->{state} = DOCTYPE_MD_STATE;
5893
5894 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5895 $self->{line_prev} = $self->{line};
5896 $self->{column_prev} = $self->{column};
5897 $self->{column}++;
5898 $self->{nc}
5899 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5900 } else {
5901 $self->{set_nc}->($self);
5902 }
5903
5904 redo A;
5905 } else {
5906 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
5907 line => $self->{line_prev},
5908 column => $self->{column_prev} - 1
5909 - (length $self->{kwd})
5910 + 1 * ($self->{nc} == -1));
5911 $self->{state} = BOGUS_COMMENT_STATE;
5912 ## Reconsume.
5913 $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
5914 redo A;
5915 }
5916 } elsif ($self->{state} == MD_ATTLIST_STATE) {
5917 if ($self->{nc} == [
5918 undef,
5919 0x0054, # T
5920 0x0054, # T
5921 0x004C, # L
5922 0x0049, # I
5923 0x0053, # S
5924 ]->[length $self->{kwd}] or
5925 $self->{nc} == [
5926 undef,
5927 0x0074, # t
5928 0x0074, # t
5929 0x006C, # l
5930 0x0069, # i
5931 0x0073, # s
5932 ]->[length $self->{kwd}]) {
5933 ## Stay in the state.
5934 $self->{kwd} .= chr $self->{nc};
5935
5936 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5937 $self->{line_prev} = $self->{line};
5938 $self->{column_prev} = $self->{column};
5939 $self->{column}++;
5940 $self->{nc}
5941 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5942 } else {
5943 $self->{set_nc}->($self);
5944 }
5945
5946 redo A;
5947 } elsif ((length $self->{kwd}) == 6 and
5948 ($self->{nc} == 0x0054 or # T
5949 $self->{nc} == 0x0074)) { # t
5950 if ($self->{kwd} ne 'ATTLIS' or $self->{nc} == 0x0074) {
5951 $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
5952 text => 'ATTLIST',
5953 line => $self->{line_prev},
5954 column => $self->{column_prev} - 5);
5955 }
5956 $self->{ct} = {type => ATTLIST_TOKEN, name => '',
5957 attrdefs => [],
5958 line => $self->{line_prev},
5959 column => $self->{column_prev} - 6};
5960 $self->{state} = DOCTYPE_MD_STATE;
5961
5962 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5963 $self->{line_prev} = $self->{line};
5964 $self->{column_prev} = $self->{column};
5965 $self->{column}++;
5966 $self->{nc}
5967 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5968 } else {
5969 $self->{set_nc}->($self);
5970 }
5971
5972 redo A;
5973 } else {
5974 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
5975 line => $self->{line_prev},
5976 column => $self->{column_prev} - 1
5977 - (length $self->{kwd})
5978 + 1 * ($self->{nc} == -1));
5979 $self->{state} = BOGUS_COMMENT_STATE;
5980 ## Reconsume.
5981 $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
5982 redo A;
5983 }
5984 } elsif ($self->{state} == MD_NOTATION_STATE) {
5985 if ($self->{nc} == [
5986 undef,
5987 0x004F, # O
5988 0x0054, # T
5989 0x0041, # A
5990 0x0054, # T
5991 0x0049, # I
5992 0x004F, # O
5993 ]->[length $self->{kwd}] or
5994 $self->{nc} == [
5995 undef,
5996 0x006F, # o
5997 0x0074, # t
5998 0x0061, # a
5999 0x0074, # t
6000 0x0069, # i
6001 0x006F, # o
6002 ]->[length $self->{kwd}]) {
6003 ## Stay in the state.
6004 $self->{kwd} .= chr $self->{nc};
6005
6006 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6007 $self->{line_prev} = $self->{line};
6008 $self->{column_prev} = $self->{column};
6009 $self->{column}++;
6010 $self->{nc}
6011 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6012 } else {
6013 $self->{set_nc}->($self);
6014 }
6015
6016 redo A;
6017 } elsif ((length $self->{kwd}) == 7 and
6018 ($self->{nc} == 0x004E or # N
6019 $self->{nc} == 0x006E)) { # n
6020 if ($self->{kwd} ne 'NOTATIO' or $self->{nc} == 0x006E) {
6021 $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
6022 text => 'NOTATION',
6023 line => $self->{line_prev},
6024 column => $self->{column_prev} - 6);
6025 }
6026 $self->{ct} = {type => NOTATION_TOKEN, name => '',
6027 line => $self->{line_prev},
6028 column => $self->{column_prev} - 6};
6029 $self->{state} = DOCTYPE_MD_STATE;
6030
6031 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6032 $self->{line_prev} = $self->{line};
6033 $self->{column_prev} = $self->{column};
6034 $self->{column}++;
6035 $self->{nc}
6036 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6037 } else {
6038 $self->{set_nc}->($self);
6039 }
6040
6041 redo A;
6042 } else {
6043 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
6044 line => $self->{line_prev},
6045 column => $self->{column_prev} - 1
6046 - (length $self->{kwd})
6047 + 1 * ($self->{nc} == -1));
6048 $self->{state} = BOGUS_COMMENT_STATE;
6049 ## Reconsume.
6050 $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
6051 redo A;
6052 }
6053 } elsif ($self->{state} == DOCTYPE_MD_STATE) {
6054 ## XML5: "DOCTYPE ENTITY state", "DOCTYPE ATTLIST state", and
6055 ## "DOCTYPE NOTATION state".
6056
6057 if ($is_space->{$self->{nc}}) {
6058 ## XML5: [NOTATION] Switch to the "DOCTYPE NOTATION identifier state".
6059 $self->{state} = BEFORE_MD_NAME_STATE;
6060
6061 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6062 $self->{line_prev} = $self->{line};
6063 $self->{column_prev} = $self->{column};
6064 $self->{column}++;
6065 $self->{nc}
6066 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6067 } else {
6068 $self->{set_nc}->($self);
6069 }
6070
6071 redo A;
6072 } elsif ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN and
6073 $self->{nc} == 0x0025) { # %
6074 ## XML5: Switch to the "DOCTYPE bogus comment state".
6075 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before md name'); ## TODO: type
6076 $self->{state} = DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE;
6077
6078 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6079 $self->{line_prev} = $self->{line};
6080 $self->{column_prev} = $self->{column};
6081 $self->{column}++;
6082 $self->{nc}
6083 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6084 } else {
6085 $self->{set_nc}->($self);
6086 }
6087
6088 redo A;
6089 } elsif ($self->{nc} == -1) {
6090 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6091 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6092 ## Reconsume.
6093 redo A;
6094 } elsif ($self->{nc} == 0x003E) { # >
6095 ## XML5: Switch to the "DOCTYPE bogus comment state".
6096 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no md name'); ## TODO: type
6097 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6098
6099 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6100 $self->{line_prev} = $self->{line};
6101 $self->{column_prev} = $self->{column};
6102 $self->{column}++;
6103 $self->{nc}
6104 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6105 } else {
6106 $self->{set_nc}->($self);
6107 }
6108
6109 redo A;
6110 } else {
6111 ## XML5: Switch to the "DOCTYPE bogus comment state".
6112 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before md name'); ## TODO: type
6113 $self->{state} = BEFORE_MD_NAME_STATE;
6114 redo A;
6115 }
6116 } elsif ($self->{state} == BEFORE_MD_NAME_STATE) {
6117 ## XML5: "DOCTYPE ENTITY parameter state", "DOCTYPE ENTITY type
6118 ## before state", "DOCTYPE ATTLIST name before state".
6119
6120 if ($is_space->{$self->{nc}}) {
6121 ## Stay in the state.
6122
6123 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6124 $self->{line_prev} = $self->{line};
6125 $self->{column_prev} = $self->{column};
6126 $self->{column}++;
6127 $self->{nc}
6128 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6129 } else {
6130 $self->{set_nc}->($self);
6131 }
6132
6133 redo A;
6134 } elsif ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN and
6135 $self->{nc} == 0x0025) { # %
6136 $self->{state} = DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE;
6137
6138 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6139 $self->{line_prev} = $self->{line};
6140 $self->{column_prev} = $self->{column};
6141 $self->{column}++;
6142 $self->{nc}
6143 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6144 } else {
6145 $self->{set_nc}->($self);
6146 }
6147
6148 redo A;
6149 } elsif ($self->{nc} == 0x003E) { # >
6150 ## XML5: Same as "Anything else".
6151 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no md name'); ## TODO: type
6152 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6153
6154 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6155 $self->{line_prev} = $self->{line};
6156 $self->{column_prev} = $self->{column};
6157 $self->{column}++;
6158 $self->{nc}
6159 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6160 } else {
6161 $self->{set_nc}->($self);
6162 }
6163
6164 redo A;
6165 } elsif ($self->{nc} == -1) {
6166 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6167 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6168 ## Reconsume.
6169 redo A;
6170 } else {
6171 ## XML5: [ATTLIST] Not defined yet.
6172 $self->{ct}->{name} .= chr $self->{nc};
6173 $self->{state} = MD_NAME_STATE;
6174
6175 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6176 $self->{line_prev} = $self->{line};
6177 $self->{column_prev} = $self->{column};
6178 $self->{column}++;
6179 $self->{nc}
6180 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6181 } else {
6182 $self->{set_nc}->($self);
6183 }
6184
6185 redo A;
6186 }
6187 } elsif ($self->{state} == DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE) {
6188 if ($is_space->{$self->{nc}}) {
6189 ## XML5: Switch to the "DOCTYPE ENTITY parameter state".
6190 $self->{ct}->{type} = PARAMETER_ENTITY_TOKEN;
6191 $self->{state} = BEFORE_MD_NAME_STATE;
6192
6193 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6194 $self->{line_prev} = $self->{line};
6195 $self->{column_prev} = $self->{column};
6196 $self->{column}++;
6197 $self->{nc}
6198 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6199 } else {
6200 $self->{set_nc}->($self);
6201 }
6202
6203 redo A;
6204 } elsif ($self->{nc} == 0x003E) { # >
6205 ## XML5: Same as "Anything else".
6206 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no md name'); ## TODO: type
6207 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6208
6209 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6210 $self->{line_prev} = $self->{line};
6211 $self->{column_prev} = $self->{column};
6212 $self->{column}++;
6213 $self->{nc}
6214 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6215 } else {
6216 $self->{set_nc}->($self);
6217 }
6218
6219 redo A;
6220 } elsif ($self->{nc} == -1) {
6221 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md');
6222 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6223 ## Reconsume.
6224 redo A;
6225 } else {
6226 ## XML5: No parse error.
6227 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space after ENTITY percent'); ## TODO: type
6228 $self->{state} = BOGUS_COMMENT_STATE;
6229 $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
6230 ## Reconsume.
6231 redo A;
6232 }
6233 } elsif ($self->{state} == MD_NAME_STATE) {
## MD_NAME_STATE: collect the name of the current markup declaration token
## ($self->{ct}).  Characters are appended to |name| until white space,
## ">" or nc == -1 (presumably the end-of-input marker) is seen.
6234 ## XML5: "DOCTYPE ENTITY name state" and "DOCTYPE ATTLIST name state".
6235
6236 if ($is_space->{$self->{nc}}) {
## The name is complete; the follow-up state depends on the token type.
6237 if ($self->{ct}->{type} == ATTLIST_TOKEN) {
6238 $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
6239 } elsif ($self->{ct}->{type} == ELEMENT_TOKEN) {
6240 $self->{state} = AFTER_ELEMENT_NAME_STATE;
6241 } else { # ENTITY/NOTATION
6242 $self->{state} = AFTER_DOCTYPE_NAME_STATE;
6243 }
6244
## Consume the next input character: take it from the character buffer
## (remembering the previous line/column and bumping the column) when
## the buffer is not exhausted, otherwise invoke the set_nc callback.
6245 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6246 $self->{line_prev} = $self->{line};
6247 $self->{column_prev} = $self->{column};
6248 $self->{column}++;
6249 $self->{nc}
6250 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6251 } else {
6252 $self->{set_nc}->($self);
6253 }
6254
6255 redo A;
6256 } elsif ($self->{nc} == 0x003E) { # >
## The declaration is closed right after its name.  Only an ATTLIST
## token passes without a parse error here.
6257 if ($self->{ct}->{type} == ATTLIST_TOKEN) {
6258 #
6259 } else {
6260 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no md def'); ## TODO: type
6261 }
6262 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6263
## Consume the next input character (same sequence as above).
6264 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6265 $self->{line_prev} = $self->{line};
6266 $self->{column_prev} = $self->{column};
6267 $self->{column}++;
6268 $self->{nc}
6269 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6270 } else {
6271 $self->{set_nc}->($self);
6272 }
6273
## Emit the token.  The |redo A| after |return| is never reached.
6274 return ($self->{ct}); # ELEMENT/ENTITY/ATTLIST/NOTATION
6275 redo A;
6276 } elsif ($self->{nc} == -1) {
6277 ## XML5: [ATTLIST] No parse error.
6278 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md');
6279 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6280 ## Reconsume.
## The token is still emitted; no further input is consumed here.
6281 return ($self->{ct}); # ELEMENT/ENTITY/ATTLIST/NOTATION
6282 redo A;
6283 } else {
6284 ## XML5: [ATTLIST] Not defined yet.
## Any other character becomes part of the name.
6285 $self->{ct}->{name} .= chr $self->{nc};
6286 ## Stay in the state.
6287
6288 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6289 $self->{line_prev} = $self->{line};
6290 $self->{column_prev} = $self->{column};
6291 $self->{column}++;
6292 $self->{nc}
6293 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6294 } else {
6295 $self->{set_nc}->($self);
6296 }
6297
6298 redo A;
6299 }
6300 } elsif ($self->{state} == DOCTYPE_ATTLIST_NAME_AFTER_STATE) {
## DOCTYPE_ATTLIST_NAME_AFTER_STATE: skip white space that follows the
## name of an ATTLIST declaration.  ">" emits the token, nc == -1
## (presumably end of input) emits it with a parse error, and any other
## character starts a new attribute definition in $self->{ca}.
6301 if ($is_space->{$self->{nc}}) {
6302 ## Stay in the state.
6303
## Consume the next input character: take it from the character buffer
## when one is left, otherwise invoke the set_nc callback.
6304 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6305 $self->{line_prev} = $self->{line};
6306 $self->{column_prev} = $self->{column};
6307 $self->{column}++;
6308 $self->{nc}
6309 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6310 } else {
6311 $self->{set_nc}->($self);
6312 }
6313
6314 redo A;
6315 } elsif ($self->{nc} == 0x003E) { # >
6316 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6317
6318 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6319 $self->{line_prev} = $self->{line};
6320 $self->{column_prev} = $self->{column};
6321 $self->{column}++;
6322 $self->{nc}
6323 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6324 } else {
6325 $self->{set_nc}->($self);
6326 }
6327
## Emit the token.  The |redo A| after |return| is never reached.
6328 return ($self->{ct}); # ATTLIST
6329 redo A;
6330 } elsif ($self->{nc} == -1) {
6331 ## XML5: No parse error.
6332 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6333 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
## No input is consumed in this branch before the token is emitted.
6334 return ($self->{ct});
6335 redo A;
6336 } else {
6337 ## XML5: Not defined yet.
## Start a new attribute definition: its name begins with the current
## character, its allowed-token list is empty, and the current
## line/column are recorded with it.
6338 $self->{ca} = {name => chr ($self->{nc}), # attrdef
6339 tokens => [],
6340 line => $self->{line}, column => $self->{column}};
6341 $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_NAME_STATE;
6342
6343 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6344 $self->{line_prev} = $self->{line};
6345 $self->{column_prev} = $self->{column};
6346 $self->{column}++;
6347 $self->{nc}
6348 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6349 } else {
6350 $self->{set_nc}->($self);
6351 }
6352
6353 redo A;
6354 }
6355 } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_NAME_STATE) {
## DOCTYPE_ATTLIST_ATTRIBUTE_NAME_STATE: append characters to the name of
## the current attribute definition ($self->{ca}) until white space,
## ">", "(" or nc == -1 (presumably end of input) is seen.
6356 if ($is_space->{$self->{nc}}) {
6357 $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_NAME_AFTER_STATE;
6358
## Consume the next input character: take it from the character buffer
## when one is left, otherwise invoke the set_nc callback.
6359 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6360 $self->{line_prev} = $self->{line};
6361 $self->{column_prev} = $self->{column};
6362 $self->{column}++;
6363 $self->{nc}
6364 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6365 } else {
6366 $self->{set_nc}->($self);
6367 }
6368
6369 redo A;
6370 } elsif ($self->{nc} == 0x003E) { # >
6371 ## XML5: Same as "anything else".
## The declaration ends before any attribute type was given.
6372 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr type'); ## TODO: type
6373 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6374
6375 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6376 $self->{line_prev} = $self->{line};
6377 $self->{column_prev} = $self->{column};
6378 $self->{column}++;
6379 $self->{nc}
6380 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6381 } else {
6382 $self->{set_nc}->($self);
6383 }
6384
## Emit the token.  The |redo A| after |return| is never reached.
6385 return ($self->{ct}); # ATTLIST
6386 redo A;
6387 } elsif ($self->{nc} == 0x0028) { # (
6388 ## XML5: Same as "anything else".
## "(" directly after the name: reported, then handled as the start of
## an allowed-token list.
6389 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before paren'); ## TODO: type
6390 $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
6391
6392 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6393 $self->{line_prev} = $self->{line};
6394 $self->{column_prev} = $self->{column};
6395 $self->{column}++;
6396 $self->{nc}
6397 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6398 } else {
6399 $self->{set_nc}->($self);
6400 }
6401
6402 redo A;
6403 } elsif ($self->{nc} == -1) {
6404 ## XML5: No parse error.
6405 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6406 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6407
## NOTE(review): this branch still consumes a character before emitting,
## whereas the nc == -1 branch of MD_NAME_STATE reconsumes -- confirm
## that this difference is intended.
6408 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6409 $self->{line_prev} = $self->{line};
6410 $self->{column_prev} = $self->{column};
6411 $self->{column}++;
6412 $self->{nc}
6413 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6414 } else {
6415 $self->{set_nc}->($self);
6416 }
6417
6418 return ($self->{ct}); # ATTLIST
6419 redo A;
6420 } else {
6421 ## XML5: Not defined yet.
## Any other character becomes part of the attribute name.
6422 $self->{ca}->{name} .= chr $self->{nc};
6423 ## Stay in the state.
6424
6425 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6426 $self->{line_prev} = $self->{line};
6427 $self->{column_prev} = $self->{column};
6428 $self->{column}++;
6429 $self->{nc}
6430 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6431 } else {
6432 $self->{set_nc}->($self);
6433 }
6434
6435 redo A;
6436 }
6437 } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_NAME_AFTER_STATE) {
## DOCTYPE_ATTLIST_ATTRIBUTE_NAME_AFTER_STATE: skip white space after an
## attribute name.  "(" opens an allowed-token list, ">" and nc == -1
## (presumably end of input) emit the token with a parse error, and any
## other character becomes the first character of the attribute type.
6438 if ($is_space->{$self->{nc}}) {
6439 ## Stay in the state.
6440
## Consume the next input character: take it from the character buffer
## when one is left, otherwise invoke the set_nc callback.
6441 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6442 $self->{line_prev} = $self->{line};
6443 $self->{column_prev} = $self->{column};
6444 $self->{column}++;
6445 $self->{nc}
6446 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6447 } else {
6448 $self->{set_nc}->($self);
6449 }
6450
6451 redo A;
6452 } elsif ($self->{nc} == 0x003E) { # >
6453 ## XML5: Same as "anything else".
6454 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr type'); ## TODO: type
6455 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6456
6457 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6458 $self->{line_prev} = $self->{line};
6459 $self->{column_prev} = $self->{column};
6460 $self->{column}++;
6461 $self->{nc}
6462 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6463 } else {
6464 $self->{set_nc}->($self);
6465 }
6466
## Emit the token.  The |redo A| after |return| is never reached.
6467 return ($self->{ct}); # ATTLIST
6468 redo A;
6469 } elsif ($self->{nc} == 0x0028) { # (
6470 ## XML5: Same as "anything else".
## Unlike in the attribute name state, no parse error is raised for "("
## here.
6471 $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
6472
6473 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6474 $self->{line_prev} = $self->{line};
6475 $self->{column_prev} = $self->{column};
6476 $self->{column}++;
6477 $self->{nc}
6478 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6479 } else {
6480 $self->{set_nc}->($self);
6481 }
6482
6483 redo A;
6484 } elsif ($self->{nc} == -1) {
6485 ## XML5: No parse error.
6486 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6487 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6488
## NOTE(review): a character is consumed here although nc is already -1;
## the nc == -1 branch of MD_NAME_STATE reconsumes instead -- confirm.
6489 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6490 $self->{line_prev} = $self->{line};
6491 $self->{column_prev} = $self->{column};
6492 $self->{column}++;
6493 $self->{nc}
6494 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6495 } else {
6496 $self->{set_nc}->($self);
6497 }
6498
6499 return ($self->{ct});
6500 redo A;
6501 } else {
6502 ## XML5: Not defined yet.
## The current character starts the attribute type string.
6503 $self->{ca}->{type} = chr $self->{nc};
6504 $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_STATE;
6505
6506 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6507 $self->{line_prev} = $self->{line};
6508 $self->{column_prev} = $self->{column};
6509 $self->{column}++;
6510 $self->{nc}
6511 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6512 } else {
6513 $self->{set_nc}->($self);
6514 }
6515
6516 redo A;
6517 }
6518 } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_STATE) {
## DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_STATE: append characters to the type of
## the current attribute definition ($self->{ca}->{type}).  White space
## ends the type.  "#", a quote, ">" or "(" directly after the type are
## each reported as a parse error and then handled.
6519 if ($is_space->{$self->{nc}}) {
6520 $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_AFTER_STATE;
6521
## Consume the next input character: take it from the character buffer
## when one is left, otherwise invoke the set_nc callback.
6522 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6523 $self->{line_prev} = $self->{line};
6524 $self->{column_prev} = $self->{column};
6525 $self->{column}++;
6526 $self->{nc}
6527 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6528 } else {
6529 $self->{set_nc}->($self);
6530 }
6531
6532 redo A;
6533 } elsif ($self->{nc} == 0x0023) { # #
6534 ## XML5: Same as "anything else".
6535 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
6536 $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE;
6537
6538 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6539 $self->{line_prev} = $self->{line};
6540 $self->{column_prev} = $self->{column};
6541 $self->{column}++;
6542 $self->{nc}
6543 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6544 } else {
6545 $self->{set_nc}->($self);
6546 }
6547
6548 redo A;
6549 } elsif ($self->{nc} == 0x0022) { # "
6550 ## XML5: Same as "anything else".
## A quoted default value starts here; |value| is reset to the empty
## string before the quoted-value state takes over.
6551 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
6552 $self->{ca}->{value} = '';
6553 $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
6554
6555 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6556 $self->{line_prev} = $self->{line};
6557 $self->{column_prev} = $self->{column};
6558 $self->{column}++;
6559 $self->{nc}
6560 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6561 } else {
6562 $self->{set_nc}->($self);
6563 }
6564
6565 redo A;
6566 } elsif ($self->{nc} == 0x0027) { # '
6567 ## XML5: Same as "anything else".
6568 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
6569 $self->{ca}->{value} = '';
6570 $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
6571
6572 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6573 $self->{line_prev} = $self->{line};
6574 $self->{column_prev} = $self->{column};
6575 $self->{column}++;
6576 $self->{nc}
6577 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6578 } else {
6579 $self->{set_nc}->($self);
6580 }
6581
6582 redo A;
6583 } elsif ($self->{nc} == 0x003E) { # >
6584 ## XML5: Same as "anything else".
## The declaration ends without an attribute default.
6585 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr default'); ## TODO: type
6586 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6587
6588 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6589 $self->{line_prev} = $self->{line};
6590 $self->{column_prev} = $self->{column};
6591 $self->{column}++;
6592 $self->{nc}
6593 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6594 } else {
6595 $self->{set_nc}->($self);
6596 }
6597
## Emit the token.  The |redo A| after |return| is never reached.
6598 return ($self->{ct}); # ATTLIST
6599 redo A;
6600 } elsif ($self->{nc} == 0x0028) { # (
6601 ## XML5: Same as "anything else".
6602 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before paren'); ## TODO: type
6603 $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
6604
6605 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6606 $self->{line_prev} = $self->{line};
6607 $self->{column_prev} = $self->{column};
6608 $self->{column}++;
6609 $self->{nc}
6610 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6611 } else {
6612 $self->{set_nc}->($self);
6613 }
6614
6615 redo A;
6616 } elsif ($self->{nc} == -1) {
6617 ## XML5: No parse error.
6618 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6619 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6620
## NOTE(review): a character is consumed here although nc is already -1;
## the nc == -1 branch of MD_NAME_STATE reconsumes instead -- confirm.
6621 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6622 $self->{line_prev} = $self->{line};
6623 $self->{column_prev} = $self->{column};
6624 $self->{column}++;
6625 $self->{nc}
6626 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6627 } else {
6628 $self->{set_nc}->($self);
6629 }
6630
6631 return ($self->{ct});
6632 redo A;
6633 } else {
6634 ## XML5: Not defined yet.
## Any other character becomes part of the attribute type.
6635 $self->{ca}->{type} .= chr $self->{nc};
6636 ## Stay in the state.
6637
6638 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6639 $self->{line_prev} = $self->{line};
6640 $self->{column_prev} = $self->{column};
6641 $self->{column}++;
6642 $self->{nc}
6643 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6644 } else {
6645 $self->{set_nc}->($self);
6646 }
6647
6648 redo A;
6649 }
6650 } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_AFTER_STATE) {
## DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_AFTER_STATE: skip white space after the
## attribute type and dispatch on what follows: "(" (allowed-token
## list), "#" (default declaration keyword), a quote (default value),
## ">" or nc == -1 (token emitted with a parse error), or anything else
## (treated as the start of an unquoted default value).
6651 if ($is_space->{$self->{nc}}) {
6652 ## Stay in the state.
6653
## Consume the next input character: take it from the character buffer
## when one is left, otherwise invoke the set_nc callback.
6654 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6655 $self->{line_prev} = $self->{line};
6656 $self->{column_prev} = $self->{column};
6657 $self->{column}++;
6658 $self->{nc}
6659 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6660 } else {
6661 $self->{set_nc}->($self);
6662 }
6663
6664 redo A;
6665 } elsif ($self->{nc} == 0x0028) { # (
6666 ## XML5: Same as "anything else".
6667 $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
6668
6669 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6670 $self->{line_prev} = $self->{line};
6671 $self->{column_prev} = $self->{column};
6672 $self->{column}++;
6673 $self->{nc}
6674 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6675 } else {
6676 $self->{set_nc}->($self);
6677 }
6678
6679 redo A;
6680 } elsif ($self->{nc} == 0x0023) { # #
6681 $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE;
6682
6683 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6684 $self->{line_prev} = $self->{line};
6685 $self->{column_prev} = $self->{column};
6686 $self->{column}++;
6687 $self->{nc}
6688 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6689 } else {
6690 $self->{set_nc}->($self);
6691 }
6692
6693 redo A;
6694 } elsif ($self->{nc} == 0x0022) { # "
6695 ## XML5: Same as "anything else".
## |value| is reset to the empty string before the quoted-value state
## takes over.
6696 $self->{ca}->{value} = '';
6697 $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
6698
6699 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6700 $self->{line_prev} = $self->{line};
6701 $self->{column_prev} = $self->{column};
6702 $self->{column}++;
6703 $self->{nc}
6704 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6705 } else {
6706 $self->{set_nc}->($self);
6707 }
6708
6709 redo A;
6710 } elsif ($self->{nc} == 0x0027) { # '
6711 ## XML5: Same as "anything else".
6712 $self->{ca}->{value} = '';
6713 $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
6714
6715 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6716 $self->{line_prev} = $self->{line};
6717 $self->{column_prev} = $self->{column};
6718 $self->{column}++;
6719 $self->{nc}
6720 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6721 } else {
6722 $self->{set_nc}->($self);
6723 }
6724
6725 redo A;
6726 } elsif ($self->{nc} == 0x003E) { # >
6727 ## XML5: Same as "anything else".
6728 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr default'); ## TODO: type
6729 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6730
6731 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6732 $self->{line_prev} = $self->{line};
6733 $self->{column_prev} = $self->{column};
6734 $self->{column}++;
6735 $self->{nc}
6736 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6737 } else {
6738 $self->{set_nc}->($self);
6739 }
6740
## Emit the token.  The |redo A| after |return| is never reached.
6741 return ($self->{ct}); # ATTLIST
6742 redo A;
6743 } elsif ($self->{nc} == -1) {
6744 ## XML5: No parse error.
6745 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6746 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6747
## NOTE(review): a character is consumed here although nc is already -1;
## the nc == -1 branch of MD_NAME_STATE reconsumes instead -- confirm.
6748 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6749 $self->{line_prev} = $self->{line};
6750 $self->{column_prev} = $self->{column};
6751 $self->{column}++;
6752 $self->{nc}
6753 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6754 } else {
6755 $self->{set_nc}->($self);
6756 }
6757
6758 return ($self->{ct});
6759 redo A;
6760 } else {
6761 ## XML5: Switch to the "DOCTYPE bogus comment state".
## Here the character is instead reported and left unconsumed, so that
## the unquoted attribute value state processes it.
6762 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unquoted attr value'); ## TODO: type
6763 $self->{ca}->{value} = '';
6764 $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
6765 ## Reconsume.
6766 redo A;
6767 }
6768 } elsif ($self->{state} == BEFORE_ALLOWED_TOKEN_STATE) {
## BEFORE_ALLOWED_TOKEN_STATE: positioned where an entry of the
## allowed-token list ($self->{ca}->{tokens}) may start.  White space is
## skipped; "|" and ")" seen here are reported as 'empty allowed token';
## any other ordinary character starts a new list entry.
6769 if ($is_space->{$self->{nc}}) {
6770 ## Stay in the state.
6771
## Consume the next input character: take it from the character buffer
## when one is left, otherwise invoke the set_nc callback.
6772 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6773 $self->{line_prev} = $self->{line};
6774 $self->{column_prev} = $self->{column};
6775 $self->{column}++;
6776 $self->{nc}
6777 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6778 } else {
6779 $self->{set_nc}->($self);
6780 }
6781
6782 redo A;
6783 } elsif ($self->{nc} == 0x007C) { # |
6784 $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty allowed token'); ## TODO: type
6785 ## Stay in the state.
6786
6787 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6788 $self->{line_prev} = $self->{line};
6789 $self->{column_prev} = $self->{column};
6790 $self->{column}++;
6791 $self->{nc}
6792 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6793 } else {
6794 $self->{set_nc}->($self);
6795 }
6796
6797 redo A;
6798 } elsif ($self->{nc} == 0x0029) { # )
6799 $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty allowed token'); ## TODO: type
6800 $self->{state} = AFTER_ALLOWED_TOKENS_STATE;
6801
6802 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6803 $self->{line_prev} = $self->{line};
6804 $self->{column_prev} = $self->{column};
6805 $self->{column}++;
6806 $self->{nc}
6807 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6808 } else {
6809 $self->{set_nc}->($self);
6810 }
6811
6812 redo A;
6813 } elsif ($self->{nc} == 0x003E) { # >
## The declaration ends while the token list is still open.
6814 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed allowed tokens'); ## TODO: type
6815 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6816
6817 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6818 $self->{line_prev} = $self->{line};
6819 $self->{column_prev} = $self->{column};
6820 $self->{column}++;
6821 $self->{nc}
6822 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6823 } else {
6824 $self->{set_nc}->($self);
6825 }
6826
## Emit the token.  The |redo A| after |return| is never reached.
6827 return ($self->{ct}); # ATTLIST
6828 redo A;
6829 } elsif ($self->{nc} == -1) {
6830 ## XML5: No parse error.
6831 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6832 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6833
## NOTE(review): a character is consumed here although nc is already -1;
## the nc == -1 branch of MD_NAME_STATE reconsumes instead -- confirm.
6834 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6835 $self->{line_prev} = $self->{line};
6836 $self->{column_prev} = $self->{column};
6837 $self->{column}++;
6838 $self->{nc}
6839 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6840 } else {
6841 $self->{set_nc}->($self);
6842 }
6843
6844 return ($self->{ct});
6845 redo A;
6846 } else {
## Start a new entry of the allowed-token list with this character.
6847 push @{$self->{ca}->{tokens}}, chr $self->{nc};
6848 $self->{state} = ALLOWED_TOKEN_STATE;
6849
6850 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6851 $self->{line_prev} = $self->{line};
6852 $self->{column_prev} = $self->{column};
6853 $self->{column}++;
6854 $self->{nc}
6855 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6856 } else {
6857 $self->{set_nc}->($self);
6858 }
6859
6860 redo A;
6861 }
6862 } elsif ($self->{state} == ALLOWED_TOKEN_STATE) {
## ALLOWED_TOKEN_STATE: inside an entry of the allowed-token list.
## Characters are appended to the last element of $self->{ca}->{tokens}
## until white space, "|", ")", ">" or nc == -1 (presumably end of
## input) is seen.
6863 if ($is_space->{$self->{nc}}) {
6864 $self->{state} = AFTER_ALLOWED_TOKEN_STATE;
6865
## Consume the next input character: take it from the character buffer
## when one is left, otherwise invoke the set_nc callback.
6866 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6867 $self->{line_prev} = $self->{line};
6868 $self->{column_prev} = $self->{column};
6869 $self->{column}++;
6870 $self->{nc}
6871 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6872 } else {
6873 $self->{set_nc}->($self);
6874 }
6875
6876 redo A;
6877 } elsif ($self->{nc} == 0x007C) { # |
## Separator: another list entry is expected next.
6878 $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
6879
6880 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6881 $self->{line_prev} = $self->{line};
6882 $self->{column_prev} = $self->{column};
6883 $self->{column}++;
6884 $self->{nc}
6885 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6886 } else {
6887 $self->{set_nc}->($self);
6888 }
6889
6890 redo A;
6891 } elsif ($self->{nc} == 0x0029) { # )
## End of the token list.
6892 $self->{state} = AFTER_ALLOWED_TOKENS_STATE;
6893
6894 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6895 $self->{line_prev} = $self->{line};
6896 $self->{column_prev} = $self->{column};
6897 $self->{column}++;
6898 $self->{nc}
6899 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6900 } else {
6901 $self->{set_nc}->($self);
6902 }
6903
6904 redo A;
6905 } elsif ($self->{nc} == 0x003E) { # >
6906 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed allowed tokens'); ## TODO: type
6907 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6908
6909 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6910 $self->{line_prev} = $self->{line};
6911 $self->{column_prev} = $self->{column};
6912 $self->{column}++;
6913 $self->{nc}
6914 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6915 } else {
6916 $self->{set_nc}->($self);
6917 }
6918
## Emit the token.  The |redo A| after |return| is never reached.
6919 return ($self->{ct}); # ATTLIST
6920 redo A;
6921 } elsif ($self->{nc} == -1) {
6922 ## XML5: No parse error.
6923 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6924 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6925
## NOTE(review): a character is consumed here although nc is already -1;
## the nc == -1 branch of MD_NAME_STATE reconsumes instead -- confirm.
6926 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6927 $self->{line_prev} = $self->{line};
6928 $self->{column_prev} = $self->{column};
6929 $self->{column}++;
6930 $self->{nc}
6931 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6932 } else {
6933 $self->{set_nc}->($self);
6934 }
6935
6936 return ($self->{ct});
6937 redo A;
6938 } else {
## Any other character extends the most recently added list entry.
6939 $self->{ca}->{tokens}->[-1] .= chr $self->{nc};
6940 ## Stay in the state.
6941
6942 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6943 $self->{line_prev} = $self->{line};
6944 $self->{column_prev} = $self->{column};
6945 $self->{column}++;
6946 $self->{nc}
6947 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6948 } else {
6949 $self->{set_nc}->($self);
6950 }
6951
6952 redo A;
6953 }
6954 } elsif ($self->{state} == AFTER_ALLOWED_TOKEN_STATE) {
## AFTER_ALLOWED_TOKEN_STATE: white space has been seen after an entry of
## the allowed-token list.  Further white space is skipped; "|" and ")"
## continue or close the list; any other ordinary character is reported
## and glued onto the previous entry after a single space.
6955 if ($is_space->{$self->{nc}}) {
6956 ## Stay in the state.
6957
## Consume the next input character: take it from the character buffer
## when one is left, otherwise invoke the set_nc callback.
6958 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6959 $self->{line_prev} = $self->{line};
6960 $self->{column_prev} = $self->{column};
6961 $self->{column}++;
6962 $self->{nc}
6963 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6964 } else {
6965 $self->{set_nc}->($self);
6966 }
6967
6968 redo A;
6969 } elsif ($self->{nc} == 0x007C) { # |
6970 $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
6971
6972 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6973 $self->{line_prev} = $self->{line};
6974 $self->{column_prev} = $self->{column};
6975 $self->{column}++;
6976 $self->{nc}
6977 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6978 } else {
6979 $self->{set_nc}->($self);
6980 }
6981
6982 redo A;
6983 } elsif ($self->{nc} == 0x0029) { # )
6984 $self->{state} = AFTER_ALLOWED_TOKENS_STATE;
6985
6986 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6987 $self->{line_prev} = $self->{line};
6988 $self->{column_prev} = $self->{column};
6989 $self->{column}++;
6990 $self->{nc}
6991 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6992 } else {
6993 $self->{set_nc}->($self);
6994 }
6995
6996 redo A;
6997 } elsif ($self->{nc} == 0x003E) { # >
6998 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed allowed tokens'); ## TODO: type
6999 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7000
7001 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7002 $self->{line_prev} = $self->{line};
7003 $self->{column_prev} = $self->{column};
7004 $self->{column}++;
7005 $self->{nc}
7006 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7007 } else {
7008 $self->{set_nc}->($self);
7009 }
7010
## Emit the token.  The |redo A| after |return| is never reached.
7011 return ($self->{ct}); # ATTLIST
7012 redo A;
7013 } elsif ($self->{nc} == -1) {
7014 ## XML5: No parse error.
7015 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7016 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
7017
## NOTE(review): a character is consumed here although nc is already -1;
## the nc == -1 branch of MD_NAME_STATE reconsumes instead -- confirm.
7018 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7019 $self->{line_prev} = $self->{line};
7020 $self->{column_prev} = $self->{column};
7021 $self->{column}++;
7022 $self->{nc}
7023 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7024 } else {
7025 $self->{set_nc}->($self);
7026 }
7027
7028 return ($self->{ct});
7029 redo A;
7030 } else {
## The error is reported at the previous line/column rather than at the
## current character.  The entry then continues: a single space plus
## the current character are appended to the last list entry.
7031 $self->{parse_error}->(level => $self->{level}->{must}, type => 'space in allowed token', ## TODO: type
7032 line => $self->{line_prev},
7033 column => $self->{column_prev});
7034 $self->{ca}->{tokens}->[-1] .= ' ' . chr $self->{nc};
7035 $self->{state} = ALLOWED_TOKEN_STATE;
7036
7037 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7038 $self->{line_prev} = $self->{line};
7039 $self->{column_prev} = $self->{column};
7040 $self->{column}++;
7041 $self->{nc}
7042 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7043 } else {
7044 $self->{set_nc}->($self);
7045 }
7046
7047 redo A;
7048 }
7049 } elsif ($self->{state} == AFTER_ALLOWED_TOKENS_STATE) {
## AFTER_ALLOWED_TOKENS_STATE: entered once ")" has closed the
## allowed-token list.  White space leads to BEFORE_ATTR_DEFAULT_STATE;
## "#" or a quote directly after ")" is reported as 'no space before
## default value' and then handled; ">" and nc == -1 emit the token.
7050 if ($is_space->{$self->{nc}}) {
7051 $self->{state} = BEFORE_ATTR_DEFAULT_STATE;
7052
## Consume the next input character: take it from the character buffer
## when one is left, otherwise invoke the set_nc callback.
7053 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7054 $self->{line_prev} = $self->{line};
7055 $self->{column_prev} = $self->{column};
7056 $self->{column}++;
7057 $self->{nc}
7058 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7059 } else {
7060 $self->{set_nc}->($self);
7061 }
7062
7063 redo A;
7064 } elsif ($self->{nc} == 0x0023) { # #
7065 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
7066 $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE;
7067
7068 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7069 $self->{line_prev} = $self->{line};
7070 $self->{column_prev} = $self->{column};
7071 $self->{column}++;
7072 $self->{nc}
7073 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7074 } else {
7075 $self->{set_nc}->($self);
7076 }
7077
7078 redo A;
7079 } elsif ($self->{nc} == 0x0022) { # "
7080 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
7081 $self->{ca}->{value} = '';
7082 $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
7083
7084 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7085 $self->{line_prev} = $self->{line};
7086 $self->{column_prev} = $self->{column};
7087 $self->{column}++;
7088 $self->{nc}
7089 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7090 } else {
7091 $self->{set_nc}->($self);
7092 }
7093
7094 redo A;
7095 } elsif ($self->{nc} == 0x0027) { # '
7096 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
7097 $self->{ca}->{value} = '';
7098 $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
7099
7100 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7101 $self->{line_prev} = $self->{line};
7102 $self->{column_prev} = $self->{column};
7103 $self->{column}++;
7104 $self->{nc}
7105 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7106 } else {
7107 $self->{set_nc}->($self);
7108 }
7109
7110 redo A;
7111 } elsif ($self->{nc} == 0x003E) { # >
7112 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr default'); ## TODO: type
7113 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7114
7115 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7116 $self->{line_prev} = $self->{line};
7117 $self->{column_prev} = $self->{column};
7118 $self->{column}++;
7119 $self->{nc}
7120 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7121 } else {
7122 $self->{set_nc}->($self);
7123 }
7124
## Emit the token.  The |redo A| after |return| is never reached.
7125 return ($self->{ct}); # ATTLIST
7126 redo A;
7127 } elsif ($self->{nc} == -1) {
7128 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7129 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7130
## NOTE(review): a character is consumed here although nc is already -1;
## the nc == -1 branch of MD_NAME_STATE reconsumes instead -- confirm.
7131 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7132 $self->{line_prev} = $self->{line};
7133 $self->{column_prev} = $self->{column};
7134 $self->{column}++;
7135 $self->{nc}
7136 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7137 } else {
7138 $self->{set_nc}->($self);
7139 }
7140
7141 return ($self->{ct});
7142 redo A;
7143 } else {
## NOTE(review): unlike the corresponding branch of
## DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_AFTER_STATE, $self->{ca}->{value} is
## not reset to '' before switching to the unquoted value state --
## confirm that this is intended.
7144 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unquoted attr value'); ## TODO: type
7145 $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
7146 ## Reconsume.
7147 redo A;
7148 }
7149 } elsif ($self->{state} == BEFORE_ATTR_DEFAULT_STATE) {
## BEFORE_ATTR_DEFAULT_STATE: skip white space before the attribute
## default and dispatch on what follows: "#" (default declaration
## keyword), a quote (default value), ">" or nc == -1 (token emitted
## with a parse error), or anything else (treated as the start of an
## unquoted default value).
7150 if ($is_space->{$self->{nc}}) {
7151 ## Stay in the state.
7152
## Consume the next input character: take it from the character buffer
## when one is left, otherwise invoke the set_nc callback.
7153 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7154 $self->{line_prev} = $self->{line};
7155 $self->{column_prev} = $self->{column};
7156 $self->{column}++;
7157 $self->{nc}
7158 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7159 } else {
7160 $self->{set_nc}->($self);
7161 }
7162
7163 redo A;
7164 } elsif ($self->{nc} == 0x0023) { # #
7165 $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE;
7166
7167 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7168 $self->{line_prev} = $self->{line};
7169 $self->{column_prev} = $self->{column};
7170 $self->{column}++;
7171 $self->{nc}
7172 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7173 } else {
7174 $self->{set_nc}->($self);
7175 }
7176
7177 redo A;
7178 } elsif ($self->{nc} == 0x0022) { # "
## |value| is reset to the empty string before the quoted-value state
## takes over.
7179 $self->{ca}->{value} = '';
7180 $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
7181
7182 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7183 $self->{line_prev} = $self->{line};
7184 $self->{column_prev} = $self->{column};
7185 $self->{column}++;
7186 $self->{nc}
7187 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7188 } else {
7189 $self->{set_nc}->($self);
7190 }
7191
7192 redo A;
7193 } elsif ($self->{nc} == 0x0027) { # '
7194 $self->{ca}->{value} = '';
7195 $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
7196
7197 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7198 $self->{line_prev} = $self->{line};
7199 $self->{column_prev} = $self->{column};
7200 $self->{column}++;
7201 $self->{nc}
7202 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7203 } else {
7204 $self->{set_nc}->($self);
7205 }
7206
7207 redo A;
7208 } elsif ($self->{nc} == 0x003E) { # >
7209 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr default'); ## TODO: type
7210 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7211
7212 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7213 $self->{line_prev} = $self->{line};
7214 $self->{column_prev} = $self->{column};
7215 $self->{column}++;
7216 $self->{nc}
7217 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7218 } else {
7219 $self->{set_nc}->($self);
7220 }
7221
## Emit the token.  The |redo A| after |return| is never reached.
7222 return ($self->{ct}); # ATTLIST
7223 redo A;
7224 } elsif ($self->{nc} == -1) {
7225 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7226 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7227
## NOTE(review): a character is consumed here although nc is already -1;
## the nc == -1 branch of MD_NAME_STATE reconsumes instead -- confirm.
7228 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7229 $self->{line_prev} = $self->{line};
7230 $self->{column_prev} = $self->{column};
7231 $self->{column}++;
7232 $self->{nc}
7233 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7234 } else {
7235 $self->{set_nc}->($self);
7236 }
7237
7238 return ($self->{ct});
7239 redo A;
7240 } else {
## NOTE(review): unlike the corresponding branch of
## DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_AFTER_STATE, $self->{ca}->{value} is
## not reset to '' before switching to the unquoted value state --
## confirm that this is intended.
7241 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unquoted attr value'); ## TODO: type
7242 $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
7243 ## Reconsume.
7244 redo A;
7245 }
7246 } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE) {
## The branch above switches to this state after consuming "#" (U+0023).
## The first ordinary character seen here starts the default keyword in
## $self->{ca}->{default}; white space, quotes, ">" and -1 (presumably
## the end-of-input marker) are handled as special cases.
7247 if ($is_space->{$self->{nc}}) {
7248 ## XML5: No parse error.
7249 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no default type'); ## TODO: type
7250 $self->{state} = BOGUS_MD_STATE;
7251 ## Reconsume.
7252 redo A;
7253 } elsif ($self->{nc} == 0x0022) { # "
7254 ## XML5: Same as "anything else".
7255 $self->{ca}->{value} = '';
7256 $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
7257
## Advance to the next input character: take it from char_buffer when
## one is available, otherwise let the set_nc callback supply it.
7258 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7259 $self->{line_prev} = $self->{line};
7260 $self->{column_prev} = $self->{column};
7261 $self->{column}++;
7262 $self->{nc}
7263 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7264 } else {
7265 $self->{set_nc}->($self);
7266 }
7267
7268 redo A;
7269 } elsif ($self->{nc} == 0x0027) { # '
7270 ## XML5: Same as "anything else".
7271 $self->{ca}->{value} = '';
7272 $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
7273
7274 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7275 $self->{line_prev} = $self->{line};
7276 $self->{column_prev} = $self->{column};
7277 $self->{column}++;
7278 $self->{nc}
7279 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7280 } else {
7281 $self->{set_nc}->($self);
7282 }
7283
7284 redo A;
7285 } elsif ($self->{nc} == 0x003E) { # >
7286 ## XML5: Same as "anything else".
## The token is returned as it stands; $self->{ca} is not pushed onto
## attrdefs on this path.  NOTE(review): confirm that is intended.
7287 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr default'); ## TODO: type
7288 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7289
7290 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7291 $self->{line_prev} = $self->{line};
7292 $self->{column_prev} = $self->{column};
7293 $self->{column}++;
7294 $self->{nc}
7295 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7296 } else {
7297 $self->{set_nc}->($self);
7298 }
7299
7300 return ($self->{ct}); # ATTLIST
## Not reached: it follows the return above.
7301 redo A;
7302 } elsif ($self->{nc} == -1) {
7303 ## XML5: No parse error.
7304 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7305 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
7306
7307 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7308 $self->{line_prev} = $self->{line};
7309 $self->{column_prev} = $self->{column};
7310 $self->{column}++;
7311 $self->{nc}
7312 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7313 } else {
7314 $self->{set_nc}->($self);
7315 }
7316
7317 return ($self->{ct});
## Not reached: it follows the return above.
7318 redo A;
7319 } else {
## Start a new default keyword with the current character (the "#"
## itself is not stored).
7320 $self->{ca}->{default} = chr $self->{nc};
7321 $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_STATE;
7322
7323 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7324 $self->{line_prev} = $self->{line};
7325 $self->{column_prev} = $self->{column};
7326 $self->{column}++;
7327 $self->{nc}
7328 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7329 } else {
7330 $self->{set_nc}->($self);
7331 }
7332
7333 redo A;
7334 }
7335 } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_STATE) {
## Accumulates the default keyword in $self->{ca}->{default}, one
## character per iteration, until white space, a quote, ">" or -1
## (presumably the end-of-input marker) is seen.
7336 if ($is_space->{$self->{nc}}) {
7337 $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_AFTER_STATE;
7338
## Advance to the next input character: take it from char_buffer when
## one is available, otherwise let the set_nc callback supply it.
7339 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7340 $self->{line_prev} = $self->{line};
7341 $self->{column_prev} = $self->{column};
7342 $self->{column}++;
7343 $self->{nc}
7344 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7345 } else {
7346 $self->{set_nc}->($self);
7347 }
7348
7349 redo A;
7350 } elsif ($self->{nc} == 0x0022) { # "
7351 ## XML5: Same as "anything else".
## A quote directly after the keyword: report the missing white space,
## then start collecting the default value.
7352 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
7353 $self->{ca}->{value} = '';
7354 $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
7355
7356 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7357 $self->{line_prev} = $self->{line};
7358 $self->{column_prev} = $self->{column};
7359 $self->{column}++;
7360 $self->{nc}
7361 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7362 } else {
7363 $self->{set_nc}->($self);
7364 }
7365
7366 redo A;
7367 } elsif ($self->{nc} == 0x0027) { # '
7368 ## XML5: Same as "anything else".
7369 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
7370 $self->{ca}->{value} = '';
7371 $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
7372
7373 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7374 $self->{line_prev} = $self->{line};
7375 $self->{column_prev} = $self->{column};
7376 $self->{column}++;
7377 $self->{nc}
7378 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7379 } else {
7380 $self->{set_nc}->($self);
7381 }
7382
7383 redo A;
7384 } elsif ($self->{nc} == 0x003E) { # >
7385 ## XML5: Same as "anything else".
## End of the declaration: record the attribute definition and return
## the token.
7386 push @{$self->{ct}->{attrdefs}}, $self->{ca};
7387 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7388
7389 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7390 $self->{line_prev} = $self->{line};
7391 $self->{column_prev} = $self->{column};
7392 $self->{column}++;
7393 $self->{nc}
7394 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7395 } else {
7396 $self->{set_nc}->($self);
7397 }
7398
7399 return ($self->{ct}); # ATTLIST
## Not reached: it follows the return above.
7400 redo A;
7401 } elsif ($self->{nc} == -1) {
7402 ## XML5: No parse error.
7403 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7404 push @{$self->{ct}->{attrdefs}}, $self->{ca};
7405 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
7406
7407 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7408 $self->{line_prev} = $self->{line};
7409 $self->{column_prev} = $self->{column};
7410 $self->{column}++;
7411 $self->{nc}
7412 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7413 } else {
7414 $self->{set_nc}->($self);
7415 }
7416
7417 return ($self->{ct});
## Not reached: it follows the return above.
7418 redo A;
7419 } else {
7420 $self->{ca}->{default} .= chr $self->{nc};
7421 ## Stay in the state.
7422
7423 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7424 $self->{line_prev} = $self->{line};
7425 $self->{column_prev} = $self->{column};
7426 $self->{column}++;
7427 $self->{nc}
7428 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7429 } else {
7430 $self->{set_nc}->($self);
7431 }
7432
7433 redo A;
7434 }
7435 } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_AFTER_STATE) {
## Reached from the default-keyword state once white space follows the
## keyword.  Further white space is skipped; a quote starts a default
## value; ">" or -1 (presumably the end-of-input marker) finishes the
## declaration; anything else is decided by the keyword collected in
## $self->{ca}->{default}.
7436 if ($is_space->{$self->{nc}}) {
7437 ## Stay in the state.
7438
## Advance to the next input character: take it from char_buffer when
## one is available, otherwise let the set_nc callback supply it.
7439 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7440 $self->{line_prev} = $self->{line};
7441 $self->{column_prev} = $self->{column};
7442 $self->{column}++;
7443 $self->{nc}
7444 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7445 } else {
7446 $self->{set_nc}->($self);
7447 }
7448
7449 redo A;
7450 } elsif ($self->{nc} == 0x0022) { # "
7451 $self->{ca}->{value} = '';
7452 $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
7453
7454 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7455 $self->{line_prev} = $self->{line};
7456 $self->{column_prev} = $self->{column};
7457 $self->{column}++;
7458 $self->{nc}
7459 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7460 } else {
7461 $self->{set_nc}->($self);
7462 }
7463
7464 redo A;
7465 } elsif ($self->{nc} == 0x0027) { # '
7466 $self->{ca}->{value} = '';
7467 $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
7468
7469 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7470 $self->{line_prev} = $self->{line};
7471 $self->{column_prev} = $self->{column};
7472 $self->{column}++;
7473 $self->{nc}
7474 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7475 } else {
7476 $self->{set_nc}->($self);
7477 }
7478
7479 redo A;
7480 } elsif ($self->{nc} == 0x003E) { # >
## End of the declaration: record the attribute definition and return
## the token.
7481 push @{$self->{ct}->{attrdefs}}, $self->{ca};
7482 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7483
7484 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7485 $self->{line_prev} = $self->{line};
7486 $self->{column_prev} = $self->{column};
7487 $self->{column}++;
7488 $self->{nc}
7489 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7490 } else {
7491 $self->{set_nc}->($self);
7492 }
7493
7494 return ($self->{ct}); # ATTLIST
## Not reached: it follows the return above.
7495 redo A;
7496 } elsif ($self->{nc} == -1) {
7497 ## XML5: No parse error.
7498 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7499 push @{$self->{ct}->{attrdefs}}, $self->{ca};
7500 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
7501
7502 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7503 $self->{line_prev} = $self->{line};
7504 $self->{column_prev} = $self->{column};
7505 $self->{column}++;
7506 $self->{nc}
7507 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7508 } else {
7509 $self->{set_nc}->($self);
7510 }
7511
7512 return ($self->{ct});
## Not reached: it follows the return above.
7513 redo A;
7514 } else {
7515 ## XML5: Not defined yet.
## The comparison is an exact, case-sensitive match against 'FIXED'.
## In that case the current character is treated as the start of an
## unquoted value; otherwise the attribute definition is complete and
## the character is re-examined in DOCTYPE_ATTLIST_NAME_AFTER_STATE.
7516 if ($self->{ca}->{default} eq 'FIXED') {
7517 $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
7518 } else {
7519 push @{$self->{ct}->{attrdefs}}, $self->{ca};
7520 $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
7521 }
7522 ## Reconsume.
7523 redo A;
7524 }
7525 } elsif ($self->{state} == AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE) {
## Both paths hand over to DOCTYPE_ATTLIST_NAME_AFTER_STATE without
## consuming the current character.  The only difference is that a
## parse error is reported first when that character is not white
## space, -1 or ">".
7526 if ($is_space->{$self->{nc}} or
7527 $self->{nc} == -1 or
7528 $self->{nc} == 0x003E) { # >
7529 $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
7530 ## Reconsume.
7531 redo A;
7532 } else {
7533 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before attr name'); ## TODO: type
7534 $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
7535 ## Reconsume.
7536 redo A;
7537 }
7538 } elsif ($self->{state} == NDATA_STATE) {
## Matches the rest of the keyword "NDATA".  $self->{kwd} holds the
## characters matched so far and its length selects the expected next
## letter from the two tables below (upper- and lowercase variants).
## Index 0 is undef, so kwd presumably already holds the initial "N"
## when this state is entered -- TODO confirm against the entering state.
7539 ## ASCII case-insensitive
7540 if ($self->{nc} == [
7541 undef,
7542 0x0044, # D
7543 0x0041, # A
7544 0x0054, # T
7545 ]->[length $self->{kwd}] or
7546 $self->{nc} == [
7547 undef,
7548 0x0064, # d
7549 0x0061, # a
7550 0x0074, # t
7551 ]->[length $self->{kwd}]) {
7552
7553 ## Stay in the state.
7554 $self->{kwd} .= chr $self->{nc};
7555
## Advance to the next input character: take it from char_buffer when
## one is available, otherwise let the set_nc callback supply it.
7556 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7557 $self->{line_prev} = $self->{line};
7558 $self->{column_prev} = $self->{column};
7559 $self->{column}++;
7560 $self->{nc}
7561 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7562 } else {
7563 $self->{set_nc}->($self);
7564 }
7565
7566 redo A;
7567 } elsif ((length $self->{kwd}) == 4 and
7568 ($self->{nc} == 0x0041 or # A
7569 $self->{nc} == 0x0061)) { # a
## Final letter of the keyword.  The keyword is accepted in any case,
## but an error is reported unless it was spelled exactly "NDATA".
7570 if ($self->{kwd} ne 'NDAT' or $self->{nc} == 0x0061) { # a
7571
7572 $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
7573 text => 'NDATA',
7574 line => $self->{line_prev},
7575 column => $self->{column_prev} - 4);
7576 } else {
7577
7578 }
7579 $self->{state} = AFTER_NDATA_STATE;
7580
7581 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7582 $self->{line_prev} = $self->{line};
7583 $self->{column_prev} = $self->{column};
7584 $self->{column}++;
7585 $self->{nc}
7586 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7587 } else {
7588 $self->{set_nc}->($self);
7589 }
7590
7591 redo A;
7592 } else {
## Not the keyword: report the error at a column computed back from
## the length of what was matched, then let BOGUS_MD_STATE re-examine
## the current character.
7593 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after literal', ## TODO: type
7594 line => $self->{line_prev},
7595 column => $self->{column_prev} + 1
7596 - length $self->{kwd});
7597
7598 $self->{state} = BOGUS_MD_STATE;
7599 ## Reconsume.
7600 redo A;
7601 }
7602 } elsif ($self->{state} == AFTER_NDATA_STATE) {
## Entered from NDATA_STATE once the keyword has been consumed.  White
## space leads on to the notation name; ">" or -1 (presumably the
## end-of-input marker) ends the declaration with an error; any other
## character is an error handled by BOGUS_MD_STATE.
7603 if ($is_space->{$self->{nc}}) {
7604 $self->{state} = BEFORE_NOTATION_NAME_STATE;
7605
## Advance to the next input character: take it from char_buffer when
## one is available, otherwise let the set_nc callback supply it.
7606 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7607 $self->{line_prev} = $self->{line};
7608 $self->{column_prev} = $self->{column};
7609 $self->{column}++;
7610 $self->{nc}
7611 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7612 } else {
7613 $self->{set_nc}->($self);
7614 }
7615
7616 redo A;
7617 } elsif ($self->{nc} == 0x003E) { # >
7618 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no notation name'); ## TODO: type
7619 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7620
7621 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7622 $self->{line_prev} = $self->{line};
7623 $self->{column_prev} = $self->{column};
7624 $self->{column}++;
7625 $self->{nc}
7626 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7627 } else {
7628 $self->{set_nc}->($self);
7629 }
7630
7631 return ($self->{ct}); # ENTITY
## Not reached: it follows the return above.
7632 redo A;
7633 } elsif ($self->{nc} == -1) {
7634 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7635 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7636
7637 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7638 $self->{line_prev} = $self->{line};
7639 $self->{column_prev} = $self->{column};
7640 $self->{column}++;
7641 $self->{nc}
7642 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7643 } else {
7644 $self->{set_nc}->($self);
7645 }
7646
7647 return ($self->{ct}); # ENTITY
## Not reached: it follows the return above.
7648 redo A;
7649 } else {
## The error column is computed back from the length of $self->{kwd}.
7650 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after literal', ## TODO: type
7651 line => $self->{line_prev},
7652 column => $self->{column_prev} + 1
7653 - length $self->{kwd});
7654 $self->{state} = BOGUS_MD_STATE;
7655 ## Reconsume.
7656 redo A;
7657 }
7658 } elsif ($self->{state} == BEFORE_NOTATION_NAME_STATE) {
## Skips white space in front of the notation name.  The first other
## character starts $self->{ct}->{notation}, unless it is ">" or -1
## (presumably the end-of-input marker), which end the declaration
## with an error.
7659 if ($is_space->{$self->{nc}}) {
7660 ## Stay in the state.
7661
## Advance to the next input character: take it from char_buffer when
## one is available, otherwise let the set_nc callback supply it.
7662 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7663 $self->{line_prev} = $self->{line};
7664 $self->{column_prev} = $self->{column};
7665 $self->{column}++;
7666 $self->{nc}
7667 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7668 } else {
7669 $self->{set_nc}->($self);
7670 }
7671
7672 redo A;
7673 } elsif ($self->{nc} == 0x003E) { # >
7674 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no notation name'); ## TODO: type
7675 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7676
7677 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7678 $self->{line_prev} = $self->{line};
7679 $self->{column_prev} = $self->{column};
7680 $self->{column}++;
7681 $self->{nc}
7682 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7683 } else {
7684 $self->{set_nc}->($self);
7685 }
7686
7687 return ($self->{ct}); # ENTITY
## Not reached: it follows the return above.
7688 redo A;
7689 } elsif ($self->{nc} == -1) {
7690 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7691 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7692
7693 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7694 $self->{line_prev} = $self->{line};
7695 $self->{column_prev} = $self->{column};
7696 $self->{column}++;
7697 $self->{nc}
7698 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7699 } else {
7700 $self->{set_nc}->($self);
7701 }
7702
7703 return ($self->{ct}); # ENTITY
## Not reached: it follows the return above.
7704 redo A;
7705 } else {
## First character of the notation name.
7706 $self->{ct}->{notation} = chr $self->{nc}; # ENTITY
7707 $self->{state} = NOTATION_NAME_STATE;
7708
7709 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7710 $self->{line_prev} = $self->{line};
7711 $self->{column_prev} = $self->{column};
7712 $self->{column}++;
7713 $self->{nc}
7714 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7715 } else {
7716 $self->{set_nc}->($self);
7717 }
7718
7719 redo A;
7720 }
7721 } elsif ($self->{state} == NOTATION_NAME_STATE) {
## Appends characters to $self->{ct}->{notation} until white space
## (continue in AFTER_MD_DEF_STATE), ">" (return the token) or -1
## (presumably the end-of-input marker; return the token after
## reporting an error).
7722 if ($is_space->{$self->{nc}}) {
7723 $self->{state} = AFTER_MD_DEF_STATE;
7724
## Advance to the next input character: take it from char_buffer when
## one is available, otherwise let the set_nc callback supply it.
7725 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7726 $self->{line_prev} = $self->{line};
7727 $self->{column_prev} = $self->{column};
7728 $self->{column}++;
7729 $self->{nc}
7730 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7731 } else {
7732 $self->{set_nc}->($self);
7733 }
7734
7735 redo A;
7736 } elsif ($self->{nc} == 0x003E) { # >
7737 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7738
7739 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7740 $self->{line_prev} = $self->{line};
7741 $self->{column_prev} = $self->{column};
7742 $self->{column}++;
7743 $self->{nc}
7744 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7745 } else {
7746 $self->{set_nc}->($self);
7747 }
7748
7749 return ($self->{ct}); # ENTITY
## Not reached: it follows the return above.
7750 redo A;
7751 } elsif ($self->{nc} == -1) {
7752 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7753 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7754
7755 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7756 $self->{line_prev} = $self->{line};
7757 $self->{column_prev} = $self->{column};
7758 $self->{column}++;
7759 $self->{nc}
7760 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7761 } else {
7762 $self->{set_nc}->($self);
7763 }
7764
7765 return ($self->{ct}); # ENTITY
## Not reached: it follows the return above.
7766 redo A;
7767 } else {
7768 $self->{ct}->{notation} .= chr $self->{nc}; # ENTITY
7769 ## Stay in the state.
7770
7771 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7772 $self->{line_prev} = $self->{line};
7773 $self->{column_prev} = $self->{column};
7774 $self->{column}++;
7775 $self->{nc}
7776 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7777 } else {
7778 $self->{set_nc}->($self);
7779 }
7780
7781 redo A;
7782 }
7783 } elsif ($self->{state} == DOCTYPE_ENTITY_VALUE_DOUBLE_QUOTED_STATE) {
## Collects a double-quoted entity value into $self->{ct}->{value}.
## The closing quote ends the value, "&" is delegated to
## ENTITY_VALUE_ENTITY_STATE, and -1 (presumably the end-of-input
## marker) returns the token early with an error.
7784 if ($self->{nc} == 0x0022) { # "
7785 $self->{state} = AFTER_MD_DEF_STATE;
7786
## Advance to the next input character: take it from char_buffer when
## one is available, otherwise let the set_nc callback supply it.
7787 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7788 $self->{line_prev} = $self->{line};
7789 $self->{column_prev} = $self->{column};
7790 $self->{column}++;
7791 $self->{nc}
7792 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7793 } else {
7794 $self->{set_nc}->($self);
7795 }
7796
7797 redo A;
7798 } elsif ($self->{nc} == 0x0026) { # &
## Remember this state so the entity state can return to it, and pass
## the active quote character along in entity_add.
7799 $self->{prev_state} = $self->{state};
7800 $self->{state} = ENTITY_VALUE_ENTITY_STATE;
7801 $self->{entity_add} = 0x0022; # "
7802
7803 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7804 $self->{line_prev} = $self->{line};
7805 $self->{column_prev} = $self->{column};
7806 $self->{column}++;
7807 $self->{nc}
7808 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7809 } else {
7810 $self->{set_nc}->($self);
7811 }
7812
7813 redo A;
7814 ## TODO: %
7815 } elsif ($self->{nc} == -1) {
## Unlike the other branches, nothing is consumed here before the
## token is returned.
7816 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed entity value'); ## TODO: type
7817 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7818 ## Reconsume.
7819 return ($self->{ct}); # ENTITY
## Not reached: it follows the return above.
7820 redo A;
7821 } else {
## Ordinary character: append it; the state does not change.
7822 $self->{ct}->{value} .= chr $self->{nc}; # ENTITY
7823
7824 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7825 $self->{line_prev} = $self->{line};
7826 $self->{column_prev} = $self->{column};
7827 $self->{column}++;
7828 $self->{nc}
7829 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7830 } else {
7831 $self->{set_nc}->($self);
7832 }
7833
7834 redo A;
7835 }
7836 } elsif ($self->{state} == DOCTYPE_ENTITY_VALUE_SINGLE_QUOTED_STATE) {
## Collects a single-quoted entity value into $self->{ct}->{value}.
## The closing quote ends the value, "&" is delegated to
## ENTITY_VALUE_ENTITY_STATE, and -1 (presumably the end-of-input
## marker) returns the token early with an error.
7837 if ($self->{nc} == 0x0027) { # '
7838 $self->{state} = AFTER_MD_DEF_STATE;
7839
## Advance to the next input character: take it from char_buffer when
## one is available, otherwise let the set_nc callback supply it.
7840 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7841 $self->{line_prev} = $self->{line};
7842 $self->{column_prev} = $self->{column};
7843 $self->{column}++;
7844 $self->{nc}
7845 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7846 } else {
7847 $self->{set_nc}->($self);
7848 }
7849
7850 redo A;
7851 } elsif ($self->{nc} == 0x0026) { # &
## Remember this state so the entity state can return to it, and pass
## the active quote character along in entity_add.
7852 $self->{prev_state} = $self->{state};
7853 $self->{state} = ENTITY_VALUE_ENTITY_STATE;
7854 $self->{entity_add} = 0x0027; # '
7855
7856 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7857 $self->{line_prev} = $self->{line};
7858 $self->{column_prev} = $self->{column};
7859 $self->{column}++;
7860 $self->{nc}
7861 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7862 } else {
7863 $self->{set_nc}->($self);
7864 }
7865
7866 redo A;
7867 ## TODO: %
7868 } elsif ($self->{nc} == -1) {
## Unlike the other branches, nothing is consumed here before the
## token is returned.
7869 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed entity value'); ## TODO: type
7870 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7871 ## Reconsume.
7872 return ($self->{ct}); # ENTITY
## Not reached: it follows the return above.
7873 redo A;
7874 } else {
## Ordinary character: append it; the state does not change.
7875 $self->{ct}->{value} .= chr $self->{nc}; # ENTITY
7876
7877 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7878 $self->{line_prev} = $self->{line};
7879 $self->{column_prev} = $self->{column};
7880 $self->{column}++;
7881 $self->{nc}
7882 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7883 } else {
7884 $self->{set_nc}->($self);
7885 }
7886
7887 redo A;
7888 }
7889 } elsif ($self->{state} == ENTITY_VALUE_ENTITY_STATE) {
## Handles the character that follows "&" inside an entity value.
## Only "#" leaves through its own path (to ENTITY_HASH_STATE).  Every
## other case falls through to the code after the if-chain, which
## stores a literal "&" in the value and resumes the quoted-value
## state saved in $self->{prev_state}.
7890 ## TODO: XMLize
7891
7892 if ($is_space->{$self->{nc}} or
7893 {
7894 0x003C => 1, 0x0026 => 1, -1 => 1, # <, &
7895 $self->{entity_add} => 1,
7896 }->{$self->{nc}}) {
7897 ## Don't consume
7898 ## No error
7899 ## Return nothing.
7900 #
7901 } elsif ($self->{nc} == 0x0023) { # #
## Point ca at the current token; presumably the character reference
## states write their result through $self->{ca} -- TODO confirm.
7902 $self->{ca} = $self->{ct};
7903 $self->{state} = ENTITY_HASH_STATE;
7904 $self->{kwd} = '#';
7905
## Advance to the next input character: take it from char_buffer when
## one is available, otherwise let the set_nc callback supply it.
7906 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7907 $self->{line_prev} = $self->{line};
7908 $self->{column_prev} = $self->{column};
7909 $self->{column}++;
7910 $self->{nc}
7911 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7912 } else {
7913 $self->{set_nc}->($self);
7914 }
7915
7916 redo A;
7917 } elsif ((0x0041 <= $self->{nc} and
7918 $self->{nc} <= 0x005A) or # A..Z
7919 (0x0061 <= $self->{nc} and
7920 $self->{nc} <= 0x007A)) { # a..z
## An ASCII letter after "&" is not parsed as an entity name here; it
## falls through without an error.
7921 #
7922 } else {
7923 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare ero');
7924 ## Return nothing.
7925 #
7926 }
7927
## Common tail for all branches except "#": keep the "&" as literal
## text and go back to the saved state.
7928 $self->{ct}->{value} .= '&';
7929 $self->{state} = $self->{prev_state};
7930 ## Reconsume.
7931 redo A;
7932 } elsif ($self->{state} == AFTER_ELEMENT_NAME_STATE) {
## Decides how the content part of an ELEMENT token starts.  "(" opens
## a content model group; ">" or -1 (presumably the end-of-input
## marker) returns the token with an error; any other non-space
## character starts a content keyword.  $self->{ct}->{content} is an
## array reference of content-model pieces.
7933 if ($is_space->{$self->{nc}}) {
7934 $self->{state} = BEFORE_ELEMENT_CONTENT_STATE;
7935
## Advance to the next input character: take it from char_buffer when
## one is available, otherwise let the set_nc callback supply it.
7936 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7937 $self->{line_prev} = $self->{line};
7938 $self->{column_prev} = $self->{column};
7939 $self->{column}++;
7940 $self->{nc}
7941 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7942 } else {
7943 $self->{set_nc}->($self);
7944 }
7945
7946 redo A;
7947 } elsif ($self->{nc} == 0x0028) { # (
## Start the content list with the opening parenthesis and begin
## counting open groups.
7948 $self->{state} = AFTER_CM_GROUP_OPEN_STATE;
7949 $self->{ct}->{content} = ['('];
7950 $self->{group_depth} = 1;
7951
7952 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7953 $self->{line_prev} = $self->{line};
7954 $self->{column_prev} = $self->{column};
7955 $self->{column}++;
7956 $self->{nc}
7957 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7958 } else {
7959 $self->{set_nc}->($self);
7960 }
7961
7962 redo A;
7963 } elsif ($self->{nc} == 0x003E) { # >
7964 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no md def'); ## TODO: type
7965 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7966
7967 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7968 $self->{line_prev} = $self->{line};
7969 $self->{column_prev} = $self->{column};
7970 $self->{column}++;
7971 $self->{nc}
7972 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7973 } else {
7974 $self->{set_nc}->($self);
7975 }
7976
7977 return ($self->{ct}); # ELEMENT
## Not reached: it follows the return above.
7978 redo A;
7979 } elsif ($self->{nc} == -1) {
7980 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7981 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7982
7983 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7984 $self->{line_prev} = $self->{line};
7985 $self->{column_prev} = $self->{column};
7986 $self->{column}++;
7987 $self->{nc}
7988 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7989 } else {
7990 $self->{set_nc}->($self);
7991 }
7992
7993 return ($self->{ct}); # ELEMENT
## Not reached: it follows the return above.
7994 redo A;
7995 } else {
## The content list starts with a single item holding this character;
## CONTENT_KEYWORD_STATE appends to it.
7996 $self->{ct}->{content} = [chr $self->{nc}];
7997 $self->{state} = CONTENT_KEYWORD_STATE;
7998
7999 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8000 $self->{line_prev} = $self->{line};
8001 $self->{column_prev} = $self->{column};
8002 $self->{column}++;
8003 $self->{nc}
8004 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8005 } else {
8006 $self->{set_nc}->($self);
8007 }
8008
8009 redo A;
8010 }
8011 } elsif ($self->{state} == CONTENT_KEYWORD_STATE) {
## Appends characters to the last item of $self->{ct}->{content} until
## white space (continue in AFTER_MD_DEF_STATE), ">" (return the
## token) or -1 (presumably the end-of-input marker; return the token
## after reporting an error).
8012 if ($is_space->{$self->{nc}}) {
8013 $self->{state} = AFTER_MD_DEF_STATE;
8014
## Advance to the next input character: take it from char_buffer when
## one is available, otherwise let the set_nc callback supply it.
8015 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8016 $self->{line_prev} = $self->{line};
8017 $self->{column_prev} = $self->{column};
8018 $self->{column}++;
8019 $self->{nc}
8020 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8021 } else {
8022 $self->{set_nc}->($self);
8023 }
8024
8025 redo A;
8026 } elsif ($self->{nc} == 0x003E) { # >
8027 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8028
8029 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8030 $self->{line_prev} = $self->{line};
8031 $self->{column_prev} = $self->{column};
8032 $self->{column}++;
8033 $self->{nc}
8034 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8035 } else {
8036 $self->{set_nc}->($self);
8037 }
8038
8039 return ($self->{ct}); # ELEMENT
## Not reached: it follows the return above.
8040 redo A;
8041 } elsif ($self->{nc} == -1) {
8042 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
8043 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8044
8045 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8046 $self->{line_prev} = $self->{line};
8047 $self->{column_prev} = $self->{column};
8048 $self->{column}++;
8049 $self->{nc}
8050 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8051 } else {
8052 $self->{set_nc}->($self);
8053 }
8054
8055 return ($self->{ct}); # ELEMENT
## Not reached: it follows the return above.
8056 redo A;
8057 } else {
8058 $self->{ct}->{content}->[-1] .= chr $self->{nc}; # ELEMENT
8059 ## Stay in the state.
8060
8061 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8062 $self->{line_prev} = $self->{line};
8063 $self->{column_prev} = $self->{column};
8064 $self->{column}++;
8065 $self->{nc}
8066 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8067 } else {
8068 $self->{set_nc}->($self);
8069 }
8070
8071 redo A;
8072 }
8073 } elsif ($self->{state} == AFTER_CM_GROUP_OPEN_STATE) {
## Position inside a content model group where an element name or a
## nested "(" is expected.  $self->{group_depth} counts the groups
## that are currently open, and $self->{ct}->{content} receives one
## list item per piece of the content model.
8074 if ($is_space->{$self->{nc}}) {
8075 ## Stay in the state.
8076
## Advance to the next input character: take it from char_buffer when
## one is available, otherwise let the set_nc callback supply it.
8077 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8078 $self->{line_prev} = $self->{line};
8079 $self->{column_prev} = $self->{column};
8080 $self->{column}++;
8081 $self->{nc}
8082 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8083 } else {
8084 $self->{set_nc}->($self);
8085 }
8086
8087 redo A;
8088 } elsif ($self->{nc} == 0x0028) { # (
## Nested group: one level deeper, and record the parenthesis.
8089 $self->{group_depth}++;
8090 push @{$self->{ct}->{content}}, chr $self->{nc};
8091 ## Stay in the state.
8092
8093 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8094 $self->{line_prev} = $self->{line};
8095 $self->{column_prev} = $self->{column};
8096 $self->{column}++;
8097 $self->{nc}
8098 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8099 } else {
8100 $self->{set_nc}->($self);
8101 }
8102
8103 redo A;
8104 } elsif ($self->{nc} == 0x007C or # |
8105 $self->{nc} == 0x002C) { # ,
## A separator where a name was expected: report it and drop the
## separator (nothing is pushed onto the content list).
8106 $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty element name'); ## TODO: type
8107 ## Stay in the state.
8108
8109 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8110 $self->{line_prev} = $self->{line};
8111 $self->{column_prev} = $self->{column};
8112 $self->{column}++;
8113 $self->{nc}
8114 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8115 } else {
8116 $self->{set_nc}->($self);
8117 }
8118
8119 redo A;
8120 } elsif ($self->{nc} == 0x0029) { # )
## A group closed before any name was seen: report it, but still
## record the ")" and close one level.
8121 $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty element name'); ## TODO: type
8122 push @{$self->{ct}->{content}}, chr $self->{nc};
8123 $self->{group_depth}--;
8124 $self->{state} = AFTER_CM_GROUP_CLOSE_STATE;
8125
8126 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8127 $self->{line_prev} = $self->{line};
8128 $self->{column_prev} = $self->{column};
8129 $self->{column}++;
8130 $self->{nc}
8131 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8132 } else {
8133 $self->{set_nc}->($self);
8134 }
8135
8136 redo A;
8137 } elsif ($self->{nc} == 0x003E) { # >
## Declaration ended with groups still open: append one ")" for each
## open group before returning the token.
8138 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed cm group'); ## TODO: type
8139 push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8140 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8141
8142 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8143 $self->{line_prev} = $self->{line};
8144 $self->{column_prev} = $self->{column};
8145 $self->{column}++;
8146 $self->{nc}
8147 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8148 } else {
8149 $self->{set_nc}->($self);
8150 }
8151
8152 return ($self->{ct}); # ELEMENT
## Not reached: it follows the return above.
8153 redo A;
8154 } elsif ($self->{nc} == -1) {
## -1 is presumably the end-of-input marker; open groups are closed
## the same way as for ">".
8155 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
8156 push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8157 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8158
8159 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8160 $self->{line_prev} = $self->{line};
8161 $self->{column_prev} = $self->{column};
8162 $self->{column}++;
8163 $self->{nc}
8164 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8165 } else {
8166 $self->{set_nc}->($self);
8167 }
8168
8169 return ($self->{ct}); # ELEMENT
## Not reached: it follows the return above.
8170 redo A;
8171 } else {
## First character of an element name: it becomes a new content item.
8172 push @{$self->{ct}->{content}}, chr $self->{nc};
8173 $self->{state} = CM_ELEMENT_NAME_STATE;
8174
8175 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8176 $self->{line_prev} = $self->{line};
8177 $self->{column_prev} = $self->{column};
8178 $self->{column}++;
8179 $self->{nc}
8180 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8181 } else {
8182 $self->{set_nc}->($self);
8183 }
8184
8185 redo A;
8186 }
8187 } elsif ($self->{state} == CM_ELEMENT_NAME_STATE) {
8188 if ($is_space->{$self->{nc}}) {
8189 $self->{state} = AFTER_CM_ELEMENT_NAME_STATE;
8190
8191 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8192 $self->{line_prev} = $self->{line};
8193 $self->{column_prev} = $self->{column};
8194 $self->{column}++;
8195 $self->{nc}
8196 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8197 } else {
8198 $self->{set_nc}->($self);
8199 }
8200
8201 redo A;
8202 } elsif ($self->{nc} == 0x002A or # *
8203 $self->{nc} == 0x002B or # +
8204 $self->{nc} == 0x003F) { # ?
8205 push @{$self->{ct}->{content}}, chr $self->{nc};
8206 $self->{state} = AFTER_CM_ELEMENT_NAME_STATE;
8207
8208 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8209 $self->{line_prev} = $self->{line};
8210 $self->{column_prev} = $self->{column};
8211 $self->{column}++;
8212 $self->{nc}
8213 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8214 } else {
8215 $self->{set_nc}->($self);
8216 }
8217
8218 redo A;
8219 } elsif ($self->{nc} == 0x007C or # |
8220 $self->{nc} == 0x002C) { # ,
8221 push @{$self->{ct}->{content}}, $self->{nc} == 0x007C ? ' | ' : ', ';
8222 $self->{state} = AFTER_CM_GROUP_OPEN_STATE;
8223
8224 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8225 $self->{line_prev} = $self->{line};
8226 $self->{column_prev} = $self->{column};
8227 $self->{column}++;
8228 $self->{nc}
8229 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8230 } else {
8231 $self->{set_nc}->($self);
8232 }
8233
8234 redo A;
8235 } elsif ($self->{nc} == 0x0029) { # )
8236 $self->{group_depth}--;
8237 push @{$self->{ct}->{content}}, chr $self->{nc};
8238 $self->{state} = AFTER_CM_GROUP_CLOSE_STATE;
8239
8240 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8241 $self->{line_prev} = $self->{line};
8242 $self->{column_prev} = $self->{column};
8243 $self->{column}++;
8244 $self->{nc}
8245 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8246 } else {
8247 $self->{set_nc}->($self);
8248 }
8249
8250 redo A;
8251 } elsif ($self->{nc} == 0x003E) { # >
8252 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed cm group'); ## TODO: type
8253 push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8254 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8255
8256 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8257 $self->{line_prev} = $self->{line};
8258 $self->{column_prev} = $self->{column};
8259 $self->{column}++;
8260 $self->{nc}
8261 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8262 } else {
8263 $self->{set_nc}->($self);
8264 }
8265
8266 return ($self->{ct}); # ELEMENT
8267 redo A;
8268 } elsif ($self->{nc} == -1) {
8269 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
8270 push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8271 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8272
8273 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8274 $self->{line_prev} = $self->{line};
8275 $self->{column_prev} = $self->{column};
8276 $self->{column}++;
8277 $self->{nc}
8278 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8279 } else {
8280 $self->{set_nc}->($self);
8281 }
8282
8283 return ($self->{ct}); # ELEMENT
8284 redo A;
8285 } else {
8286 $self->{ct}->{content}->[-1] .= chr $self->{nc};
8287 ## Stay in the state.
8288
8289 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8290 $self->{line_prev} = $self->{line};
8291 $self->{column_prev} = $self->{column};
8292 $self->{column}++;
8293 $self->{nc}
8294 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8295 } else {
8296 $self->{set_nc}->($self);
8297 }
8298
8299 redo A;
8300 }
8301 } elsif ($self->{state} == AFTER_CM_ELEMENT_NAME_STATE) {
8302 if ($is_space->{$self->{nc}}) {
8303 ## Stay in the state.
8304
8305 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8306 $self->{line_prev} = $self->{line};
8307 $self->{column_prev} = $self->{column};
8308 $self->{column}++;
8309 $self->{nc}
8310 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8311 } else {
8312 $self->{set_nc}->($self);
8313 }
8314
8315 redo A;
8316 } elsif ($self->{nc} == 0x007C or # |
8317 $self->{nc} == 0x002C) { # ,
8318 push @{$self->{ct}->{content}}, $self->{nc} == 0x007C ? ' | ' : ', ';
8319 $self->{state} = AFTER_CM_GROUP_OPEN_STATE;
8320
8321 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8322 $self->{line_prev} = $self->{line};
8323 $self->{column_prev} = $self->{column};
8324 $self->{column}++;
8325 $self->{nc}
8326 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8327 } else {
8328 $self->{set_nc}->($self);
8329 }
8330
8331 redo A;
8332 } elsif ($self->{nc} == 0x0029) { # )
8333 $self->{group_depth}--;
8334 push @{$self->{ct}->{content}}, chr $self->{nc};
8335 $self->{state} = AFTER_CM_GROUP_CLOSE_STATE;
8336
8337 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8338 $self->{line_prev} = $self->{line};
8339 $self->{column_prev} = $self->{column};
8340 $self->{column}++;
8341 $self->{nc}
8342 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8343 } else {
8344 $self->{set_nc}->($self);
8345 }
8346
8347 redo A;
8348 } elsif ($self->{nc} == 0x003E) { # >
8349 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed cm group'); ## TODO: type
8350 push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8351 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8352
8353 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8354 $self->{line_prev} = $self->{line};
8355 $self->{column_prev} = $self->{column};
8356 $self->{column}++;
8357 $self->{nc}
8358 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8359 } else {
8360 $self->{set_nc}->($self);
8361 }
8362
8363 return ($self->{ct}); # ELEMENT
8364 redo A;
8365 } elsif ($self->{nc} == -1) {
8366 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
8367 push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8368 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8369
8370 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8371 $self->{line_prev} = $self->{line};
8372 $self->{column_prev} = $self->{column};
8373 $self->{column}++;
8374 $self->{nc}
8375 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8376 } else {
8377 $self->{set_nc}->($self);
8378 }
8379
8380 return ($self->{ct}); # ELEMENT
8381 redo A;
8382 } else {
8383 $self->{parse_error}->(level => $self->{level}->{must}, type => 'after element name'); ## TODO: type
8384 push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8385 $self->{state} = BOGUS_MD_STATE;
8386
8387 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8388 $self->{line_prev} = $self->{line};
8389 $self->{column_prev} = $self->{column};
8390 $self->{column}++;
8391 $self->{nc}
8392 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8393 } else {
8394 $self->{set_nc}->($self);
8395 }
8396
8397 redo A;
8398 }
8399 } elsif ($self->{state} == AFTER_CM_GROUP_CLOSE_STATE) {
8400 if ($is_space->{$self->{nc}}) {
8401 if ($self->{group_depth}) {
8402 $self->{state} = AFTER_CM_ELEMENT_NAME_STATE;
8403 } else {
8404 $self->{state} = AFTER_MD_DEF_STATE;
8405 }
8406
8407 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8408 $self->{line_prev} = $self->{line};
8409 $self->{column_prev} = $self->{column};
8410 $self->{column}++;
8411 $self->{nc}
8412 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8413 } else {
8414 $self->{set_nc}->($self);
8415 }
8416
8417 redo A;
8418 } elsif ($self->{nc} == 0x002A or # *
8419 $self->{nc} == 0x002B or # +
8420 $self->{nc} == 0x003F) { # ?
8421 push @{$self->{ct}->{content}}, chr $self->{nc};
8422 if ($self->{group_depth}) {
8423 $self->{state} = AFTER_CM_ELEMENT_NAME_STATE;
8424 } else {
8425 $self->{state} = AFTER_MD_DEF_STATE;
8426 }
8427
8428 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8429 $self->{line_prev} = $self->{line};
8430 $self->{column_prev} = $self->{column};
8431 $self->{column}++;
8432 $self->{nc}
8433 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8434 } else {
8435 $self->{set_nc}->($self);
8436 }
8437
8438 redo A;
8439 } elsif ($self->{nc} == 0x0029) { # )
8440 if ($self->{group_depth}) {
8441 $self->{group_depth}--;
8442 push @{$self->{ct}->{content}}, chr $self->{nc};
8443 ## Stay in the state.
8444
8445 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8446 $self->{line_prev} = $self->{line};
8447 $self->{column_prev} = $self->{column};
8448 $self->{column}++;
8449 $self->{nc}
8450 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8451 } else {
8452 $self->{set_nc}->($self);
8453 }
8454
8455 redo A;
8456 } else {
8457 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after md def'); ## TODO: type
8458 $self->{state} = BOGUS_MD_STATE;
8459 ## Reconsume.
8460 redo A;
8461 }
8462 } elsif ($self->{nc} == 0x003E) { # >
8463 if ($self->{group_depth}) {
8464 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed cm group'); ## TODO: type
8465 push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8466 }
8467 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8468
8469 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8470 $self->{line_prev} = $self->{line};
8471 $self->{column_prev} = $self->{column};
8472 $self->{column}++;
8473 $self->{nc}
8474 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8475 } else {
8476 $self->{set_nc}->($self);
8477 }
8478
8479 return ($self->{ct}); # ELEMENT
8480 redo A;
8481 } elsif ($self->{nc} == -1) {
8482 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
8483 push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8484 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8485
8486 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8487 $self->{line_prev} = $self->{line};
8488 $self->{column_prev} = $self->{column};
8489 $self->{column}++;
8490 $self->{nc}
8491 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8492 } else {
8493 $self->{set_nc}->($self);
8494 }
8495
8496 return ($self->{ct}); # ELEMENT
8497 redo A;
8498 } else {
8499 if ($self->{group_depth}) {
8500 $self->{state} = AFTER_CM_ELEMENT_NAME_STATE;
8501 } else {
8502 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after md def'); ## TODO: type
8503 $self->{state} = BOGUS_MD_STATE;
8504 }
8505 ## Reconsume.
8506 redo A;
8507 }
8508 } elsif ($self->{state} == AFTER_MD_DEF_STATE) {
8509 if ($is_space->{$self->{nc}}) {
8510 ## Stay in the state.
8511
8512 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8513 $self->{line_prev} = $self->{line};
8514 $self->{column_prev} = $self->{column};
8515 $self->{column}++;
8516 $self->{nc}
8517 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8518 } else {
8519 $self->{set_nc}->($self);
8520 }
8521
8522 redo A;
8523 } elsif ($self->{nc} == 0x003E) { # >
8524 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8525
8526 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8527 $self->{line_prev} = $self->{line};
8528 $self->{column_prev} = $self->{column};
8529 $self->{column}++;
8530 $self->{nc}
8531 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8532 } else {
8533 $self->{set_nc}->($self);
8534 }
8535
8536 return ($self->{ct}); # ENTITY/ELEMENT
8537 redo A;
8538 } elsif ($self->{nc} == -1) {
8539 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
8540 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8541
8542 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8543 $self->{line_prev} = $self->{line};
8544 $self->{column_prev} = $self->{column};
8545 $self->{column}++;
8546 $self->{nc}
8547 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8548 } else {
8549 $self->{set_nc}->($self);
8550 }
8551
8552 return ($self->{ct}); # ENTITY/ELEMENT
8553 redo A;
8554 } else {
8555 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after md def'); ## TODO: type
8556 $self->{state} = BOGUS_MD_STATE;
8557 ## Reconsume.
8558 redo A;
8559 }
8560 } elsif ($self->{state} == BOGUS_MD_STATE) {
8561 if ($self->{nc} == 0x003E) { # >
8562 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8563
8564 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8565 $self->{line_prev} = $self->{line};
8566 $self->{column_prev} = $self->{column};
8567 $self->{column}++;
8568 $self->{nc}
8569 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8570 } else {
8571 $self->{set_nc}->($self);
8572 }
8573
8574 return ($self->{ct}); # ATTLIST/ELEMENT/ENTITY/NOTATION
8575 redo A;
8576 } elsif ($self->{nc} == -1) {
8577 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8578 ## Reconsume.
8579 return ($self->{ct}); # ATTLIST/ELEMENT/ENTITY/NOTATION
8580 redo A;
8581 } else {
8582 ## Stay in the state.
8583
8584 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8585 $self->{line_prev} = $self->{line};
8586 $self->{column_prev} = $self->{column};
8587 $self->{column}++;
8588 $self->{nc}
8589 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8590 } else {
8591 $self->{set_nc}->($self);
8592 }
8593
8594 redo A;
8595 }
8596 } else {
8597 die "$0: $self->{state}: Unknown state";
8598 }
8599 } # A
8600
8601 die "$0: _get_next_token: unexpected case";
8602 } # _get_next_token
8603
8604 1;
8605 ## $Date: 2008/10/19 08:20:29 $
8606

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24