/[suikacvs]/markup/html/whatpm/Whatpm/HTML/Tokenizer.pm
Suika

Contents of /markup/html/whatpm/Whatpm/HTML/Tokenizer.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.33 - (show annotations) (download)
Sat Sep 5 10:41:07 2009 UTC (16 years, 6 months ago) by wakaba
Branch: MAIN
Changes since 1.32: +52 -17 lines
++ whatpm/t/ChangeLog	5 Sep 2009 10:40:03 -0000
	* tokenizer-test-1.test: Updated test results on unclosed start
	and end tags (HTML5 revision 2990).

2009-09-05  Wakaba  <wakaba@suika.fam.cx>

++ whatpm/t/xml/ChangeLog	5 Sep 2009 10:40:48 -0000
2009-09-05  Wakaba  <wakaba@suika.fam.cx>

	* attlists-1.dat, attrs-1.dat: Updated test results on unclosed
	tags and attlist declarations (cf. HTML5 revision 2990).

++ whatpm/Whatpm/HTML/ChangeLog	5 Sep 2009 10:39:09 -0000
	* Tokenizer.pm.src: Discard unclosed tags (HTML5 revision 2990).

2009-09-05  Wakaba  <wakaba@suika.fam.cx>

1 package Whatpm::HTML::Tokenizer;
2 use strict;
3 our $VERSION=do{my @r=(q$Revision: 1.32 $=~/\d+/g);sprintf "%d."."%02d" x $#r,@r};
4
## Exporter setup.  Every token type constant can be imported on
## request, either one by one or all together through the |:token| tag.
BEGIN {
  require Exporter;
  push our @ISA, 'Exporter';

  ## The per-name export list and the |:token| tag carry exactly the
  ## same names in the same order, so both are built from this list.
  my @token_type_names = qw(
    DOCTYPE_TOKEN
    COMMENT_TOKEN
    START_TAG_TOKEN
    END_TAG_TOKEN
    END_OF_FILE_TOKEN
    CHARACTER_TOKEN
    PI_TOKEN
    ABORT_TOKEN
    END_OF_DOCTYPE_TOKEN
    ATTLIST_TOKEN
    ELEMENT_TOKEN
    GENERAL_ENTITY_TOKEN
    PARAMETER_ENTITY_TOKEN
    NOTATION_TOKEN
  );

  our @EXPORT_OK = @token_type_names;
  our %EXPORT_TAGS = (token => [@token_type_names]);
}
45
46 ## NOTE: Differences from the XML5 draft are marked as "XML5:".
47
48 ## Token types
49
## Numeric codes stored in the |type| field of a token.
use constant {
  DOCTYPE_TOKEN          => 1,  ## XML5: No DOCTYPE token.
  COMMENT_TOKEN          => 2,
  START_TAG_TOKEN        => 3,
  END_TAG_TOKEN          => 4,
  END_OF_FILE_TOKEN      => 5,
  CHARACTER_TOKEN        => 6,
  PI_TOKEN               => 7,  ## NOTE: XML only.
  ABORT_TOKEN            => 8,  ## NOTE: For internal processing.
  END_OF_DOCTYPE_TOKEN   => 9,  ## NOTE: XML only.
  ATTLIST_TOKEN          => 10, ## NOTE: XML only.
  ELEMENT_TOKEN          => 11, ## NOTE: XML only.
  GENERAL_ENTITY_TOKEN   => 12, ## NOTE: XML only.
  PARAMETER_ENTITY_TOKEN => 13, ## NOTE: XML only.
  NOTATION_TOKEN         => 14, ## NOTE: XML only.
};
64
65 ## XML5: XML5 has "empty tag token". In this implementation, it is
66 ## represented as a start tag token with $self->{self_closing} flag
67 ## set to true.
68
69 ## XML5: XML5 has "short end tag token". In this implementation, it
70 ## is represented as an end tag token with $token->{tag_name} set
71 ## to an empty string.
72
73 package Whatpm::HTML;
74
75 BEGIN { Whatpm::HTML::Tokenizer->import (':token') }
76
77 ## Content model flags
78
## Individual capability bits of a content model.
use constant CM_ENTITY         => 0b001; # & markup in data
use constant CM_LIMITED_MARKUP => 0b010; # < markup in data (limited)
use constant CM_FULL_MARKUP    => 0b100; # < markup in data (any)

## The four content models, each expressed as a combination of the
## capability bits above.
use constant PLAINTEXT_CONTENT_MODEL => 0;
use constant CDATA_CONTENT_MODEL     => CM_LIMITED_MARKUP;
use constant RCDATA_CONTENT_MODEL    => CM_ENTITY | CM_LIMITED_MARKUP;
use constant PCDATA_CONTENT_MODEL    => CM_ENTITY | CM_FULL_MARKUP;
87
88 ## Tokenizer states
89
## State numbers compared against |$self->{state}|.  The numbers 1 and
## 12 are not assigned: they belonged to ENTITY_DATA_STATE and
## ENTITY_IN_ATTRIBUTE_VALUE_STATE, which are no longer defined.
use constant {
  DATA_STATE                                    => 0,
  TAG_OPEN_STATE                                => 2,
  CLOSE_TAG_OPEN_STATE                          => 3,
  TAG_NAME_STATE                                => 4,
  BEFORE_ATTRIBUTE_NAME_STATE                   => 5,
  ATTRIBUTE_NAME_STATE                          => 6,
  AFTER_ATTRIBUTE_NAME_STATE                    => 7,
  BEFORE_ATTRIBUTE_VALUE_STATE                  => 8,
  ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE           => 9,
  ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE           => 10,
  ATTRIBUTE_VALUE_UNQUOTED_STATE                => 11,
  MARKUP_DECLARATION_OPEN_STATE                 => 13,
  COMMENT_START_STATE                           => 14,
  COMMENT_START_DASH_STATE                      => 15,
  COMMENT_STATE                                 => 16,
  COMMENT_END_STATE                             => 17,
  COMMENT_END_BANG_STATE                        => 102,
  COMMENT_END_SPACE_STATE                       => 103, ## LAST (highest state number listed here)
  COMMENT_END_DASH_STATE                        => 18,
  BOGUS_COMMENT_STATE                           => 19,
  DOCTYPE_STATE                                 => 20,
  BEFORE_DOCTYPE_NAME_STATE                     => 21,
  DOCTYPE_NAME_STATE                            => 22,
  AFTER_DOCTYPE_NAME_STATE                      => 23,
  BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE        => 24,
  DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE => 25,
  DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE => 26,
  AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE         => 27,
  BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE        => 28,
  DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE => 29,
  DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE => 30,
  AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE         => 31,
  BOGUS_DOCTYPE_STATE                           => 32,
  AFTER_ATTRIBUTE_VALUE_QUOTED_STATE            => 33,
  SELF_CLOSING_START_TAG_STATE                  => 34,
  CDATA_SECTION_STATE                           => 35,
  MD_HYPHEN_STATE                               => 36, # "markup declaration open state" in the spec
  MD_DOCTYPE_STATE                              => 37, # "markup declaration open state" in the spec
  MD_CDATA_STATE                                => 38, # "markup declaration open state" in the spec
  CDATA_RCDATA_CLOSE_TAG_STATE                  => 39, # "close tag open state" in the spec
  CDATA_SECTION_MSE1_STATE                      => 40, # "CDATA section state" in the spec
  CDATA_SECTION_MSE2_STATE                      => 41, # "CDATA section state" in the spec
  PUBLIC_STATE                                  => 42, # "after DOCTYPE name state" in the spec
  SYSTEM_STATE                                  => 43, # "after DOCTYPE name state" in the spec

  ## NOTE: "Entity data state", "entity in attribute value state", and
  ## "consume a character reference" algorithm are jointly implemented
  ## using the following six states:
  ENTITY_STATE                                  => 44,
  ENTITY_HASH_STATE                             => 45,
  NCR_NUM_STATE                                 => 46,
  HEXREF_X_STATE                                => 47,
  HEXREF_HEX_STATE                              => 48,
  ENTITY_NAME_STATE                             => 49,

  PCDATA_STATE                                  => 50, # "data state" in the spec

  ## XML-only states
  PI_STATE                                           => 51,
  PI_TARGET_STATE                                    => 52,
  PI_TARGET_AFTER_STATE                              => 53,
  PI_DATA_STATE                                      => 54,
  PI_AFTER_STATE                                     => 55,
  PI_DATA_AFTER_STATE                                => 56,
  DOCTYPE_INTERNAL_SUBSET_STATE                      => 57,
  DOCTYPE_INTERNAL_SUBSET_AFTER_STATE                => 58,
  BOGUS_DOCTYPE_INTERNAL_SUBSET_AFTER_STATE          => 59,
  DOCTYPE_TAG_STATE                                  => 60,
  DOCTYPE_MARKUP_DECLARATION_OPEN_STATE              => 61,
  MD_ATTLIST_STATE                                   => 62,
  MD_E_STATE                                         => 63,
  MD_ELEMENT_STATE                                   => 64,
  MD_ENTITY_STATE                                    => 65,
  MD_NOTATION_STATE                                  => 66,
  DOCTYPE_MD_STATE                                   => 67,
  BEFORE_MD_NAME_STATE                               => 68,
  MD_NAME_STATE                                      => 69,
  DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE              => 70,
  DOCTYPE_ATTLIST_NAME_AFTER_STATE                   => 71,
  DOCTYPE_ATTLIST_ATTRIBUTE_NAME_STATE               => 72,
  DOCTYPE_ATTLIST_ATTRIBUTE_NAME_AFTER_STATE         => 73,
  DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_STATE               => 74,
  DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_AFTER_STATE         => 75,
  BEFORE_ALLOWED_TOKEN_STATE                         => 76,
  ALLOWED_TOKEN_STATE                                => 77,
  AFTER_ALLOWED_TOKEN_STATE                          => 78,
  AFTER_ALLOWED_TOKENS_STATE                         => 79,
  BEFORE_ATTR_DEFAULT_STATE                          => 80,
  DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE => 81,
  DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_STATE        => 82,
  DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_AFTER_STATE  => 83,
  AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE              => 84,
  BEFORE_NDATA_STATE                                 => 85,
  NDATA_STATE                                        => 86,
  AFTER_NDATA_STATE                                  => 87,
  BEFORE_NOTATION_NAME_STATE                         => 88,
  NOTATION_NAME_STATE                                => 89,
  DOCTYPE_ENTITY_VALUE_DOUBLE_QUOTED_STATE           => 90,
  DOCTYPE_ENTITY_VALUE_SINGLE_QUOTED_STATE           => 91,
  ENTITY_VALUE_ENTITY_STATE                          => 92,
  AFTER_ELEMENT_NAME_STATE                           => 93,
  BEFORE_ELEMENT_CONTENT_STATE                       => 94,
  CONTENT_KEYWORD_STATE                              => 95,
  AFTER_CM_GROUP_OPEN_STATE                          => 96,
  CM_ELEMENT_NAME_STATE                              => 97,
  AFTER_CM_ELEMENT_NAME_STATE                        => 98,
  AFTER_CM_GROUP_CLOSE_STATE                         => 99,
  AFTER_MD_DEF_STATE                                 => 100,
  BOGUS_MD_STATE                                     => 101,
};
199
200 ## Tree constructor state constants (see Whatpm::HTML for the full
201 ## list and descriptions).  Only the two values below are declared
202 ## in this part of the file.
203 sub IN_FOREIGN_CONTENT_IM () { 0b100000000000 } # 1 << 11 (2048)
204 sub FOREIGN_EL () { 0b1_00000000000 } # also 1 << 11 (2048); NOTE(review): same value as IN_FOREIGN_CONTENT_IM -- confirm against Whatpm::HTML
205
206 ## Character reference mappings
207
## Replacement table for code points: each key is a code point and each
## value is the code point to be used in its place.
my $charref_map = do {
  ## CR is replaced by LF.
  my %replacement = (0x0D => 0x000A);

  ## 0x80..0x9F, in ascending order.  The positions 0x81, 0x8D, 0x8F,
  ## 0x90 and 0x9D are replaced by U+FFFD.
  @replacement{0x80 .. 0x9F} = (
    0x20AC, 0xFFFD, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021,
    0x02C6, 0x2030, 0x0160, 0x2039, 0x0152, 0xFFFD, 0x017D, 0xFFFD,
    0xFFFD, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014,
    0x02DC, 0x2122, 0x0161, 0x203A, 0x0153, 0xFFFD, 0x017E, 0x0178,
  );

  ## Every code point listed here is replaced by U+FFFD.
  my @to_fffd = (
    0x0000 .. 0x0008, 0x000B, 0x000E .. 0x001F, 0x007F,
    0xD800 .. 0xDFFF,
    0xFDD0 .. 0xFDDF, ## ISSUE: 0xFDEF
    ## U+xxFFFE and U+xxFFFF of each of the planes 0x00..0x10.
    map { my $plane = $_ << 16; ($plane | 0xFFFE, $plane | 0xFFFF) } 0x00 .. 0x10,
  );
  @replacement{@to_fffd} = (0xFFFD) x @to_fffd;

  \%replacement;
}; # $charref_map
251
252 ## Implementations MUST act as if they used the state machine defined in the spec.
253
## Put the tokenizer back into its start condition and load the first
## input character.
##
## Argument: the parser object.  Its |set_nc| callback must already be
## set, since it is invoked here with the object as its only argument.
## Returns the new, empty |token| array reference (the value of the
## last assignment).
sub _initialize_tokenizer ($) {
  my $self = shift;

  ## NOTE: Fields set by |new| constructor:
  #$self->{level}
  #$self->{set_nc}
  #$self->{parse_error}
  #$self->{is_xml} (if XML)

  $self->{state} = DATA_STATE; # MUST
  $self->{s_kwd} = ''; # Data state keyword
  #$self->{kwd} = ''; # State-dependent keyword; initialized when used
  #$self->{entity__value}; # initialized when used
  #$self->{entity__match}; # initialized when used
  $self->{content_model} = PCDATA_CONTENT_MODEL; # content model at start
  undef $self->{ct}; # current token
  undef $self->{ca}; # current attribute
  undef $self->{last_stag_name}; # last emitted start tag name
  #$self->{prev_state}; # initialized when used
  delete $self->{self_closing};
  $self->{char_buffer} = '';
  $self->{char_buffer_pos} = 0;
  $self->{nc} = -1; # next input character
  #$self->{next_nc}

  ## The character buffer has just been emptied, so there is never a
  ## buffered character to take here: the first input character always
  ## comes from the |set_nc| callback.
  $self->{set_nc}->($self);

  ## Queue of tokens waiting to be returned by |_get_next_token|.
  $self->{token} = [];
  # $self->{escape}
} # _initialize_tokenizer
292
293 ## A token has:
294 ## ->{type} == DOCTYPE_TOKEN, START_TAG_TOKEN, END_TAG_TOKEN, COMMENT_TOKEN,
295 ## CHARACTER_TOKEN, END_OF_FILE_TOKEN, PI_TOKEN, or ABORT_TOKEN
296 ## ->{name} (DOCTYPE_TOKEN)
297 ## ->{tag_name} (START_TAG_TOKEN, END_TAG_TOKEN)
298 ## ->{target} (PI_TOKEN)
299 ## ->{pubid} (DOCTYPE_TOKEN)
300 ## ->{sysid} (DOCTYPE_TOKEN)
301 ## ->{quirks} == 1 or 0 (DOCTYPE_TOKEN): "force-quirks" flag
302 ## ->{attributes} isa HASH (START_TAG_TOKEN, END_TAG_TOKEN)
303 ## ->{name}
304 ## ->{value}
305 ## ->{has_reference} == 1 or 0
306 ## ->{index}: Index of the attribute in a tag.
307 ## ->{data} (COMMENT_TOKEN, CHARACTER_TOKEN, PI_TOKEN)
308 ## ->{has_reference} == 1 or 0 (CHARACTER_TOKEN)
309 ## ->{last_index} (ELEMENT_TOKEN): Next attribute's index - 1.
310 ## ->{has_internal_subset} = 1 or 0 (DOCTYPE_TOKEN)
311
312 ## NOTE: The "self-closing flag" is held as |$self->{self_closing}|.
313 ## |$token->{self_closing}| is used to save the value of
314 ## |$self->{self_closing}| while the token is pushed back to the stack.
315
316 ## Emitted token MUST immediately be handled by the tree construction state.
317
318 ## Before each step, UA MAY check to see if either one of the scripts in
319 ## "list of scripts that will execute as soon as possible" or the first
320 ## script in the "list of scripts that will execute asynchronously",
321 ## has completed loading. If one has, then it MUST be executed
322 ## and removed from the list.
323
324 ## TODO: Polytheistic slash SHOULD NOT be used. (Applied only to atheists.)
325 ## (This requirement was dropped from HTML5 spec, unfortunately.)
326
## Lookup table of the code points treated as space characters: a
## listed code point maps to 1, anything else is absent.  LINE
## TABULATION (0x000B) and CARRIAGE RETURN (0x000D) are not listed.
my $is_space = {
  map { ($_ => 1) }
    0x0009, # CHARACTER TABULATION (HT)
    0x000A, # LINE FEED (LF)
    0x000C, # FORM FEED (FF) ## XML5: Not a space character.
    0x0020, # SPACE (SP)
};
335
336 sub _get_next_token ($) {
337 my $self = shift;
338
339 if ($self->{self_closing}) {
340 $self->{parse_error}->(level => $self->{level}->{must}, type => 'nestc', token => $self->{ct});
341 ## NOTE: The |self_closing| flag is only set by start tag token.
342 ## In addition, when a start tag token is emitted, it is always set to
343 ## |ct|.
344 delete $self->{self_closing};
345 }
346
347 if (@{$self->{token}}) {
348 $self->{self_closing} = $self->{token}->[0]->{self_closing};
349 return shift @{$self->{token}};
350 }
351
352 A: {
353 if ($self->{state} == PCDATA_STATE) {
354 ## NOTE: Same as |DATA_STATE|, but only for |PCDATA| content model.
355
356 if ($self->{nc} == 0x0026) { # &
357
358 ## NOTE: In the spec, the tokenizer is switched to the
359 ## "entity data state". In this implementation, the tokenizer
360 ## is switched to the |ENTITY_STATE|, which is an implementation
361 ## of the "consume a character reference" algorithm.
362 $self->{entity_add} = -1;
363 $self->{prev_state} = DATA_STATE;
364 $self->{state} = ENTITY_STATE;
365
366 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
367 $self->{line_prev} = $self->{line};
368 $self->{column_prev} = $self->{column};
369 $self->{column}++;
370 $self->{nc}
371 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
372 } else {
373 $self->{set_nc}->($self);
374 }
375
376 redo A;
377 } elsif ($self->{nc} == 0x003C) { # <
378
379 $self->{state} = TAG_OPEN_STATE;
380
381 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
382 $self->{line_prev} = $self->{line};
383 $self->{column_prev} = $self->{column};
384 $self->{column}++;
385 $self->{nc}
386 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
387 } else {
388 $self->{set_nc}->($self);
389 }
390
391 redo A;
392 } elsif ($self->{nc} == -1) {
393
394 return ({type => END_OF_FILE_TOKEN,
395 line => $self->{line}, column => $self->{column}});
396 last A; ## TODO: ok?
397 } else {
398
399 #
400 }
401
402 # Anything else
403 my $token = {type => CHARACTER_TOKEN,
404 data => chr $self->{nc},
405 line => $self->{line}, column => $self->{column},
406 };
407 $self->{read_until}->($token->{data}, q[<&], length $token->{data});
408
409 ## Stay in the state.
410
411 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
412 $self->{line_prev} = $self->{line};
413 $self->{column_prev} = $self->{column};
414 $self->{column}++;
415 $self->{nc}
416 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
417 } else {
418 $self->{set_nc}->($self);
419 }
420
421 return ($token);
422 redo A;
423 } elsif ($self->{state} == DATA_STATE) {
424 $self->{s_kwd} = '' unless defined $self->{s_kwd};
425 if ($self->{nc} == 0x0026) { # &
426 $self->{s_kwd} = '';
427 if ($self->{content_model} & CM_ENTITY and # PCDATA | RCDATA
428 not $self->{escape}) {
429
430 ## NOTE: In the spec, the tokenizer is switched to the
431 ## "entity data state". In this implementation, the tokenizer
432 ## is switched to the |ENTITY_STATE|, which is an implementation
433 ## of the "consume a character reference" algorithm.
434 $self->{entity_add} = -1;
435 $self->{prev_state} = DATA_STATE;
436 $self->{state} = ENTITY_STATE;
437
438 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
439 $self->{line_prev} = $self->{line};
440 $self->{column_prev} = $self->{column};
441 $self->{column}++;
442 $self->{nc}
443 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
444 } else {
445 $self->{set_nc}->($self);
446 }
447
448 redo A;
449 } else {
450
451 #
452 }
453 } elsif ($self->{nc} == 0x002D) { # -
454 if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
455 if ($self->{s_kwd} eq '<!-') {
456
457 $self->{escape} = 1; # unless $self->{escape};
458 $self->{s_kwd} = '--';
459 #
460 } elsif ($self->{s_kwd} eq '-') {
461
462 $self->{s_kwd} = '--';
463 #
464 } elsif ($self->{s_kwd} eq '<!' or $self->{s_kwd} eq '-') {
465
466 $self->{s_kwd} .= '-';
467 #
468 } else {
469
470 $self->{s_kwd} = '-';
471 #
472 }
473 }
474
475 #
476 } elsif ($self->{nc} == 0x0021) { # !
477 if (length $self->{s_kwd}) {
478
479 $self->{s_kwd} .= '!';
480 #
481 } else {
482
483 #$self->{s_kwd} = '';
484 #
485 }
486 #
487 } elsif ($self->{nc} == 0x003C) { # <
488 if ($self->{content_model} & CM_FULL_MARKUP or # PCDATA
489 (($self->{content_model} & CM_LIMITED_MARKUP) and # CDATA | RCDATA
490 not $self->{escape})) {
491
492 $self->{state} = TAG_OPEN_STATE;
493
494 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
495 $self->{line_prev} = $self->{line};
496 $self->{column_prev} = $self->{column};
497 $self->{column}++;
498 $self->{nc}
499 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
500 } else {
501 $self->{set_nc}->($self);
502 }
503
504 redo A;
505 } else {
506
507 $self->{s_kwd} = '';
508 #
509 }
510 } elsif ($self->{nc} == 0x003E) { # >
511 if ($self->{escape} and
512 ($self->{content_model} & CM_LIMITED_MARKUP)) { # RCDATA | CDATA
513 if ($self->{s_kwd} eq '--') {
514
515 delete $self->{escape};
516 #
517 } else {
518
519 #
520 }
521 } elsif ($self->{is_xml} and $self->{s_kwd} eq ']]') {
522
523 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unmatched mse', ## TODO: type
524 line => $self->{line_prev},
525 column => $self->{column_prev} - 1);
526 #
527 } else {
528
529 #
530 }
531
532 $self->{s_kwd} = '';
533 #
534 } elsif ($self->{nc} == 0x005D) { # ]
535 if ($self->{s_kwd} eq ']' or $self->{s_kwd} eq '') {
536
537 $self->{s_kwd} .= ']';
538 } elsif ($self->{s_kwd} eq ']]') {
539
540 #
541 } else {
542
543 $self->{s_kwd} = '';
544 }
545 #
546 } elsif ($self->{nc} == -1) {
547
548 $self->{s_kwd} = '';
549 return ({type => END_OF_FILE_TOKEN,
550 line => $self->{line}, column => $self->{column}});
551 last A; ## TODO: ok?
552 } else {
553
554 $self->{s_kwd} = '';
555 #
556 }
557
558 # Anything else
559 my $token = {type => CHARACTER_TOKEN,
560 data => chr $self->{nc},
561 line => $self->{line}, column => $self->{column},
562 };
563 if ($self->{read_until}->($token->{data}, q{-!<>&\]},
564 length $token->{data})) {
565 $self->{s_kwd} = '';
566 }
567
568 ## Stay in the data state.
569 if (not $self->{is_xml} and
570 $self->{content_model} == PCDATA_CONTENT_MODEL) {
571
572 $self->{state} = PCDATA_STATE;
573 } else {
574
575 ## Stay in the state.
576 }
577
578 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
579 $self->{line_prev} = $self->{line};
580 $self->{column_prev} = $self->{column};
581 $self->{column}++;
582 $self->{nc}
583 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
584 } else {
585 $self->{set_nc}->($self);
586 }
587
588 return ($token);
589 redo A;
590 } elsif ($self->{state} == TAG_OPEN_STATE) {
591 ## XML5: "tag state".
592
593 if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
594 if ($self->{nc} == 0x002F) { # /
595
596
597 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
598 $self->{line_prev} = $self->{line};
599 $self->{column_prev} = $self->{column};
600 $self->{column}++;
601 $self->{nc}
602 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
603 } else {
604 $self->{set_nc}->($self);
605 }
606
607 $self->{state} = CLOSE_TAG_OPEN_STATE;
608 redo A;
609 } elsif ($self->{nc} == 0x0021) { # !
610
611 $self->{s_kwd} = $self->{escaped} ? '' : '<';
612 #
613 } else {
614
615 $self->{s_kwd} = '';
616 #
617 }
618
619 ## reconsume
620 $self->{state} = DATA_STATE;
621 return ({type => CHARACTER_TOKEN, data => '<',
622 line => $self->{line_prev},
623 column => $self->{column_prev},
624 });
625 redo A;
626 } elsif ($self->{content_model} & CM_FULL_MARKUP) { # PCDATA
627 if ($self->{nc} == 0x0021) { # !
628
629 $self->{state} = MARKUP_DECLARATION_OPEN_STATE;
630
631 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
632 $self->{line_prev} = $self->{line};
633 $self->{column_prev} = $self->{column};
634 $self->{column}++;
635 $self->{nc}
636 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
637 } else {
638 $self->{set_nc}->($self);
639 }
640
641 redo A;
642 } elsif ($self->{nc} == 0x002F) { # /
643
644 $self->{state} = CLOSE_TAG_OPEN_STATE;
645
646 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
647 $self->{line_prev} = $self->{line};
648 $self->{column_prev} = $self->{column};
649 $self->{column}++;
650 $self->{nc}
651 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
652 } else {
653 $self->{set_nc}->($self);
654 }
655
656 redo A;
657 } elsif (0x0041 <= $self->{nc} and
658 $self->{nc} <= 0x005A) { # A..Z
659
660 $self->{ct}
661 = {type => START_TAG_TOKEN,
662 tag_name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
663 line => $self->{line_prev},
664 column => $self->{column_prev}};
665 $self->{state} = TAG_NAME_STATE;
666
667 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
668 $self->{line_prev} = $self->{line};
669 $self->{column_prev} = $self->{column};
670 $self->{column}++;
671 $self->{nc}
672 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
673 } else {
674 $self->{set_nc}->($self);
675 }
676
677 redo A;
678 } elsif (0x0061 <= $self->{nc} and
679 $self->{nc} <= 0x007A) { # a..z
680
681 $self->{ct} = {type => START_TAG_TOKEN,
682 tag_name => chr ($self->{nc}),
683 line => $self->{line_prev},
684 column => $self->{column_prev}};
685 $self->{state} = TAG_NAME_STATE;
686
687 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
688 $self->{line_prev} = $self->{line};
689 $self->{column_prev} = $self->{column};
690 $self->{column}++;
691 $self->{nc}
692 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
693 } else {
694 $self->{set_nc}->($self);
695 }
696
697 redo A;
698 } elsif ($self->{nc} == 0x003E) { # >
699
700 $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty start tag',
701 line => $self->{line_prev},
702 column => $self->{column_prev});
703 $self->{state} = DATA_STATE;
704 $self->{s_kwd} = '';
705
706 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
707 $self->{line_prev} = $self->{line};
708 $self->{column_prev} = $self->{column};
709 $self->{column}++;
710 $self->{nc}
711 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
712 } else {
713 $self->{set_nc}->($self);
714 }
715
716
717 return ({type => CHARACTER_TOKEN, data => '<>',
718 line => $self->{line_prev},
719 column => $self->{column_prev},
720 });
721
722 redo A;
723 } elsif ($self->{nc} == 0x003F) { # ?
724 if ($self->{is_xml}) {
725
726 $self->{state} = PI_STATE;
727
728 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
729 $self->{line_prev} = $self->{line};
730 $self->{column_prev} = $self->{column};
731 $self->{column}++;
732 $self->{nc}
733 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
734 } else {
735 $self->{set_nc}->($self);
736 }
737
738 redo A;
739 } else {
740
741 $self->{parse_error}->(level => $self->{level}->{must}, type => 'pio',
742 line => $self->{line_prev},
743 column => $self->{column_prev});
744 $self->{state} = BOGUS_COMMENT_STATE;
745 $self->{ct} = {type => COMMENT_TOKEN, data => '',
746 line => $self->{line_prev},
747 column => $self->{column_prev},
748 };
749 ## $self->{nc} is intentionally left as is
750 redo A;
751 }
752 } elsif (not $self->{is_xml} or $is_space->{$self->{nc}}) {
753
754 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare stago',
755 line => $self->{line_prev},
756 column => $self->{column_prev});
757 $self->{state} = DATA_STATE;
758 $self->{s_kwd} = '';
759 ## reconsume
760
761 return ({type => CHARACTER_TOKEN, data => '<',
762 line => $self->{line_prev},
763 column => $self->{column_prev},
764 });
765
766 redo A;
767 } else {
768 ## XML5: "<:" is a parse error.
769
770 $self->{ct} = {type => START_TAG_TOKEN,
771 tag_name => chr ($self->{nc}),
772 line => $self->{line_prev},
773 column => $self->{column_prev}};
774 $self->{state} = TAG_NAME_STATE;
775
776 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
777 $self->{line_prev} = $self->{line};
778 $self->{column_prev} = $self->{column};
779 $self->{column}++;
780 $self->{nc}
781 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
782 } else {
783 $self->{set_nc}->($self);
784 }
785
786 redo A;
787 }
788 } else {
789 die "$0: $self->{content_model} in tag open";
790 }
791 } elsif ($self->{state} == CLOSE_TAG_OPEN_STATE) {
792 ## NOTE: The "close tag open state" in the spec is implemented as
793 ## |CLOSE_TAG_OPEN_STATE| and |CDATA_RCDATA_CLOSE_TAG_STATE|.
794
795 ## XML5: "end tag state".
796
797 my ($l, $c) = ($self->{line_prev}, $self->{column_prev} - 1); # "<"of"</"
798 if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
799 if (defined $self->{last_stag_name}) {
800 $self->{state} = CDATA_RCDATA_CLOSE_TAG_STATE;
801 $self->{kwd} = '';
802 ## Reconsume.
803 redo A;
804 } else {
805 ## No start tag token has ever been emitted
806 ## NOTE: See <http://krijnhoetmer.nl/irc-logs/whatwg/20070626#l-564>.
807
808 $self->{state} = DATA_STATE;
809 $self->{s_kwd} = '';
810 ## Reconsume.
811 return ({type => CHARACTER_TOKEN, data => '</',
812 line => $l, column => $c,
813 });
814 redo A;
815 }
816 }
817
818 if (0x0041 <= $self->{nc} and
819 $self->{nc} <= 0x005A) { # A..Z
820
821 $self->{ct}
822 = {type => END_TAG_TOKEN,
823 tag_name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
824 line => $l, column => $c};
825 $self->{state} = TAG_NAME_STATE;
826
827 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
828 $self->{line_prev} = $self->{line};
829 $self->{column_prev} = $self->{column};
830 $self->{column}++;
831 $self->{nc}
832 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
833 } else {
834 $self->{set_nc}->($self);
835 }
836
837 redo A;
838 } elsif (0x0061 <= $self->{nc} and
839 $self->{nc} <= 0x007A) { # a..z
840
841 $self->{ct} = {type => END_TAG_TOKEN,
842 tag_name => chr ($self->{nc}),
843 line => $l, column => $c};
844 $self->{state} = TAG_NAME_STATE;
845
846 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
847 $self->{line_prev} = $self->{line};
848 $self->{column_prev} = $self->{column};
849 $self->{column}++;
850 $self->{nc}
851 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
852 } else {
853 $self->{set_nc}->($self);
854 }
855
856 redo A;
857 } elsif ($self->{nc} == 0x003E) { # >
858 $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty end tag',
859 line => $self->{line_prev}, ## "<" in "</>"
860 column => $self->{column_prev} - 1);
861 $self->{state} = DATA_STATE;
862 $self->{s_kwd} = '';
863 if ($self->{is_xml}) {
864
865 ## XML5: No parse error.
866
867 ## NOTE: This parser raises a parse error, since it supports
868 ## XML1, not XML5.
869
870 ## NOTE: A short end tag token.
871 my $ct = {type => END_TAG_TOKEN,
872 tag_name => '',
873 line => $self->{line_prev},
874 column => $self->{column_prev} - 1,
875 };
876
877 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
878 $self->{line_prev} = $self->{line};
879 $self->{column_prev} = $self->{column};
880 $self->{column}++;
881 $self->{nc}
882 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
883 } else {
884 $self->{set_nc}->($self);
885 }
886
887 return ($ct);
888 } else {
889
890
891 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
892 $self->{line_prev} = $self->{line};
893 $self->{column_prev} = $self->{column};
894 $self->{column}++;
895 $self->{nc}
896 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
897 } else {
898 $self->{set_nc}->($self);
899 }
900
901 }
902 redo A;
903 } elsif ($self->{nc} == -1) {
904
905 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare etago');
906 $self->{s_kwd} = '';
907 $self->{state} = DATA_STATE;
908 # reconsume
909
910 return ({type => CHARACTER_TOKEN, data => '</',
911 line => $l, column => $c,
912 });
913
914 redo A;
915 } elsif (not $self->{is_xml} or
916 $is_space->{$self->{nc}}) {
917
918 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus end tag',
919 line => $self->{line_prev}, # "<" of "</"
920 column => $self->{column_prev} - 1);
921 $self->{state} = BOGUS_COMMENT_STATE;
922 $self->{ct} = {type => COMMENT_TOKEN, data => '',
923 line => $self->{line_prev}, # "<" of "</"
924 column => $self->{column_prev} - 1,
925 };
926 ## NOTE: $self->{nc} is intentionally left as is.
927 ## Although the "anything else" case of the spec not explicitly
928 ## states that the next input character is to be reconsumed,
929 ## it will be included to the |data| of the comment token
930 ## generated from the bogus end tag, as defined in the
931 ## "bogus comment state" entry.
932 redo A;
933 } else {
934 ## XML5: "</:" is a parse error.
935
936 $self->{ct} = {type => END_TAG_TOKEN,
937 tag_name => chr ($self->{nc}),
938 line => $l, column => $c};
939 $self->{state} = TAG_NAME_STATE; ## XML5: "end tag name state".
940
941 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
942 $self->{line_prev} = $self->{line};
943 $self->{column_prev} = $self->{column};
944 $self->{column}++;
945 $self->{nc}
946 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
947 } else {
948 $self->{set_nc}->($self);
949 }
950
951 redo A;
952 }
953 } elsif ($self->{state} == CDATA_RCDATA_CLOSE_TAG_STATE) {
954 my $ch = substr $self->{last_stag_name}, length $self->{kwd}, 1;
955 if (length $ch) {
956 my $CH = $ch;
957 $ch =~ tr/a-z/A-Z/;
958 my $nch = chr $self->{nc};
959 if ($nch eq $ch or $nch eq $CH) {
960
961 ## Stay in the state.
962 $self->{kwd} .= $nch;
963
964 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
965 $self->{line_prev} = $self->{line};
966 $self->{column_prev} = $self->{column};
967 $self->{column}++;
968 $self->{nc}
969 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
970 } else {
971 $self->{set_nc}->($self);
972 }
973
974 redo A;
975 } else {
976
977 $self->{state} = DATA_STATE;
978 $self->{s_kwd} = '';
979 ## Reconsume.
980 return ({type => CHARACTER_TOKEN,
981 data => '</' . $self->{kwd},
982 line => $self->{line_prev},
983 column => $self->{column_prev} - 1 - length $self->{kwd},
984 });
985 redo A;
986 }
987 } else { # after "<{tag-name}"
988 unless ($is_space->{$self->{nc}} or
989 {
990 0x003E => 1, # >
991 0x002F => 1, # /
992 -1 => 1, # EOF
993 }->{$self->{nc}}) {
994
995 ## Reconsume.
996 $self->{state} = DATA_STATE;
997 $self->{s_kwd} = '';
998 return ({type => CHARACTER_TOKEN,
999 data => '</' . $self->{kwd},
1000 line => $self->{line_prev},
1001 column => $self->{column_prev} - 1 - length $self->{kwd},
1002 });
1003 redo A;
1004 } else {
1005
1006 $self->{ct}
1007 = {type => END_TAG_TOKEN,
1008 tag_name => $self->{last_stag_name},
1009 line => $self->{line_prev},
1010 column => $self->{column_prev} - 1 - length $self->{kwd}};
1011 $self->{state} = TAG_NAME_STATE;
1012 ## Reconsume.
1013 redo A;
1014 }
1015 }
1016 } elsif ($self->{state} == TAG_NAME_STATE) {
1017 if ($is_space->{$self->{nc}}) {
1018
1019 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1020
1021 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1022 $self->{line_prev} = $self->{line};
1023 $self->{column_prev} = $self->{column};
1024 $self->{column}++;
1025 $self->{nc}
1026 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1027 } else {
1028 $self->{set_nc}->($self);
1029 }
1030
1031 redo A;
1032 } elsif ($self->{nc} == 0x003E) { # >
1033 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1034
1035 $self->{last_stag_name} = $self->{ct}->{tag_name};
1036 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1037 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1038 #if ($self->{ct}->{attributes}) {
1039 # ## NOTE: This should never be reached.
1040 # !!! cp (36);
1041 # !!! parse-error (type => 'end tag attribute');
1042 #} else {
1043
1044 #}
1045 } else {
1046 die "$0: $self->{ct}->{type}: Unknown token type";
1047 }
1048 $self->{state} = DATA_STATE;
1049 $self->{s_kwd} = '';
1050
1051 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1052 $self->{line_prev} = $self->{line};
1053 $self->{column_prev} = $self->{column};
1054 $self->{column}++;
1055 $self->{nc}
1056 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1057 } else {
1058 $self->{set_nc}->($self);
1059 }
1060
1061
1062 return ($self->{ct}); # start tag or end tag
1063
1064 redo A;
1065 } elsif (0x0041 <= $self->{nc} and
1066 $self->{nc} <= 0x005A) { # A..Z
1067
1068 $self->{ct}->{tag_name}
1069 .= chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020));
1070 # start tag or end tag
1071 ## Stay in this state
1072
1073 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1074 $self->{line_prev} = $self->{line};
1075 $self->{column_prev} = $self->{column};
1076 $self->{column}++;
1077 $self->{nc}
1078 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1079 } else {
1080 $self->{set_nc}->($self);
1081 }
1082
1083 redo A;
1084 } elsif ($self->{nc} == -1) {
1085 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1086 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1087
1088 $self->{last_stag_name} = $self->{ct}->{tag_name};
1089 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1090 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1091 #if ($self->{ct}->{attributes}) {
1092 # ## NOTE: This state should never be reached.
1093 # !!! cp (40);
1094 # !!! parse-error (type => 'end tag attribute');
1095 #} else {
1096
1097 #}
1098 } else {
1099 die "$0: $self->{ct}->{type}: Unknown token type";
1100 }
1101 $self->{state} = DATA_STATE;
1102 $self->{s_kwd} = '';
1103 # reconsume
1104
1105 ## Discard the token.
1106 #return ($self->{ct}); # start tag or end tag
1107
1108 redo A;
1109 } elsif ($self->{nc} == 0x002F) { # /
1110
1111 $self->{state} = SELF_CLOSING_START_TAG_STATE;
1112
1113 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1114 $self->{line_prev} = $self->{line};
1115 $self->{column_prev} = $self->{column};
1116 $self->{column}++;
1117 $self->{nc}
1118 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1119 } else {
1120 $self->{set_nc}->($self);
1121 }
1122
1123 redo A;
1124 } else {
1125
1126 $self->{ct}->{tag_name} .= chr $self->{nc};
1127 # start tag or end tag
1128 ## Stay in the state
1129
1130 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1131 $self->{line_prev} = $self->{line};
1132 $self->{column_prev} = $self->{column};
1133 $self->{column}++;
1134 $self->{nc}
1135 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1136 } else {
1137 $self->{set_nc}->($self);
1138 }
1139
1140 redo A;
1141 }
1142 } elsif ($self->{state} == BEFORE_ATTRIBUTE_NAME_STATE) {
1143 ## XML5: "Tag attribute name before state".
1144
1145 if ($is_space->{$self->{nc}}) {
1146
1147 ## Stay in the state
1148
1149 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1150 $self->{line_prev} = $self->{line};
1151 $self->{column_prev} = $self->{column};
1152 $self->{column}++;
1153 $self->{nc}
1154 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1155 } else {
1156 $self->{set_nc}->($self);
1157 }
1158
1159 redo A;
1160 } elsif ($self->{nc} == 0x003E) { # >
1161 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1162
1163 $self->{last_stag_name} = $self->{ct}->{tag_name};
1164 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1165 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1166 if ($self->{ct}->{attributes}) {
1167
1168 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1169 } else {
1170
1171 }
1172 } else {
1173 die "$0: $self->{ct}->{type}: Unknown token type";
1174 }
1175 $self->{state} = DATA_STATE;
1176 $self->{s_kwd} = '';
1177
1178 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1179 $self->{line_prev} = $self->{line};
1180 $self->{column_prev} = $self->{column};
1181 $self->{column}++;
1182 $self->{nc}
1183 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1184 } else {
1185 $self->{set_nc}->($self);
1186 }
1187
1188
1189 return ($self->{ct}); # start tag or end tag
1190
1191 redo A;
1192 } elsif (0x0041 <= $self->{nc} and
1193 $self->{nc} <= 0x005A) { # A..Z
1194
1195 $self->{ca}
1196 = {name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
1197 value => '',
1198 line => $self->{line}, column => $self->{column}};
1199 $self->{state} = ATTRIBUTE_NAME_STATE;
1200
1201 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1202 $self->{line_prev} = $self->{line};
1203 $self->{column_prev} = $self->{column};
1204 $self->{column}++;
1205 $self->{nc}
1206 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1207 } else {
1208 $self->{set_nc}->($self);
1209 }
1210
1211 redo A;
1212 } elsif ($self->{nc} == 0x002F) { # /
1213
1214 $self->{state} = SELF_CLOSING_START_TAG_STATE;
1215
1216 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1217 $self->{line_prev} = $self->{line};
1218 $self->{column_prev} = $self->{column};
1219 $self->{column}++;
1220 $self->{nc}
1221 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1222 } else {
1223 $self->{set_nc}->($self);
1224 }
1225
1226 redo A;
1227 } elsif ($self->{nc} == -1) {
1228 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1229 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1230
1231 $self->{last_stag_name} = $self->{ct}->{tag_name};
1232 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1233 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1234 if ($self->{ct}->{attributes}) {
1235
1236 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1237 } else {
1238
1239 }
1240 } else {
1241 die "$0: $self->{ct}->{type}: Unknown token type";
1242 }
1243 $self->{state} = DATA_STATE;
1244 $self->{s_kwd} = '';
1245 # reconsume
1246
1247 ## Discard the token.
1248 #return ($self->{ct}); # start tag or end tag
1249
1250 redo A;
1251 } else {
1252 if ({
1253 0x0022 => 1, # "
1254 0x0027 => 1, # '
1255 0x003C => 1, # <
1256 0x003D => 1, # =
1257 }->{$self->{nc}}) {
1258
1259 ## XML5: Not a parse error.
1260 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name');
1261 } else {
1262
1263 ## XML5: ":" raises a parse error and is ignored.
1264 }
1265 $self->{ca}
1266 = {name => chr ($self->{nc}),
1267 value => '',
1268 line => $self->{line}, column => $self->{column}};
1269 $self->{state} = ATTRIBUTE_NAME_STATE;
1270
1271 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1272 $self->{line_prev} = $self->{line};
1273 $self->{column_prev} = $self->{column};
1274 $self->{column}++;
1275 $self->{nc}
1276 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1277 } else {
1278 $self->{set_nc}->($self);
1279 }
1280
1281 redo A;
1282 }
1283 } elsif ($self->{state} == ATTRIBUTE_NAME_STATE) {
1284 ## XML5: "Tag attribute name state".
1285
## Commit the attribute under construction ($self->{ca}) to the current
## tag token ($self->{ct}) before the attribute name state is left.
## The first attribute seen with a given name is stored in the token's
## |attributes| hash and numbered through |last_index|; a repeated name
## is reported as a 'duplicate attribute' parse error (positioned at the
## line/column recorded in the attribute) and is not stored.
my $before_leave = sub {
  my $name = $self->{ca}->{name};
  if (not exists $self->{ct}->{attributes}->{$name}) { # start tag or end tag
    $self->{ct}->{attributes}->{$name} = $self->{ca};
    $self->{ca}->{index} = ++$self->{ct}->{last_index};
  } else { # MUST
    ## Discard $self->{ca}: it is left out of the token's attributes.
    $self->{parse_error}->(level => $self->{level}->{must},
                           type => 'duplicate attribute',
                           text => $name,
                           line => $self->{ca}->{line},
                           column => $self->{ca}->{column});
  }
}; # $before_leave
1299
1300 if ($is_space->{$self->{nc}}) {
1301
1302 $before_leave->();
1303 $self->{state} = AFTER_ATTRIBUTE_NAME_STATE;
1304
1305 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1306 $self->{line_prev} = $self->{line};
1307 $self->{column_prev} = $self->{column};
1308 $self->{column}++;
1309 $self->{nc}
1310 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1311 } else {
1312 $self->{set_nc}->($self);
1313 }
1314
1315 redo A;
1316 } elsif ($self->{nc} == 0x003D) { # =
1317
1318 $before_leave->();
1319 $self->{state} = BEFORE_ATTRIBUTE_VALUE_STATE;
1320
1321 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1322 $self->{line_prev} = $self->{line};
1323 $self->{column_prev} = $self->{column};
1324 $self->{column}++;
1325 $self->{nc}
1326 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1327 } else {
1328 $self->{set_nc}->($self);
1329 }
1330
1331 redo A;
1332 } elsif ($self->{nc} == 0x003E) { # >
1333 if ($self->{is_xml}) {
1334
1335 ## XML5: Not a parse error.
1336 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr value'); ## TODO: type
1337 } else {
1338
1339 }
1340
1341 $before_leave->();
1342 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1343
1344 $self->{last_stag_name} = $self->{ct}->{tag_name};
1345 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1346
1347 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1348 if ($self->{ct}->{attributes}) {
1349 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1350 }
1351 } else {
1352 die "$0: $self->{ct}->{type}: Unknown token type";
1353 }
1354 $self->{state} = DATA_STATE;
1355 $self->{s_kwd} = '';
1356
1357 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1358 $self->{line_prev} = $self->{line};
1359 $self->{column_prev} = $self->{column};
1360 $self->{column}++;
1361 $self->{nc}
1362 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1363 } else {
1364 $self->{set_nc}->($self);
1365 }
1366
1367
1368 return ($self->{ct}); # start tag or end tag
1369
1370 redo A;
1371 } elsif (0x0041 <= $self->{nc} and
1372 $self->{nc} <= 0x005A) { # A..Z
1373
1374 $self->{ca}->{name}
1375 .= chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020));
1376 ## Stay in the state
1377
1378 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1379 $self->{line_prev} = $self->{line};
1380 $self->{column_prev} = $self->{column};
1381 $self->{column}++;
1382 $self->{nc}
1383 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1384 } else {
1385 $self->{set_nc}->($self);
1386 }
1387
1388 redo A;
1389 } elsif ($self->{nc} == 0x002F) { # /
1390 if ($self->{is_xml}) {
1391
1392 ## XML5: Not a parse error.
1393 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr value'); ## TODO: type
1394 } else {
1395
1396 }
1397
1398 $before_leave->();
1399 $self->{state} = SELF_CLOSING_START_TAG_STATE;
1400
1401 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1402 $self->{line_prev} = $self->{line};
1403 $self->{column_prev} = $self->{column};
1404 $self->{column}++;
1405 $self->{nc}
1406 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1407 } else {
1408 $self->{set_nc}->($self);
1409 }
1410
1411 redo A;
1412 } elsif ($self->{nc} == -1) {
1413 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1414 $before_leave->();
1415 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1416
1417 $self->{last_stag_name} = $self->{ct}->{tag_name};
1418 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1419 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1420 if ($self->{ct}->{attributes}) {
1421
1422 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1423 } else {
1424 ## NOTE: This state should never be reached.
1425
1426 }
1427 } else {
1428 die "$0: $self->{ct}->{type}: Unknown token type";
1429 }
1430 $self->{state} = DATA_STATE;
1431 $self->{s_kwd} = '';
1432 # reconsume
1433
1434 ## Discard the token.
1435 #return ($self->{ct}); # start tag or end tag
1436
1437 redo A;
1438 } else {
1439 if ({
1440 0x0022 => 1, # "
1441 0x0027 => 1, # '
1442 0x003C => 1, # <
1443 }->{$self->{nc}}) {
1444
1445 ## XML5: Not a parse error.
1446 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name');
1447 } else {
1448
1449 }
1450 $self->{ca}->{name} .= chr ($self->{nc});
1451 ## Stay in the state
1452
1453 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1454 $self->{line_prev} = $self->{line};
1455 $self->{column_prev} = $self->{column};
1456 $self->{column}++;
1457 $self->{nc}
1458 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1459 } else {
1460 $self->{set_nc}->($self);
1461 }
1462
1463 redo A;
1464 }
1465 } elsif ($self->{state} == AFTER_ATTRIBUTE_NAME_STATE) {
1466 ## XML5: "Tag attribute name after state".
1467
1468 if ($is_space->{$self->{nc}}) {
1469
1470 ## Stay in the state
1471
1472 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1473 $self->{line_prev} = $self->{line};
1474 $self->{column_prev} = $self->{column};
1475 $self->{column}++;
1476 $self->{nc}
1477 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1478 } else {
1479 $self->{set_nc}->($self);
1480 }
1481
1482 redo A;
1483 } elsif ($self->{nc} == 0x003D) { # =
1484
1485 $self->{state} = BEFORE_ATTRIBUTE_VALUE_STATE;
1486
1487 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1488 $self->{line_prev} = $self->{line};
1489 $self->{column_prev} = $self->{column};
1490 $self->{column}++;
1491 $self->{nc}
1492 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1493 } else {
1494 $self->{set_nc}->($self);
1495 }
1496
1497 redo A;
1498 } elsif ($self->{nc} == 0x003E) { # >
1499 if ($self->{is_xml}) {
1500
1501 ## XML5: Not a parse error.
1502 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr value'); ## TODO: type
1503 } else {
1504
1505 }
1506
1507 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1508
1509 $self->{last_stag_name} = $self->{ct}->{tag_name};
1510 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1511 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1512 if ($self->{ct}->{attributes}) {
1513
1514 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1515 } else {
1516 ## NOTE: This state should never be reached.
1517
1518 }
1519 } else {
1520 die "$0: $self->{ct}->{type}: Unknown token type";
1521 }
1522 $self->{state} = DATA_STATE;
1523 $self->{s_kwd} = '';
1524
1525 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1526 $self->{line_prev} = $self->{line};
1527 $self->{column_prev} = $self->{column};
1528 $self->{column}++;
1529 $self->{nc}
1530 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1531 } else {
1532 $self->{set_nc}->($self);
1533 }
1534
1535
1536 return ($self->{ct}); # start tag or end tag
1537
1538 redo A;
1539 } elsif (0x0041 <= $self->{nc} and
1540 $self->{nc} <= 0x005A) { # A..Z
1541
1542 $self->{ca}
1543 = {name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
1544 value => '',
1545 line => $self->{line}, column => $self->{column}};
1546 $self->{state} = ATTRIBUTE_NAME_STATE;
1547
1548 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1549 $self->{line_prev} = $self->{line};
1550 $self->{column_prev} = $self->{column};
1551 $self->{column}++;
1552 $self->{nc}
1553 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1554 } else {
1555 $self->{set_nc}->($self);
1556 }
1557
1558 redo A;
1559 } elsif ($self->{nc} == 0x002F) { # /
1560 if ($self->{is_xml}) {
1561
1562 ## XML5: Not a parse error.
1563 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr value'); ## TODO: type
1564 } else {
1565
1566 }
1567
1568 $self->{state} = SELF_CLOSING_START_TAG_STATE;
1569
1570 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1571 $self->{line_prev} = $self->{line};
1572 $self->{column_prev} = $self->{column};
1573 $self->{column}++;
1574 $self->{nc}
1575 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1576 } else {
1577 $self->{set_nc}->($self);
1578 }
1579
1580 redo A;
1581 } elsif ($self->{nc} == -1) {
1582 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1583 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1584
1585 $self->{last_stag_name} = $self->{ct}->{tag_name};
1586 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1587 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1588 if ($self->{ct}->{attributes}) {
1589
1590 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1591 } else {
1592 ## NOTE: This state should never be reached.
1593
1594 }
1595 } else {
1596 die "$0: $self->{ct}->{type}: Unknown token type";
1597 }
1598 $self->{s_kwd} = '';
1599 $self->{state} = DATA_STATE;
1600 # reconsume
1601
1602 ## Discard the token.
1603 #return ($self->{ct}); # start tag or end tag
1604
1605 redo A;
1606 } else {
1607 if ($self->{is_xml}) {
1608
1609 ## XML5: Not a parse error.
1610 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr value'); ## TODO: type
1611 } else {
1612
1613 }
1614
1615 if ({
1616 0x0022 => 1, # "
1617 0x0027 => 1, # '
1618 0x003C => 1, # <
1619 }->{$self->{nc}}) {
1620
1621 ## XML5: Not a parse error.
1622 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name');
1623 } else {
1624
1625 }
1626 $self->{ca}
1627 = {name => chr ($self->{nc}),
1628 value => '',
1629 line => $self->{line}, column => $self->{column}};
1630 $self->{state} = ATTRIBUTE_NAME_STATE;
1631
1632 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1633 $self->{line_prev} = $self->{line};
1634 $self->{column_prev} = $self->{column};
1635 $self->{column}++;
1636 $self->{nc}
1637 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1638 } else {
1639 $self->{set_nc}->($self);
1640 }
1641
1642 redo A;
1643 }
1644 } elsif ($self->{state} == BEFORE_ATTRIBUTE_VALUE_STATE) {
1645 ## XML5: "Tag attribute value before state".
1646
1647 if ($is_space->{$self->{nc}}) {
1648
1649 ## Stay in the state
1650
1651 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1652 $self->{line_prev} = $self->{line};
1653 $self->{column_prev} = $self->{column};
1654 $self->{column}++;
1655 $self->{nc}
1656 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1657 } else {
1658 $self->{set_nc}->($self);
1659 }
1660
1661 redo A;
1662 } elsif ($self->{nc} == 0x0022) { # "
1663
1664 $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
1665
1666 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1667 $self->{line_prev} = $self->{line};
1668 $self->{column_prev} = $self->{column};
1669 $self->{column}++;
1670 $self->{nc}
1671 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1672 } else {
1673 $self->{set_nc}->($self);
1674 }
1675
1676 redo A;
1677 } elsif ($self->{nc} == 0x0026) { # &
1678
1679 $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
1680 ## reconsume
1681 redo A;
1682 } elsif ($self->{nc} == 0x0027) { # '
1683
1684 $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
1685
1686 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1687 $self->{line_prev} = $self->{line};
1688 $self->{column_prev} = $self->{column};
1689 $self->{column}++;
1690 $self->{nc}
1691 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1692 } else {
1693 $self->{set_nc}->($self);
1694 }
1695
1696 redo A;
1697 } elsif ($self->{nc} == 0x003E) { # >
1698 $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty unquoted attribute value');
1699 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1700
1701 $self->{last_stag_name} = $self->{ct}->{tag_name};
1702 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1703 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1704 if ($self->{ct}->{attributes}) {
1705
1706 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1707 } else {
1708 ## NOTE: This state should never be reached.
1709
1710 }
1711 } else {
1712 die "$0: $self->{ct}->{type}: Unknown token type";
1713 }
1714 $self->{state} = DATA_STATE;
1715 $self->{s_kwd} = '';
1716
1717 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1718 $self->{line_prev} = $self->{line};
1719 $self->{column_prev} = $self->{column};
1720 $self->{column}++;
1721 $self->{nc}
1722 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1723 } else {
1724 $self->{set_nc}->($self);
1725 }
1726
1727
1728 return ($self->{ct}); # start tag or end tag
1729
1730 redo A;
1731 } elsif ($self->{nc} == -1) {
1732 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1733 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1734
1735 $self->{last_stag_name} = $self->{ct}->{tag_name};
1736 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1737 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1738 if ($self->{ct}->{attributes}) {
1739
1740 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1741 } else {
1742 ## NOTE: This state should never be reached.
1743
1744 }
1745 } else {
1746 die "$0: $self->{ct}->{type}: Unknown token type";
1747 }
1748 $self->{state} = DATA_STATE;
1749 $self->{s_kwd} = '';
1750 ## reconsume
1751
1752 ## Discard the token.
1753 #return ($self->{ct}); # start tag or end tag
1754
1755 redo A;
1756 } else {
1757 if ($self->{nc} == 0x003D or $self->{nc} == 0x003C) { # =, <
1758
1759 ## XML5: Not a parse error.
1760 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute value');
1761 } elsif ($self->{is_xml}) {
1762
1763 ## XML5: No parse error.
1764 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unquoted attr value'); ## TODO
1765 } else {
1766
1767 }
1768 $self->{ca}->{value} .= chr ($self->{nc});
1769 $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
1770
1771 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1772 $self->{line_prev} = $self->{line};
1773 $self->{column_prev} = $self->{column};
1774 $self->{column}++;
1775 $self->{nc}
1776 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1777 } else {
1778 $self->{set_nc}->($self);
1779 }
1780
1781 redo A;
1782 }
1783 } elsif ($self->{state} == ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE) {
1784 ## XML5: "Tag attribute value double quoted state" and "DOCTYPE
1785 ## ATTLIST attribute value double quoted state".
1786
1787 if ($self->{nc} == 0x0022) { # "
1788 if ($self->{ct}->{type} == ATTLIST_TOKEN) {
1789
1790 ## XML5: "DOCTYPE ATTLIST name after state".
1791 push @{$self->{ct}->{attrdefs}}, $self->{ca};
1792 $self->{state} = AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE;
1793 } else {
1794
1795 ## XML5: "Tag attribute name before state".
1796 $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1797 }
1798
1799 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1800 $self->{line_prev} = $self->{line};
1801 $self->{column_prev} = $self->{column};
1802 $self->{column}++;
1803 $self->{nc}
1804 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1805 } else {
1806 $self->{set_nc}->($self);
1807 }
1808
1809 redo A;
1810 } elsif ($self->{nc} == 0x0026) { # &
1811
1812 ## XML5: Not defined yet.
1813
1814 ## NOTE: In the spec, the tokenizer is switched to the
1815 ## "entity in attribute value state". In this implementation, the
1816 ## tokenizer is switched to the |ENTITY_STATE|, which is an
1817 ## implementation of the "consume a character reference" algorithm.
1818 $self->{prev_state} = $self->{state};
1819 $self->{entity_add} = 0x0022; # "
1820 $self->{state} = ENTITY_STATE;
1821
1822 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1823 $self->{line_prev} = $self->{line};
1824 $self->{column_prev} = $self->{column};
1825 $self->{column}++;
1826 $self->{nc}
1827 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1828 } else {
1829 $self->{set_nc}->($self);
1830 }
1831
1832 redo A;
1833 } elsif ($self->{is_xml} and
1834 $is_space->{$self->{nc}}) {
1835
1836 $self->{ca}->{value} .= ' ';
1837 ## Stay in the state.
1838
1839 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1840 $self->{line_prev} = $self->{line};
1841 $self->{column_prev} = $self->{column};
1842 $self->{column}++;
1843 $self->{nc}
1844 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1845 } else {
1846 $self->{set_nc}->($self);
1847 }
1848
1849 redo A;
1850 } elsif ($self->{nc} == -1) {
## EOF inside a double-quoted attribute value.  The construct being
## tokenized is unterminated: report it, leave the EOF unconsumed so it
## is handled again in the state switched to below, and drop the
## half-built token instead of emitting it.  Start tags are discarded
## here exactly as in the single-quoted state and the other tag states
## (unclosed tags are discarded; cf. HTML5 revision 2990).
$self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed attribute value');
if ($self->{ct}->{type} == START_TAG_TOKEN) {
  ## Still recorded although the token itself is not emitted, matching
  ## the EOF handling of the other tag states.
  $self->{last_stag_name} = $self->{ct}->{tag_name};

  $self->{state} = DATA_STATE;
  $self->{s_kwd} = '';
  ## reconsume

  ## Discard the token.
  #return ($self->{ct}); # start tag

  redo A;
} elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
  $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
  if ($self->{ct}->{attributes}) {
    $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
  } else {
    ## NOTE: This state should never be reached.
  }

  $self->{state} = DATA_STATE;
  $self->{s_kwd} = '';
  ## reconsume

  ## Discard the token.
  #return ($self->{ct}); # end tag

  redo A;
} elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
  ## XML5: No parse error above; not defined yet.
  ## The attribute definition read so far is appended to the ATTLIST
  ## token, but the token itself is not emitted.
  push @{$self->{ct}->{attrdefs}}, $self->{ca};
  $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
  ## Reconsume.

  ## Discard the token.
  #return ($self->{ct}); # ATTLIST

  redo A;
} else {
  die "$0: $self->{ct}->{type}: Unknown token type";
}
1892 } else {
1893 ## XML5 [ATTLIST]: Not defined yet.
1894 if ($self->{is_xml} and $self->{nc} == 0x003C) { # <
1895
1896 ## XML5: Not a parse error.
1897 $self->{parse_error}->(level => $self->{level}->{must}, type => 'lt in attr value'); ## TODO: type
1898 } else {
1899
1900 }
1901 $self->{ca}->{value} .= chr ($self->{nc});
1902 $self->{read_until}->($self->{ca}->{value},
1903 qq["&<\x09\x0C\x20],
1904 length $self->{ca}->{value});
1905
1906 ## Stay in the state
1907
1908 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1909 $self->{line_prev} = $self->{line};
1910 $self->{column_prev} = $self->{column};
1911 $self->{column}++;
1912 $self->{nc}
1913 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1914 } else {
1915 $self->{set_nc}->($self);
1916 }
1917
1918 redo A;
1919 }
1920 } elsif ($self->{state} == ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE) {
1921 ## XML5: "Tag attribute value single quoted state" and "DOCTYPE
1922 ## ATTLIST attribute value single quoted state".
1923
1924 if ($self->{nc} == 0x0027) { # '
1925 if ($self->{ct}->{type} == ATTLIST_TOKEN) {
1926
1927 ## XML5: "DOCTYPE ATTLIST name after state".
1928 push @{$self->{ct}->{attrdefs}}, $self->{ca};
1929 $self->{state} = AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE;
1930 } else {
1931
1932 ## XML5: "Before attribute name state" (sic).
1933 $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1934 }
1935
1936 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1937 $self->{line_prev} = $self->{line};
1938 $self->{column_prev} = $self->{column};
1939 $self->{column}++;
1940 $self->{nc}
1941 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1942 } else {
1943 $self->{set_nc}->($self);
1944 }
1945
1946 redo A;
1947 } elsif ($self->{nc} == 0x0026) { # &
1948
1949 ## XML5: Not defined yet.
1950
1951 ## NOTE: In the spec, the tokenizer is switched to the
1952 ## "entity in attribute value state". In this implementation, the
1953 ## tokenizer is switched to the |ENTITY_STATE|, which is an
1954 ## implementation of the "consume a character reference" algorithm.
1955 $self->{entity_add} = 0x0027; # '
1956 $self->{prev_state} = $self->{state};
1957 $self->{state} = ENTITY_STATE;
1958
1959 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1960 $self->{line_prev} = $self->{line};
1961 $self->{column_prev} = $self->{column};
1962 $self->{column}++;
1963 $self->{nc}
1964 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1965 } else {
1966 $self->{set_nc}->($self);
1967 }
1968
1969 redo A;
1970 } elsif ($self->{is_xml} and
1971 $is_space->{$self->{nc}}) {
1972
1973 $self->{ca}->{value} .= ' ';
1974 ## Stay in the state.
1975
1976 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1977 $self->{line_prev} = $self->{line};
1978 $self->{column_prev} = $self->{column};
1979 $self->{column}++;
1980 $self->{nc}
1981 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1982 } else {
1983 $self->{set_nc}->($self);
1984 }
1985
1986 redo A;
1987 } elsif ($self->{nc} == -1) {
1988 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed attribute value');
1989 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1990
1991 $self->{last_stag_name} = $self->{ct}->{tag_name};
1992
1993 $self->{state} = DATA_STATE;
1994 $self->{s_kwd} = '';
1995 ## reconsume
1996
1997 ## Discard the token.
1998 #return ($self->{ct}); # start tag
1999
2000 redo A;
2001 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
2002 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
2003 if ($self->{ct}->{attributes}) {
2004
2005 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
2006 } else {
2007 ## NOTE: This state should never be reached.
2008
2009 }
2010
2011 $self->{state} = DATA_STATE;
2012 $self->{s_kwd} = '';
2013 ## reconsume
2014
2015 ## Discard the token.
2016 #return ($self->{ct}); # end tag
2017
2018 redo A;
2019 } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
2020 ## XML5: No parse error above; not defined yet.
2021 push @{$self->{ct}->{attrdefs}}, $self->{ca};
2022 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2023 ## Reconsume.
2024
2025 ## Discard the token.
2026 #return ($self->{ct}); # ATTLIST
2027
2028 redo A;
2029 } else {
2030 die "$0: $self->{ct}->{type}: Unknown token type";
2031 }
2032 } else {
2033 ## XML5 [ATTLIST]: Not defined yet.
2034 if ($self->{is_xml} and $self->{nc} == 0x003C) { # <
2035
2036 ## XML5: Not a parse error.
2037 $self->{parse_error}->(level => $self->{level}->{must}, type => 'lt in attr value'); ## TODO: type
2038 } else {
2039
2040 }
2041 $self->{ca}->{value} .= chr ($self->{nc});
2042 $self->{read_until}->($self->{ca}->{value},
2043 qq['&<\x09\x0C\x20],
2044 length $self->{ca}->{value});
2045
2046 ## Stay in the state
2047
2048 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2049 $self->{line_prev} = $self->{line};
2050 $self->{column_prev} = $self->{column};
2051 $self->{column}++;
2052 $self->{nc}
2053 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2054 } else {
2055 $self->{set_nc}->($self);
2056 }
2057
2058 redo A;
2059 }
2060 } elsif ($self->{state} == ATTRIBUTE_VALUE_UNQUOTED_STATE) {
2061 ## XML5: "Tag attribute value unquoted state".
2062
2063 if ($is_space->{$self->{nc}}) {
2064 if ($self->{ct}->{type} == ATTLIST_TOKEN) {
2065
2066 push @{$self->{ct}->{attrdefs}}, $self->{ca};
2067 $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
2068 } else {
2069
2070 ## XML5: "Tag attribute name before state".
2071 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
2072 }
2073
2074 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2075 $self->{line_prev} = $self->{line};
2076 $self->{column_prev} = $self->{column};
2077 $self->{column}++;
2078 $self->{nc}
2079 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2080 } else {
2081 $self->{set_nc}->($self);
2082 }
2083
2084 redo A;
2085 } elsif ($self->{nc} == 0x0026) { # &
2086
2087
2088 ## XML5: Not defined yet.
2089
2090 ## NOTE: In the spec, the tokenizer is switched to the
2091 ## "entity in attribute value state". In this implementation, the
2092 ## tokenizer is switched to the |ENTITY_STATE|, which is an
2093 ## implementation of the "consume a character reference" algorithm.
2094 $self->{entity_add} = -1;
2095 $self->{prev_state} = $self->{state};
2096 $self->{state} = ENTITY_STATE;
2097
2098 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2099 $self->{line_prev} = $self->{line};
2100 $self->{column_prev} = $self->{column};
2101 $self->{column}++;
2102 $self->{nc}
2103 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2104 } else {
2105 $self->{set_nc}->($self);
2106 }
2107
2108 redo A;
2109 } elsif ($self->{nc} == 0x003E) { # >
2110 if ($self->{ct}->{type} == START_TAG_TOKEN) {
2111
2112 $self->{last_stag_name} = $self->{ct}->{tag_name};
2113
2114 $self->{state} = DATA_STATE;
2115 $self->{s_kwd} = '';
2116
2117 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2118 $self->{line_prev} = $self->{line};
2119 $self->{column_prev} = $self->{column};
2120 $self->{column}++;
2121 $self->{nc}
2122 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2123 } else {
2124 $self->{set_nc}->($self);
2125 }
2126
2127 return ($self->{ct}); # start tag
2128 redo A;
2129 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
2130 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
2131 if ($self->{ct}->{attributes}) {
2132
2133 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
2134 } else {
2135 ## NOTE: This state should never be reached.
2136
2137 }
2138
2139 $self->{state} = DATA_STATE;
2140 $self->{s_kwd} = '';
2141
2142 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2143 $self->{line_prev} = $self->{line};
2144 $self->{column_prev} = $self->{column};
2145 $self->{column}++;
2146 $self->{nc}
2147 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2148 } else {
2149 $self->{set_nc}->($self);
2150 }
2151
2152 return ($self->{ct}); # end tag
2153 redo A;
2154 } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
2155 push @{$self->{ct}->{attrdefs}}, $self->{ca};
2156 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2157
2158 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2159 $self->{line_prev} = $self->{line};
2160 $self->{column_prev} = $self->{column};
2161 $self->{column}++;
2162 $self->{nc}
2163 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2164 } else {
2165 $self->{set_nc}->($self);
2166 }
2167
2168 return ($self->{ct}); # ATTLIST
2169 redo A;
2170 } else {
2171 die "$0: $self->{ct}->{type}: Unknown token type";
2172 }
2173 } elsif ($self->{nc} == -1) {
2174 if ($self->{ct}->{type} == START_TAG_TOKEN) {
2175
2176 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
2177 $self->{last_stag_name} = $self->{ct}->{tag_name};
2178
2179 $self->{state} = DATA_STATE;
2180 $self->{s_kwd} = '';
2181 ## reconsume
2182
2183 ## Discard the token.
2184 #return ($self->{ct}); # start tag
2185
2186 redo A;
2187 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
2188 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
2189 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
2190 if ($self->{ct}->{attributes}) {
2191
2192 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
2193 } else {
2194 ## NOTE: This state should never be reached.
2195
2196 }
2197
2198 $self->{state} = DATA_STATE;
2199 $self->{s_kwd} = '';
2200 ## reconsume
2201
2202 ## Discard the token.
2203 #return ($self->{ct}); # end tag
2204
2205 redo A;
2206 } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
2207 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
2208 push @{$self->{ct}->{attrdefs}}, $self->{ca};
2209 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2210 ## Reconsume.
2211
2212 ## Discard the token.
2213 #return ($self->{ct}); # ATTLIST
2214
2215 redo A;
2216 } else {
2217 die "$0: $self->{ct}->{type}: Unknown token type";
2218 }
2219 } else {
2220 if ({
2221 0x0022 => 1, # "
2222 0x0027 => 1, # '
2223 0x003D => 1, # =
2224 0x003C => 1, # <
2225 }->{$self->{nc}}) {
2226
2227 ## XML5: Not a parse error.
2228 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute value');
2229 } else {
2230
2231 }
2232 $self->{ca}->{value} .= chr ($self->{nc});
2233 $self->{read_until}->($self->{ca}->{value},
2234 qq["'=& \x09\x0C>],
2235 length $self->{ca}->{value});
2236
2237 ## Stay in the state
2238
2239 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2240 $self->{line_prev} = $self->{line};
2241 $self->{column_prev} = $self->{column};
2242 $self->{column}++;
2243 $self->{nc}
2244 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2245 } else {
2246 $self->{set_nc}->($self);
2247 }
2248
2249 redo A;
2250 }
2251 } elsif ($self->{state} == AFTER_ATTRIBUTE_VALUE_QUOTED_STATE) {
2252 if ($is_space->{$self->{nc}}) {
2253
2254 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
2255
2256 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2257 $self->{line_prev} = $self->{line};
2258 $self->{column_prev} = $self->{column};
2259 $self->{column}++;
2260 $self->{nc}
2261 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2262 } else {
2263 $self->{set_nc}->($self);
2264 }
2265
2266 redo A;
2267 } elsif ($self->{nc} == 0x003E) { # >
2268 if ($self->{ct}->{type} == START_TAG_TOKEN) {
2269
2270 $self->{last_stag_name} = $self->{ct}->{tag_name};
2271 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
2272 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
2273 if ($self->{ct}->{attributes}) {
2274
2275 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
2276 } else {
2277 ## NOTE: This state should never be reached.
2278
2279 }
2280 } else {
2281 die "$0: $self->{ct}->{type}: Unknown token type";
2282 }
2283 $self->{state} = DATA_STATE;
2284 $self->{s_kwd} = '';
2285
2286 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2287 $self->{line_prev} = $self->{line};
2288 $self->{column_prev} = $self->{column};
2289 $self->{column}++;
2290 $self->{nc}
2291 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2292 } else {
2293 $self->{set_nc}->($self);
2294 }
2295
2296
2297 return ($self->{ct}); # start tag or end tag
2298
2299 redo A;
2300 } elsif ($self->{nc} == 0x002F) { # /
2301
2302 $self->{state} = SELF_CLOSING_START_TAG_STATE;
2303
2304 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2305 $self->{line_prev} = $self->{line};
2306 $self->{column_prev} = $self->{column};
2307 $self->{column}++;
2308 $self->{nc}
2309 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2310 } else {
2311 $self->{set_nc}->($self);
2312 }
2313
2314 redo A;
2315 } elsif ($self->{nc} == -1) {
2316 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
2317 if ($self->{ct}->{type} == START_TAG_TOKEN) {
2318
2319 $self->{last_stag_name} = $self->{ct}->{tag_name};
2320 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
2321 if ($self->{ct}->{attributes}) {
2322
2323 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
2324 } else {
2325 ## NOTE: This state should never be reached.
2326
2327 }
2328 } else {
2329 die "$0: $self->{ct}->{type}: Unknown token type";
2330 }
2331 $self->{state} = DATA_STATE;
2332 $self->{s_kwd} = '';
2333 ## Reconsume.
2334
2335 ## Discard the token.
2336 #return ($self->{ct}); # start tag or end tag
2337
2338 redo A;
2339 } else {
2340
2341 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space between attributes');
2342 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
2343 ## reconsume
2344 redo A;
2345 }
2346 } elsif ($self->{state} == SELF_CLOSING_START_TAG_STATE) {
2347 ## XML5: "Empty tag state".
2348
2349 if ($self->{nc} == 0x003E) { # >
2350 if ($self->{ct}->{type} == END_TAG_TOKEN) {
2351
2352 $self->{parse_error}->(level => $self->{level}->{must}, type => 'nestc', token => $self->{ct});
2353 ## TODO: Different type than slash in start tag
2354 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
2355 if ($self->{ct}->{attributes}) {
2356
2357 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
2358 } else {
2359
2360 }
2361 ## TODO: Test |<title></title/>|
2362 } else {
2363
2364 $self->{self_closing} = 1;
2365 }
2366
2367 $self->{state} = DATA_STATE;
2368 $self->{s_kwd} = '';
2369
2370 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2371 $self->{line_prev} = $self->{line};
2372 $self->{column_prev} = $self->{column};
2373 $self->{column}++;
2374 $self->{nc}
2375 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2376 } else {
2377 $self->{set_nc}->($self);
2378 }
2379
2380
2381 return ($self->{ct}); # start tag or end tag
2382
2383 redo A;
2384 } elsif ($self->{nc} == -1) {
2385 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
2386 if ($self->{ct}->{type} == START_TAG_TOKEN) {
2387
2388 $self->{last_stag_name} = $self->{ct}->{tag_name};
2389 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
2390 if ($self->{ct}->{attributes}) {
2391
2392 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
2393 } else {
2394 ## NOTE: This state should never be reached.
2395
2396 }
2397 } else {
2398 die "$0: $self->{ct}->{type}: Unknown token type";
2399 }
2400 ## XML5: "Tag attribute name before state".
2401 $self->{state} = DATA_STATE;
2402 $self->{s_kwd} = '';
2403 ## Reconsume.
2404
2405 ## Discard the token.
2406 #return ($self->{ct}); # start tag or end tag
2407
2408 redo A;
2409 } else {
2410
2411 $self->{parse_error}->(level => $self->{level}->{must}, type => 'nestc');
2412 ## TODO: This error type is wrong.
2413 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
2414 ## Reconsume.
2415 redo A;
2416 }
2417 } elsif ($self->{state} == BOGUS_COMMENT_STATE) {
2418 ## XML5: "Bogus comment state" and "DOCTYPE bogus comment state".
2419
2420 ## NOTE: Unlike the spec's "bogus comment state", this implementation
2421 ## consumes characters on a one-by-one basis.
2422
2423 if ($self->{nc} == 0x003E) { # >
2424 if ($self->{in_subset}) {
2425
2426 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2427 } else {
2428
2429 $self->{state} = DATA_STATE;
2430 $self->{s_kwd} = '';
2431 }
2432
2433 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2434 $self->{line_prev} = $self->{line};
2435 $self->{column_prev} = $self->{column};
2436 $self->{column}++;
2437 $self->{nc}
2438 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2439 } else {
2440 $self->{set_nc}->($self);
2441 }
2442
2443
2444 return ($self->{ct}); # comment
2445 redo A;
2446 } elsif ($self->{nc} == -1) {
2447 if ($self->{in_subset}) {
2448
2449 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2450 } else {
2451
2452 $self->{state} = DATA_STATE;
2453 $self->{s_kwd} = '';
2454 }
2455 ## reconsume
2456
2457 return ($self->{ct}); # comment
2458 redo A;
2459 } else {
2460
2461 $self->{ct}->{data} .= chr ($self->{nc}); # comment
2462 $self->{read_until}->($self->{ct}->{data},
2463 q[>],
2464 length $self->{ct}->{data});
2465
2466 ## Stay in the state.
2467
2468 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2469 $self->{line_prev} = $self->{line};
2470 $self->{column_prev} = $self->{column};
2471 $self->{column}++;
2472 $self->{nc}
2473 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2474 } else {
2475 $self->{set_nc}->($self);
2476 }
2477
2478 redo A;
2479 }
2480 } elsif ($self->{state} == MARKUP_DECLARATION_OPEN_STATE) {
2481 ## XML5: "Markup declaration state".
2482
2483 if ($self->{nc} == 0x002D) { # -
2484
2485 $self->{state} = MD_HYPHEN_STATE;
2486
2487 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2488 $self->{line_prev} = $self->{line};
2489 $self->{column_prev} = $self->{column};
2490 $self->{column}++;
2491 $self->{nc}
2492 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2493 } else {
2494 $self->{set_nc}->($self);
2495 }
2496
2497 redo A;
2498 } elsif ($self->{nc} == 0x0044 or # D
2499 $self->{nc} == 0x0064) { # d
2500 ## ASCII case-insensitive.
2501
2502 $self->{state} = MD_DOCTYPE_STATE;
2503 $self->{kwd} = chr $self->{nc};
2504
2505 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2506 $self->{line_prev} = $self->{line};
2507 $self->{column_prev} = $self->{column};
2508 $self->{column}++;
2509 $self->{nc}
2510 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2511 } else {
2512 $self->{set_nc}->($self);
2513 }
2514
2515 redo A;
2516 } elsif ((($self->{insertion_mode} & IN_FOREIGN_CONTENT_IM and
2517 $self->{open_elements}->[-1]->[1] & FOREIGN_EL) or
2518 $self->{is_xml}) and
2519 $self->{nc} == 0x005B) { # [
2520
2521 $self->{state} = MD_CDATA_STATE;
2522 $self->{kwd} = '[';
2523
2524 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2525 $self->{line_prev} = $self->{line};
2526 $self->{column_prev} = $self->{column};
2527 $self->{column}++;
2528 $self->{nc}
2529 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2530 } else {
2531 $self->{set_nc}->($self);
2532 }
2533
2534 redo A;
2535 } else {
2536
2537 }
2538
2539 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
2540 line => $self->{line_prev},
2541 column => $self->{column_prev} - 1);
2542 ## Reconsume.
2543 $self->{state} = BOGUS_COMMENT_STATE;
2544 $self->{ct} = {type => COMMENT_TOKEN, data => '',
2545 line => $self->{line_prev},
2546 column => $self->{column_prev} - 1,
2547 };
2548 redo A;
2549 } elsif ($self->{state} == MD_HYPHEN_STATE) {
2550 if ($self->{nc} == 0x002D) { # -
2551
2552 $self->{ct} = {type => COMMENT_TOKEN, data => '',
2553 line => $self->{line_prev},
2554 column => $self->{column_prev} - 2,
2555 };
2556 $self->{state} = COMMENT_START_STATE; ## XML5: "comment state".
2557
2558 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2559 $self->{line_prev} = $self->{line};
2560 $self->{column_prev} = $self->{column};
2561 $self->{column}++;
2562 $self->{nc}
2563 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2564 } else {
2565 $self->{set_nc}->($self);
2566 }
2567
2568 redo A;
2569 } else {
2570
2571 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
2572 line => $self->{line_prev},
2573 column => $self->{column_prev} - 2);
2574 $self->{state} = BOGUS_COMMENT_STATE;
2575 ## Reconsume.
2576 $self->{ct} = {type => COMMENT_TOKEN,
2577 data => '-',
2578 line => $self->{line_prev},
2579 column => $self->{column_prev} - 2,
2580 };
2581 redo A;
2582 }
2583 } elsif ($self->{state} == MD_DOCTYPE_STATE) {
2584 ## ASCII case-insensitive.
2585 if ($self->{nc} == [
2586 undef,
2587 0x004F, # O
2588 0x0043, # C
2589 0x0054, # T
2590 0x0059, # Y
2591 0x0050, # P
2592 ]->[length $self->{kwd}] or
2593 $self->{nc} == [
2594 undef,
2595 0x006F, # o
2596 0x0063, # c
2597 0x0074, # t
2598 0x0079, # y
2599 0x0070, # p
2600 ]->[length $self->{kwd}]) {
2601
2602 ## Stay in the state.
2603 $self->{kwd} .= chr $self->{nc};
2604
2605 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2606 $self->{line_prev} = $self->{line};
2607 $self->{column_prev} = $self->{column};
2608 $self->{column}++;
2609 $self->{nc}
2610 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2611 } else {
2612 $self->{set_nc}->($self);
2613 }
2614
2615 redo A;
2616 } elsif ((length $self->{kwd}) == 6 and
2617 ($self->{nc} == 0x0045 or # E
2618 $self->{nc} == 0x0065)) { # e
2619 if ($self->{is_xml} and
2620 ($self->{kwd} ne 'DOCTYP' or $self->{nc} == 0x0065)) {
2621
2622 ## XML5: case-sensitive.
2623 $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO
2624 text => 'DOCTYPE',
2625 line => $self->{line_prev},
2626 column => $self->{column_prev} - 5);
2627 } else {
2628
2629 }
2630 $self->{state} = DOCTYPE_STATE;
2631 $self->{ct} = {type => DOCTYPE_TOKEN,
2632 quirks => 1,
2633 line => $self->{line_prev},
2634 column => $self->{column_prev} - 7,
2635 };
2636
2637 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2638 $self->{line_prev} = $self->{line};
2639 $self->{column_prev} = $self->{column};
2640 $self->{column}++;
2641 $self->{nc}
2642 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2643 } else {
2644 $self->{set_nc}->($self);
2645 }
2646
2647 redo A;
2648 } else {
2649
2650 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
2651 line => $self->{line_prev},
2652 column => $self->{column_prev} - 1 - length $self->{kwd});
2653 $self->{state} = BOGUS_COMMENT_STATE;
2654 ## Reconsume.
2655 $self->{ct} = {type => COMMENT_TOKEN,
2656 data => $self->{kwd},
2657 line => $self->{line_prev},
2658 column => $self->{column_prev} - 1 - length $self->{kwd},
2659 };
2660 redo A;
2661 }
2662 } elsif ($self->{state} == MD_CDATA_STATE) {
2663 if ($self->{nc} == {
2664 '[' => 0x0043, # C
2665 '[C' => 0x0044, # D
2666 '[CD' => 0x0041, # A
2667 '[CDA' => 0x0054, # T
2668 '[CDAT' => 0x0041, # A
2669 }->{$self->{kwd}}) {
2670
2671 ## Stay in the state.
2672 $self->{kwd} .= chr $self->{nc};
2673
2674 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2675 $self->{line_prev} = $self->{line};
2676 $self->{column_prev} = $self->{column};
2677 $self->{column}++;
2678 $self->{nc}
2679 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2680 } else {
2681 $self->{set_nc}->($self);
2682 }
2683
2684 redo A;
2685 } elsif ($self->{kwd} eq '[CDATA' and
2686 $self->{nc} == 0x005B) { # [
2687 if ($self->{is_xml} and
2688 not $self->{tainted} and
2689 @{$self->{open_elements} or []} == 0) {
2690
2691 $self->{parse_error}->(level => $self->{level}->{must}, type => 'cdata outside of root element',
2692 line => $self->{line_prev},
2693 column => $self->{column_prev} - 7);
2694 $self->{tainted} = 1;
2695 } else {
2696
2697 }
2698
2699 $self->{ct} = {type => CHARACTER_TOKEN,
2700 data => '',
2701 line => $self->{line_prev},
2702 column => $self->{column_prev} - 7};
2703 $self->{state} = CDATA_SECTION_STATE;
2704
2705 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2706 $self->{line_prev} = $self->{line};
2707 $self->{column_prev} = $self->{column};
2708 $self->{column}++;
2709 $self->{nc}
2710 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2711 } else {
2712 $self->{set_nc}->($self);
2713 }
2714
2715 redo A;
2716 } else {
2717
2718 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
2719 line => $self->{line_prev},
2720 column => $self->{column_prev} - 1 - length $self->{kwd});
2721 $self->{state} = BOGUS_COMMENT_STATE;
2722 ## Reconsume.
2723 $self->{ct} = {type => COMMENT_TOKEN,
2724 data => $self->{kwd},
2725 line => $self->{line_prev},
2726 column => $self->{column_prev} - 1 - length $self->{kwd},
2727 };
2728 redo A;
2729 }
2730 } elsif ($self->{state} == COMMENT_START_STATE) {
2731 if ($self->{nc} == 0x002D) { # -
2732
2733 $self->{state} = COMMENT_START_DASH_STATE;
2734
2735 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2736 $self->{line_prev} = $self->{line};
2737 $self->{column_prev} = $self->{column};
2738 $self->{column}++;
2739 $self->{nc}
2740 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2741 } else {
2742 $self->{set_nc}->($self);
2743 }
2744
2745 redo A;
2746 } elsif ($self->{nc} == 0x003E) { # >
2747 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment');
2748 if ($self->{in_subset}) {
2749
2750 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2751 } else {
2752
2753 $self->{state} = DATA_STATE;
2754 $self->{s_kwd} = '';
2755 }
2756
2757 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2758 $self->{line_prev} = $self->{line};
2759 $self->{column_prev} = $self->{column};
2760 $self->{column}++;
2761 $self->{nc}
2762 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2763 } else {
2764 $self->{set_nc}->($self);
2765 }
2766
2767
2768 return ($self->{ct}); # comment
2769
2770 redo A;
2771 } elsif ($self->{nc} == -1) {
2772 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2773 if ($self->{in_subset}) {
2774
2775 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2776 } else {
2777
2778 $self->{state} = DATA_STATE;
2779 $self->{s_kwd} = '';
2780 }
2781 ## reconsume
2782
2783 return ($self->{ct}); # comment
2784
2785 redo A;
2786 } else {
2787
2788 $self->{ct}->{data} # comment
2789 .= chr ($self->{nc});
2790 $self->{state} = COMMENT_STATE;
2791
2792 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2793 $self->{line_prev} = $self->{line};
2794 $self->{column_prev} = $self->{column};
2795 $self->{column}++;
2796 $self->{nc}
2797 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2798 } else {
2799 $self->{set_nc}->($self);
2800 }
2801
2802 redo A;
2803 }
2804 } elsif ($self->{state} == COMMENT_START_DASH_STATE) {
2805 if ($self->{nc} == 0x002D) { # -
2806
2807 $self->{state} = COMMENT_END_STATE;
2808
2809 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2810 $self->{line_prev} = $self->{line};
2811 $self->{column_prev} = $self->{column};
2812 $self->{column}++;
2813 $self->{nc}
2814 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2815 } else {
2816 $self->{set_nc}->($self);
2817 }
2818
2819 redo A;
2820 } elsif ($self->{nc} == 0x003E) { # >
2821 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment');
2822 if ($self->{in_subset}) {
2823
2824 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2825 } else {
2826
2827 $self->{state} = DATA_STATE;
2828 $self->{s_kwd} = '';
2829 }
2830
2831 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2832 $self->{line_prev} = $self->{line};
2833 $self->{column_prev} = $self->{column};
2834 $self->{column}++;
2835 $self->{nc}
2836 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2837 } else {
2838 $self->{set_nc}->($self);
2839 }
2840
2841
2842 return ($self->{ct}); # comment
2843
2844 redo A;
2845 } elsif ($self->{nc} == -1) {
2846 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2847 if ($self->{in_subset}) {
2848
2849 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2850 } else {
2851
2852 $self->{state} = DATA_STATE;
2853 $self->{s_kwd} = '';
2854 }
2855 ## reconsume
2856
2857 return ($self->{ct}); # comment
2858
2859 redo A;
2860 } else {
2861
2862 $self->{ct}->{data} # comment
2863 .= '-' . chr ($self->{nc});
2864 $self->{state} = COMMENT_STATE;
2865
2866 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2867 $self->{line_prev} = $self->{line};
2868 $self->{column_prev} = $self->{column};
2869 $self->{column}++;
2870 $self->{nc}
2871 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2872 } else {
2873 $self->{set_nc}->($self);
2874 }
2875
2876 redo A;
2877 }
2878 } elsif ($self->{state} == COMMENT_STATE) {
2879 ## XML5: "Comment state" and "DOCTYPE comment state".
2880
2881 if ($self->{nc} == 0x002D) { # -
2882
2883 $self->{state} = COMMENT_END_DASH_STATE;
2884
2885 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2886 $self->{line_prev} = $self->{line};
2887 $self->{column_prev} = $self->{column};
2888 $self->{column}++;
2889 $self->{nc}
2890 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2891 } else {
2892 $self->{set_nc}->($self);
2893 }
2894
2895 redo A;
2896 } elsif ($self->{nc} == -1) {
2897 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2898 if ($self->{in_subset}) {
2899
2900 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2901 } else {
2902
2903 $self->{state} = DATA_STATE;
2904 $self->{s_kwd} = '';
2905 }
2906 ## reconsume
2907
2908 return ($self->{ct}); # comment
2909
2910 redo A;
2911 } else {
2912
2913 $self->{ct}->{data} .= chr ($self->{nc}); # comment
2914 $self->{read_until}->($self->{ct}->{data},
2915 q[-],
2916 length $self->{ct}->{data});
2917
2918 ## Stay in the state
2919
2920 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2921 $self->{line_prev} = $self->{line};
2922 $self->{column_prev} = $self->{column};
2923 $self->{column}++;
2924 $self->{nc}
2925 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2926 } else {
2927 $self->{set_nc}->($self);
2928 }
2929
2930 redo A;
2931 }
2932 } elsif ($self->{state} == COMMENT_END_DASH_STATE) {
2933 ## XML5: "Comment dash state" and "DOCTYPE comment dash state".
2934
2935 if ($self->{nc} == 0x002D) { # -
2936
2937 $self->{state} = COMMENT_END_STATE;
2938
2939 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2940 $self->{line_prev} = $self->{line};
2941 $self->{column_prev} = $self->{column};
2942 $self->{column}++;
2943 $self->{nc}
2944 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2945 } else {
2946 $self->{set_nc}->($self);
2947 }
2948
2949 redo A;
2950 } elsif ($self->{nc} == -1) {
2951 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2952 if ($self->{in_subset}) {
2953
2954 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2955 } else {
2956
2957 $self->{state} = DATA_STATE;
2958 $self->{s_kwd} = '';
2959 }
2960 ## reconsume
2961
2962 return ($self->{ct}); # comment
2963
2964 redo A;
2965 } else {
2966
2967 $self->{ct}->{data} .= '-' . chr ($self->{nc}); # comment
2968 $self->{state} = COMMENT_STATE;
2969
2970 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2971 $self->{line_prev} = $self->{line};
2972 $self->{column_prev} = $self->{column};
2973 $self->{column}++;
2974 $self->{nc}
2975 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2976 } else {
2977 $self->{set_nc}->($self);
2978 }
2979
2980 redo A;
2981 }
2982 } elsif ($self->{state} == COMMENT_END_STATE or
2983 $self->{state} == COMMENT_END_BANG_STATE) {
2984 ## XML5: "Comment end state" and "DOCTYPE comment end state".
2985 ## (No comment end bang state.)
2986
2987 if ($self->{nc} == 0x003E) { # >
2988 if ($self->{in_subset}) {
2989
2990 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2991 } else {
2992
2993 $self->{state} = DATA_STATE;
2994 $self->{s_kwd} = '';
2995 }
2996
2997 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2998 $self->{line_prev} = $self->{line};
2999 $self->{column_prev} = $self->{column};
3000 $self->{column}++;
3001 $self->{nc}
3002 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3003 } else {
3004 $self->{set_nc}->($self);
3005 }
3006
3007
3008 return ($self->{ct}); # comment
3009
3010 redo A;
3011 } elsif ($self->{nc} == 0x002D) { # -
3012 if ($self->{state} == COMMENT_END_BANG_STATE) {
3013
3014 $self->{ct}->{data} .= '--!'; # comment
3015 $self->{state} = COMMENT_END_DASH_STATE;
3016 } else {
3017
3018 ## XML5: Not a parse error.
3019 $self->{parse_error}->(level => $self->{level}->{must}, type => 'dash in comment',
3020 line => $self->{line_prev},
3021 column => $self->{column_prev});
3022 $self->{ct}->{data} .= '-'; # comment
3023 ## Stay in the state
3024 }
3025
3026 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3027 $self->{line_prev} = $self->{line};
3028 $self->{column_prev} = $self->{column};
3029 $self->{column}++;
3030 $self->{nc}
3031 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3032 } else {
3033 $self->{set_nc}->($self);
3034 }
3035
3036 redo A;
3037 } elsif ($self->{state} != COMMENT_END_BANG_STATE and
3038 $is_space->{$self->{nc}}) {
3039
3040 $self->{parse_error}->(level => $self->{level}->{must}, type => 'comment end space'); # XXX error type
3041 $self->{ct}->{data} .= '--' . chr ($self->{nc}); # comment
3042 $self->{state} = COMMENT_END_SPACE_STATE;
3043
3044 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3045 $self->{line_prev} = $self->{line};
3046 $self->{column_prev} = $self->{column};
3047 $self->{column}++;
3048 $self->{nc}
3049 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3050 } else {
3051 $self->{set_nc}->($self);
3052 }
3053
3054 redo A;
3055 } elsif ($self->{state} != COMMENT_END_BANG_STATE and
3056 $self->{nc} == 0x0021) { # !
3057
3058 $self->{parse_error}->(level => $self->{level}->{must}, type => 'comment end bang'); # XXX error type
3059 $self->{state} = COMMENT_END_BANG_STATE;
3060
3061 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3062 $self->{line_prev} = $self->{line};
3063 $self->{column_prev} = $self->{column};
3064 $self->{column}++;
3065 $self->{nc}
3066 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3067 } else {
3068 $self->{set_nc}->($self);
3069 }
3070
3071 redo A;
3072 } elsif ($self->{nc} == -1) {
3073 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
3074 if ($self->{in_subset}) {
3075
3076 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3077 } else {
3078
3079 $self->{state} = DATA_STATE;
3080 $self->{s_kwd} = '';
3081 }
3082 ## Reconsume.
3083
3084 return ($self->{ct}); # comment
3085
3086 redo A;
3087 } else {
3088
3089 if ($self->{state} == COMMENT_END_BANG_STATE) {
3090 $self->{ct}->{data} .= '--!' . chr ($self->{nc}); # comment
3091 } else {
3092 $self->{ct}->{data} .= '--' . chr ($self->{nc}); # comment
3093 }
3094 $self->{state} = COMMENT_STATE;
3095
3096 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3097 $self->{line_prev} = $self->{line};
3098 $self->{column_prev} = $self->{column};
3099 $self->{column}++;
3100 $self->{nc}
3101 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3102 } else {
3103 $self->{set_nc}->($self);
3104 }
3105
3106 redo A;
3107 }
3108 } elsif ($self->{state} == COMMENT_END_SPACE_STATE) {
3109 ## XML5: This state does not exist.
3110
3111 if ($self->{nc} == 0x003E) { # >
3112 if ($self->{in_subset}) {
3113
3114 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3115 } else {
3116
3117 $self->{state} = DATA_STATE;
3118 $self->{s_kwd} = '';
3119 }
3120
3121 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3122 $self->{line_prev} = $self->{line};
3123 $self->{column_prev} = $self->{column};
3124 $self->{column}++;
3125 $self->{nc}
3126 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3127 } else {
3128 $self->{set_nc}->($self);
3129 }
3130
3131
3132 return ($self->{ct}); # comment
3133
3134 redo A;
3135 } elsif ($is_space->{$self->{nc}}) {
3136
3137 $self->{ct}->{data} .= chr ($self->{nc}); # comment
3138 ## Stay in the state.
3139
3140 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3141 $self->{line_prev} = $self->{line};
3142 $self->{column_prev} = $self->{column};
3143 $self->{column}++;
3144 $self->{nc}
3145 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3146 } else {
3147 $self->{set_nc}->($self);
3148 }
3149
3150 redo A;
3151 } elsif ($self->{nc} == -1) {
3152 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
3153 if ($self->{in_subset}) {
3154
3155 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3156 } else {
3157
3158 $self->{state} = DATA_STATE;
3159 $self->{s_kwd} = '';
3160 }
3161 ## Reconsume.
3162
3163 return ($self->{ct}); # comment
3164
3165 redo A;
3166 } else {
3167
3168 $self->{ct}->{data} .= chr ($self->{nc}); # comment
3169 $self->{state} = COMMENT_STATE;
3170
3171 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3172 $self->{line_prev} = $self->{line};
3173 $self->{column_prev} = $self->{column};
3174 $self->{column}++;
3175 $self->{nc}
3176 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3177 } else {
3178 $self->{set_nc}->($self);
3179 }
3180
3181 redo A;
3182 }
3183 } elsif ($self->{state} == DOCTYPE_STATE) {
3184 if ($is_space->{$self->{nc}}) {
3185
3186 $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
3187
3188 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3189 $self->{line_prev} = $self->{line};
3190 $self->{column_prev} = $self->{column};
3191 $self->{column}++;
3192 $self->{nc}
3193 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3194 } else {
3195 $self->{set_nc}->($self);
3196 }
3197
3198 redo A;
3199 } elsif ($self->{nc} == -1) {
3200
3201 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
3202 $self->{ct}->{quirks} = 1;
3203
3204 $self->{state} = DATA_STATE;
3205 ## Reconsume.
3206 return ($self->{ct}); # DOCTYPE (quirks)
3207
3208 redo A;
3209 } else {
3210
3211 ## XML5: Switch to the bogus comment state.
3212 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before DOCTYPE name');
3213 $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
3214 ## reconsume
3215 redo A;
3216 }
3217 } elsif ($self->{state} == BEFORE_DOCTYPE_NAME_STATE) {
3218 ## XML5: "DOCTYPE root name before state".
3219
3220 if ($is_space->{$self->{nc}}) {
3221
3222 ## Stay in the state
3223
3224 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3225 $self->{line_prev} = $self->{line};
3226 $self->{column_prev} = $self->{column};
3227 $self->{column}++;
3228 $self->{nc}
3229 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3230 } else {
3231 $self->{set_nc}->($self);
3232 }
3233
3234 redo A;
3235 } elsif ($self->{nc} == 0x003E) { # >
3236
3237 ## XML5: No parse error.
3238 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no DOCTYPE name');
3239 $self->{state} = DATA_STATE;
3240 $self->{s_kwd} = '';
3241
3242 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3243 $self->{line_prev} = $self->{line};
3244 $self->{column_prev} = $self->{column};
3245 $self->{column}++;
3246 $self->{nc}
3247 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3248 } else {
3249 $self->{set_nc}->($self);
3250 }
3251
3252
3253 return ($self->{ct}); # DOCTYPE (quirks)
3254
3255 redo A;
3256 } elsif (0x0041 <= $self->{nc} and $self->{nc} <= 0x005A) { # A..Z
3257
3258 $self->{ct}->{name} # DOCTYPE
3259 = chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020));
3260 delete $self->{ct}->{quirks};
3261 $self->{state} = DOCTYPE_NAME_STATE;
3262
3263 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3264 $self->{line_prev} = $self->{line};
3265 $self->{column_prev} = $self->{column};
3266 $self->{column}++;
3267 $self->{nc}
3268 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3269 } else {
3270 $self->{set_nc}->($self);
3271 }
3272
3273 redo A;
3274 } elsif ($self->{nc} == -1) {
3275
3276 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no DOCTYPE name');
3277 $self->{state} = DATA_STATE;
3278 $self->{s_kwd} = '';
3279 ## reconsume
3280
3281 return ($self->{ct}); # DOCTYPE (quirks)
3282
3283 redo A;
3284 } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
3285
3286 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no DOCTYPE name');
3287 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3288 $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
3289 $self->{in_subset} = 1;
3290
3291 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3292 $self->{line_prev} = $self->{line};
3293 $self->{column_prev} = $self->{column};
3294 $self->{column}++;
3295 $self->{nc}
3296 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3297 } else {
3298 $self->{set_nc}->($self);
3299 }
3300
3301 return ($self->{ct}); # DOCTYPE
3302 redo A;
3303 } else {
3304
3305 $self->{ct}->{name} = chr $self->{nc};
3306 delete $self->{ct}->{quirks};
3307 $self->{state} = DOCTYPE_NAME_STATE;
3308
3309 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3310 $self->{line_prev} = $self->{line};
3311 $self->{column_prev} = $self->{column};
3312 $self->{column}++;
3313 $self->{nc}
3314 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3315 } else {
3316 $self->{set_nc}->($self);
3317 }
3318
3319 redo A;
3320 }
3321 } elsif ($self->{state} == DOCTYPE_NAME_STATE) {
3322 ## XML5: "DOCTYPE root name state".
3323
3324 ## ISSUE: Redundant "First," in the spec.
3325
3326 if ($is_space->{$self->{nc}}) {
3327
3328 $self->{state} = AFTER_DOCTYPE_NAME_STATE;
3329
3330 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3331 $self->{line_prev} = $self->{line};
3332 $self->{column_prev} = $self->{column};
3333 $self->{column}++;
3334 $self->{nc}
3335 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3336 } else {
3337 $self->{set_nc}->($self);
3338 }
3339
3340 redo A;
3341 } elsif ($self->{nc} == 0x003E) { # >
3342
3343 $self->{state} = DATA_STATE;
3344 $self->{s_kwd} = '';
3345
3346 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3347 $self->{line_prev} = $self->{line};
3348 $self->{column_prev} = $self->{column};
3349 $self->{column}++;
3350 $self->{nc}
3351 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3352 } else {
3353 $self->{set_nc}->($self);
3354 }
3355
3356
3357 return ($self->{ct}); # DOCTYPE
3358
3359 redo A;
3360 } elsif (0x0041 <= $self->{nc} and $self->{nc} <= 0x005A) { # A..Z
3361
3362 $self->{ct}->{name} # DOCTYPE
3363 .= chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020));
3364 delete $self->{ct}->{quirks};
3365 ## Stay in the state.
3366
3367 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3368 $self->{line_prev} = $self->{line};
3369 $self->{column_prev} = $self->{column};
3370 $self->{column}++;
3371 $self->{nc}
3372 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3373 } else {
3374 $self->{set_nc}->($self);
3375 }
3376
3377 redo A;
3378 } elsif ($self->{nc} == -1) {
3379
3380 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
3381 $self->{state} = DATA_STATE;
3382 $self->{s_kwd} = '';
3383 ## reconsume
3384
3385 $self->{ct}->{quirks} = 1;
3386 return ($self->{ct}); # DOCTYPE
3387
3388 redo A;
3389 } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
3390
3391 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3392 $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
3393 $self->{in_subset} = 1;
3394
3395 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3396 $self->{line_prev} = $self->{line};
3397 $self->{column_prev} = $self->{column};
3398 $self->{column}++;
3399 $self->{nc}
3400 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3401 } else {
3402 $self->{set_nc}->($self);
3403 }
3404
3405 return ($self->{ct}); # DOCTYPE
3406 redo A;
3407 } else {
3408
3409 $self->{ct}->{name} .= chr ($self->{nc}); # DOCTYPE
3410 ## Stay in the state.
3411
3412 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3413 $self->{line_prev} = $self->{line};
3414 $self->{column_prev} = $self->{column};
3415 $self->{column}++;
3416 $self->{nc}
3417 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3418 } else {
3419 $self->{set_nc}->($self);
3420 }
3421
3422 redo A;
3423 }
3424 } elsif ($self->{state} == AFTER_DOCTYPE_NAME_STATE) {
3425 ## XML5: Corresponding to XML5's "DOCTYPE root name after
3426 ## state", but implemented differently.
3427
3428 if ($is_space->{$self->{nc}}) {
3429
3430 ## Stay in the state
3431
3432 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3433 $self->{line_prev} = $self->{line};
3434 $self->{column_prev} = $self->{column};
3435 $self->{column}++;
3436 $self->{nc}
3437 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3438 } else {
3439 $self->{set_nc}->($self);
3440 }
3441
3442 redo A;
3443 } elsif ($self->{nc} == 0x003E) { # >
3444 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3445
3446 $self->{state} = DATA_STATE;
3447 $self->{s_kwd} = '';
3448 } else {
3449
3450 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no md def'); ## TODO: type
3451 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3452 }
3453
3454
3455 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3456 $self->{line_prev} = $self->{line};
3457 $self->{column_prev} = $self->{column};
3458 $self->{column}++;
3459 $self->{nc}
3460 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3461 } else {
3462 $self->{set_nc}->($self);
3463 }
3464
3465 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
3466 redo A;
3467 } elsif ($self->{nc} == -1) {
3468 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3469
3470 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
3471 $self->{state} = DATA_STATE;
3472 $self->{s_kwd} = '';
3473 $self->{ct}->{quirks} = 1;
3474 } else {
3475
3476 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
3477 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3478 }
3479
3480 ## Reconsume.
3481 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
3482 redo A;
3483 } elsif ($self->{nc} == 0x0050 or # P
3484 $self->{nc} == 0x0070) { # p
3485
3486 $self->{state} = PUBLIC_STATE;
3487 $self->{kwd} = chr $self->{nc};
3488
3489 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3490 $self->{line_prev} = $self->{line};
3491 $self->{column_prev} = $self->{column};
3492 $self->{column}++;
3493 $self->{nc}
3494 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3495 } else {
3496 $self->{set_nc}->($self);
3497 }
3498
3499 redo A;
3500 } elsif ($self->{nc} == 0x0053 or # S
3501 $self->{nc} == 0x0073) { # s
3502
3503 $self->{state} = SYSTEM_STATE;
3504 $self->{kwd} = chr $self->{nc};
3505
3506 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3507 $self->{line_prev} = $self->{line};
3508 $self->{column_prev} = $self->{column};
3509 $self->{column}++;
3510 $self->{nc}
3511 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3512 } else {
3513 $self->{set_nc}->($self);
3514 }
3515
3516 redo A;
3517 } elsif ($self->{nc} == 0x0022 and # "
3518 ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN or
3519 $self->{ct}->{type} == PARAMETER_ENTITY_TOKEN)) {
3520
3521 $self->{state} = DOCTYPE_ENTITY_VALUE_DOUBLE_QUOTED_STATE;
3522 $self->{ct}->{value} = ''; # ENTITY
3523
3524 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3525 $self->{line_prev} = $self->{line};
3526 $self->{column_prev} = $self->{column};
3527 $self->{column}++;
3528 $self->{nc}
3529 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3530 } else {
3531 $self->{set_nc}->($self);
3532 }
3533
3534 redo A;
3535 } elsif ($self->{nc} == 0x0027 and # '
3536 ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN or
3537 $self->{ct}->{type} == PARAMETER_ENTITY_TOKEN)) {
3538
3539 $self->{state} = DOCTYPE_ENTITY_VALUE_SINGLE_QUOTED_STATE;
3540 $self->{ct}->{value} = ''; # ENTITY
3541
3542 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3543 $self->{line_prev} = $self->{line};
3544 $self->{column_prev} = $self->{column};
3545 $self->{column}++;
3546 $self->{nc}
3547 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3548 } else {
3549 $self->{set_nc}->($self);
3550 }
3551
3552 redo A;
3553 } elsif ($self->{is_xml} and
3554 $self->{ct}->{type} == DOCTYPE_TOKEN and
3555 $self->{nc} == 0x005B) { # [
3556
3557 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3558 $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
3559 $self->{in_subset} = 1;
3560
3561 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3562 $self->{line_prev} = $self->{line};
3563 $self->{column_prev} = $self->{column};
3564 $self->{column}++;
3565 $self->{nc}
3566 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3567 } else {
3568 $self->{set_nc}->($self);
3569 }
3570
3571 return ($self->{ct}); # DOCTYPE
3572 redo A;
3573 } else {
3574 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after DOCTYPE name'); ## TODO: type
3575
3576 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3577
3578 $self->{ct}->{quirks} = 1;
3579 $self->{state} = BOGUS_DOCTYPE_STATE;
3580 } else {
3581
3582 $self->{state} = BOGUS_MD_STATE;
3583 }
3584
3585
3586 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3587 $self->{line_prev} = $self->{line};
3588 $self->{column_prev} = $self->{column};
3589 $self->{column}++;
3590 $self->{nc}
3591 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3592 } else {
3593 $self->{set_nc}->($self);
3594 }
3595
3596 redo A;
3597 }
3598 } elsif ($self->{state} == PUBLIC_STATE) {
3599 ## ASCII case-insensitive
3600 if ($self->{nc} == [
3601 undef,
3602 0x0055, # U
3603 0x0042, # B
3604 0x004C, # L
3605 0x0049, # I
3606 ]->[length $self->{kwd}] or
3607 $self->{nc} == [
3608 undef,
3609 0x0075, # u
3610 0x0062, # b
3611 0x006C, # l
3612 0x0069, # i
3613 ]->[length $self->{kwd}]) {
3614
3615 ## Stay in the state.
3616 $self->{kwd} .= chr $self->{nc};
3617
3618 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3619 $self->{line_prev} = $self->{line};
3620 $self->{column_prev} = $self->{column};
3621 $self->{column}++;
3622 $self->{nc}
3623 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3624 } else {
3625 $self->{set_nc}->($self);
3626 }
3627
3628 redo A;
3629 } elsif ((length $self->{kwd}) == 5 and
3630 ($self->{nc} == 0x0043 or # C
3631 $self->{nc} == 0x0063)) { # c
3632 if ($self->{is_xml} and
3633 ($self->{kwd} ne 'PUBLI' or $self->{nc} == 0x0063)) { # c
3634
3635 $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
3636 text => 'PUBLIC',
3637 line => $self->{line_prev},
3638 column => $self->{column_prev} - 4);
3639 } else {
3640
3641 }
3642 $self->{state} = BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
3643
3644 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3645 $self->{line_prev} = $self->{line};
3646 $self->{column_prev} = $self->{column};
3647 $self->{column}++;
3648 $self->{nc}
3649 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3650 } else {
3651 $self->{set_nc}->($self);
3652 }
3653
3654 redo A;
3655 } else {
3656 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after DOCTYPE name', ## TODO: type
3657 line => $self->{line_prev},
3658 column => $self->{column_prev} + 1 - length $self->{kwd});
3659 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3660
3661 $self->{ct}->{quirks} = 1;
3662 $self->{state} = BOGUS_DOCTYPE_STATE;
3663 } else {
3664
3665 $self->{state} = BOGUS_MD_STATE;
3666 }
3667 ## Reconsume.
3668 redo A;
3669 }
3670 } elsif ($self->{state} == SYSTEM_STATE) {
3671 ## ASCII case-insensitive
3672 if ($self->{nc} == [
3673 undef,
3674 0x0059, # Y
3675 0x0053, # S
3676 0x0054, # T
3677 0x0045, # E
3678 ]->[length $self->{kwd}] or
3679 $self->{nc} == [
3680 undef,
3681 0x0079, # y
3682 0x0073, # s
3683 0x0074, # t
3684 0x0065, # e
3685 ]->[length $self->{kwd}]) {
3686
3687 ## Stay in the state.
3688 $self->{kwd} .= chr $self->{nc};
3689
3690 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3691 $self->{line_prev} = $self->{line};
3692 $self->{column_prev} = $self->{column};
3693 $self->{column}++;
3694 $self->{nc}
3695 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3696 } else {
3697 $self->{set_nc}->($self);
3698 }
3699
3700 redo A;
3701 } elsif ((length $self->{kwd}) == 5 and
3702 ($self->{nc} == 0x004D or # M
3703 $self->{nc} == 0x006D)) { # m
3704 if ($self->{is_xml} and
3705 ($self->{kwd} ne 'SYSTE' or $self->{nc} == 0x006D)) { # m
3706
3707 $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
3708 text => 'SYSTEM',
3709 line => $self->{line_prev},
3710 column => $self->{column_prev} - 4);
3711 } else {
3712
3713 }
3714 $self->{state} = BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
3715
3716 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3717 $self->{line_prev} = $self->{line};
3718 $self->{column_prev} = $self->{column};
3719 $self->{column}++;
3720 $self->{nc}
3721 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3722 } else {
3723 $self->{set_nc}->($self);
3724 }
3725
3726 redo A;
3727 } else {
3728 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after DOCTYPE name', ## TODO: type
3729 line => $self->{line_prev},
3730 column => $self->{column_prev} + 1 - length $self->{kwd});
3731 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3732
3733 $self->{ct}->{quirks} = 1;
3734 $self->{state} = BOGUS_DOCTYPE_STATE;
3735 } else {
3736
3737 $self->{state} = BOGUS_MD_STATE;
3738 }
3739 ## Reconsume.
3740 redo A;
3741 }
3742 } elsif ($self->{state} == BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE) {
3743 if ($is_space->{$self->{nc}}) {
3744
3745 ## Stay in the state
3746
3747 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3748 $self->{line_prev} = $self->{line};
3749 $self->{column_prev} = $self->{column};
3750 $self->{column}++;
3751 $self->{nc}
3752 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3753 } else {
3754 $self->{set_nc}->($self);
3755 }
3756
3757 redo A;
3758 } elsif ($self->{nc} eq 0x0022) { # "
3759
3760 $self->{ct}->{pubid} = ''; # DOCTYPE
3761 $self->{state} = DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE;
3762
3763 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3764 $self->{line_prev} = $self->{line};
3765 $self->{column_prev} = $self->{column};
3766 $self->{column}++;
3767 $self->{nc}
3768 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3769 } else {
3770 $self->{set_nc}->($self);
3771 }
3772
3773 redo A;
3774 } elsif ($self->{nc} eq 0x0027) { # '
3775
3776 $self->{ct}->{pubid} = ''; # DOCTYPE
3777 $self->{state} = DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE;
3778
3779 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3780 $self->{line_prev} = $self->{line};
3781 $self->{column_prev} = $self->{column};
3782 $self->{column}++;
3783 $self->{nc}
3784 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3785 } else {
3786 $self->{set_nc}->($self);
3787 }
3788
3789 redo A;
3790 } elsif ($self->{nc} eq 0x003E) { # >
3791 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no PUBLIC literal');
3792
3793 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3794
3795 $self->{state} = DATA_STATE;
3796 $self->{s_kwd} = '';
3797 $self->{ct}->{quirks} = 1;
3798 } else {
3799
3800 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3801 }
3802
3803
3804 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3805 $self->{line_prev} = $self->{line};
3806 $self->{column_prev} = $self->{column};
3807 $self->{column}++;
3808 $self->{nc}
3809 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3810 } else {
3811 $self->{set_nc}->($self);
3812 }
3813
3814 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
3815 redo A;
3816 } elsif ($self->{nc} == -1) {
3817 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3818
3819 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
3820 $self->{state} = DATA_STATE;
3821 $self->{s_kwd} = '';
3822 $self->{ct}->{quirks} = 1;
3823 } else {
3824
3825 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
3826 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3827 }
3828
3829 ## reconsume
3830 return ($self->{ct}); # DOCTYPE
3831 redo A;
3832 } elsif ($self->{is_xml} and
3833 $self->{ct}->{type} == DOCTYPE_TOKEN and
3834 $self->{nc} == 0x005B) { # [
3835
3836 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no PUBLIC literal');
3837 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3838 $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
3839 $self->{in_subset} = 1;
3840
3841 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3842 $self->{line_prev} = $self->{line};
3843 $self->{column_prev} = $self->{column};
3844 $self->{column}++;
3845 $self->{nc}
3846 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3847 } else {
3848 $self->{set_nc}->($self);
3849 }
3850
3851 return ($self->{ct}); # DOCTYPE
3852 redo A;
3853 } else {
3854 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after PUBLIC');
3855
3856 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3857
3858 $self->{ct}->{quirks} = 1;
3859 $self->{state} = BOGUS_DOCTYPE_STATE;
3860 } else {
3861
3862 $self->{state} = BOGUS_MD_STATE;
3863 }
3864
3865
3866 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3867 $self->{line_prev} = $self->{line};
3868 $self->{column_prev} = $self->{column};
3869 $self->{column}++;
3870 $self->{nc}
3871 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3872 } else {
3873 $self->{set_nc}->($self);
3874 }
3875
3876 redo A;
3877 }
3878 } elsif ($self->{state} == DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE) {
3879 if ($self->{nc} == 0x0022) { # "
3880
3881 $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
3882
3883 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3884 $self->{line_prev} = $self->{line};
3885 $self->{column_prev} = $self->{column};
3886 $self->{column}++;
3887 $self->{nc}
3888 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3889 } else {
3890 $self->{set_nc}->($self);
3891 }
3892
3893 redo A;
3894 } elsif ($self->{nc} == 0x003E) { # >
3895 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');
3896
3897 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3898
3899 $self->{state} = DATA_STATE;
3900 $self->{s_kwd} = '';
3901 $self->{ct}->{quirks} = 1;
3902 } else {
3903
3904 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3905 }
3906
3907
3908 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3909 $self->{line_prev} = $self->{line};
3910 $self->{column_prev} = $self->{column};
3911 $self->{column}++;
3912 $self->{nc}
3913 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3914 } else {
3915 $self->{set_nc}->($self);
3916 }
3917
3918 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
3919 redo A;
3920 } elsif ($self->{nc} == -1) {
3921 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');
3922
3923 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3924
3925 $self->{state} = DATA_STATE;
3926 $self->{s_kwd} = '';
3927 $self->{ct}->{quirks} = 1;
3928 } else {
3929
3930 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3931 }
3932
3933 ## Reconsume.
3934 return ($self->{ct}); # DOCTYPE
3935 redo A;
3936 } else {
3937
3938 $self->{ct}->{pubid} .= chr $self->{nc}; # DOCTYPE/ENTITY/NOTATION
3939 $self->{read_until}->($self->{ct}->{pubid}, q[">],
3940 length $self->{ct}->{pubid});
3941
3942 ## Stay in the state
3943
3944 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3945 $self->{line_prev} = $self->{line};
3946 $self->{column_prev} = $self->{column};
3947 $self->{column}++;
3948 $self->{nc}
3949 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3950 } else {
3951 $self->{set_nc}->($self);
3952 }
3953
3954 redo A;
3955 }
3956 } elsif ($self->{state} == DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE) {
3957 if ($self->{nc} == 0x0027) { # '
3958
3959 $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
3960
3961 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3962 $self->{line_prev} = $self->{line};
3963 $self->{column_prev} = $self->{column};
3964 $self->{column}++;
3965 $self->{nc}
3966 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3967 } else {
3968 $self->{set_nc}->($self);
3969 }
3970
3971 redo A;
3972 } elsif ($self->{nc} == 0x003E) { # >
3973 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');
3974
3975 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3976
3977 $self->{state} = DATA_STATE;
3978 $self->{s_kwd} = '';
3979 $self->{ct}->{quirks} = 1;
3980 } else {
3981
3982 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3983 }
3984
3985
3986 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3987 $self->{line_prev} = $self->{line};
3988 $self->{column_prev} = $self->{column};
3989 $self->{column}++;
3990 $self->{nc}
3991 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3992 } else {
3993 $self->{set_nc}->($self);
3994 }
3995
3996 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
3997 redo A;
3998 } elsif ($self->{nc} == -1) {
3999 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');
4000
4001 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4002
4003 $self->{state} = DATA_STATE;
4004 $self->{s_kwd} = '';
4005 $self->{ct}->{quirks} = 1;
4006 } else {
4007
4008 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4009 }
4010
4011 ## reconsume
4012 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
4013 redo A;
4014 } else {
4015
4016 $self->{ct}->{pubid} .= chr $self->{nc}; # DOCTYPE/ENTITY/NOTATION
4017 $self->{read_until}->($self->{ct}->{pubid}, q['>],
4018 length $self->{ct}->{pubid});
4019
4020 ## Stay in the state
4021
4022 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4023 $self->{line_prev} = $self->{line};
4024 $self->{column_prev} = $self->{column};
4025 $self->{column}++;
4026 $self->{nc}
4027 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4028 } else {
4029 $self->{set_nc}->($self);
4030 }
4031
4032 redo A;
4033 }
4034 } elsif ($self->{state} == AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE) {
4035 if ($is_space->{$self->{nc}}) {
4036
4037 ## Stay in the state
4038
4039 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4040 $self->{line_prev} = $self->{line};
4041 $self->{column_prev} = $self->{column};
4042 $self->{column}++;
4043 $self->{nc}
4044 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4045 } else {
4046 $self->{set_nc}->($self);
4047 }
4048
4049 redo A;
4050 } elsif ($self->{nc} == 0x0022) { # "
4051
4052 $self->{ct}->{sysid} = ''; # DOCTYPE/ENTITY/NOTATION
4053 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
4054
4055 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4056 $self->{line_prev} = $self->{line};
4057 $self->{column_prev} = $self->{column};
4058 $self->{column}++;
4059 $self->{nc}
4060 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4061 } else {
4062 $self->{set_nc}->($self);
4063 }
4064
4065 redo A;
4066 } elsif ($self->{nc} == 0x0027) { # '
4067
4068 $self->{ct}->{sysid} = ''; # DOCTYPE/ENTITY/NOTATION
4069 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
4070
4071 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4072 $self->{line_prev} = $self->{line};
4073 $self->{column_prev} = $self->{column};
4074 $self->{column}++;
4075 $self->{nc}
4076 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4077 } else {
4078 $self->{set_nc}->($self);
4079 }
4080
4081 redo A;
4082 } elsif ($self->{nc} == 0x003E) { # >
4083 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4084 if ($self->{is_xml}) {
4085
4086 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal');
4087 } else {
4088
4089 }
4090 $self->{state} = DATA_STATE;
4091 $self->{s_kwd} = '';
4092 } else {
4093 if ($self->{ct}->{type} == NOTATION_TOKEN) {
4094
4095 } else {
4096
4097 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal');
4098 }
4099 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4100 }
4101
4102
4103 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4104 $self->{line_prev} = $self->{line};
4105 $self->{column_prev} = $self->{column};
4106 $self->{column}++;
4107 $self->{nc}
4108 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4109 } else {
4110 $self->{set_nc}->($self);
4111 }
4112
4113 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
4114 redo A;
4115 } elsif ($self->{nc} == -1) {
4116 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4117
4118 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
4119
4120 $self->{state} = DATA_STATE;
4121 $self->{s_kwd} = '';
4122 $self->{ct}->{quirks} = 1;
4123 } else {
4124 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
4125 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4126 }
4127
4128 ## reconsume
4129 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
4130 redo A;
4131 } elsif ($self->{is_xml} and
4132 $self->{ct}->{type} == DOCTYPE_TOKEN and
4133 $self->{nc} == 0x005B) { # [
4134
4135 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal');
4136 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4137 $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
4138 $self->{in_subset} = 1;
4139
4140 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4141 $self->{line_prev} = $self->{line};
4142 $self->{column_prev} = $self->{column};
4143 $self->{column}++;
4144 $self->{nc}
4145 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4146 } else {
4147 $self->{set_nc}->($self);
4148 }
4149
4150 return ($self->{ct}); # DOCTYPE
4151 redo A;
4152 } else {
4153 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after PUBLIC literal');
4154
4155 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4156
4157 $self->{ct}->{quirks} = 1;
4158 $self->{state} = BOGUS_DOCTYPE_STATE;
4159 } else {
4160
4161 $self->{state} = BOGUS_MD_STATE;
4162 }
4163
4164
4165 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4166 $self->{line_prev} = $self->{line};
4167 $self->{column_prev} = $self->{column};
4168 $self->{column}++;
4169 $self->{nc}
4170 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4171 } else {
4172 $self->{set_nc}->($self);
4173 }
4174
4175 redo A;
4176 }
4177 } elsif ($self->{state} == BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
4178 if ($is_space->{$self->{nc}}) {
4179
4180 ## Stay in the state
4181
4182 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4183 $self->{line_prev} = $self->{line};
4184 $self->{column_prev} = $self->{column};
4185 $self->{column}++;
4186 $self->{nc}
4187 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4188 } else {
4189 $self->{set_nc}->($self);
4190 }
4191
4192 redo A;
4193 } elsif ($self->{nc} == 0x0022) { # "
4194
4195 $self->{ct}->{sysid} = ''; # DOCTYPE
4196 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
4197
4198 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4199 $self->{line_prev} = $self->{line};
4200 $self->{column_prev} = $self->{column};
4201 $self->{column}++;
4202 $self->{nc}
4203 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4204 } else {
4205 $self->{set_nc}->($self);
4206 }
4207
4208 redo A;
4209 } elsif ($self->{nc} == 0x0027) { # '
4210
4211 $self->{ct}->{sysid} = ''; # DOCTYPE
4212 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
4213
4214 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4215 $self->{line_prev} = $self->{line};
4216 $self->{column_prev} = $self->{column};
4217 $self->{column}++;
4218 $self->{nc}
4219 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4220 } else {
4221 $self->{set_nc}->($self);
4222 }
4223
4224 redo A;
4225 } elsif ($self->{nc} == 0x003E) { # >
4226 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal');
4227
4228 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4229 $self->{line_prev} = $self->{line};
4230 $self->{column_prev} = $self->{column};
4231 $self->{column}++;
4232 $self->{nc}
4233 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4234 } else {
4235 $self->{set_nc}->($self);
4236 }
4237
4238
4239 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4240
4241 $self->{state} = DATA_STATE;
4242 $self->{s_kwd} = '';
4243 $self->{ct}->{quirks} = 1;
4244 } else {
4245
4246 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4247 }
4248
4249 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
4250 redo A;
4251 } elsif ($self->{nc} == -1) {
4252 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4253
4254 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
4255 $self->{state} = DATA_STATE;
4256 $self->{s_kwd} = '';
4257 $self->{ct}->{quirks} = 1;
4258 } else {
4259
4260 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
4261 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4262 }
4263
4264 ## reconsume
4265 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
4266 redo A;
4267 } elsif ($self->{is_xml} and
4268 $self->{ct}->{type} == DOCTYPE_TOKEN and
4269 $self->{nc} == 0x005B) { # [
4270
4271 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal');
4272
4273 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4274 $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
4275 $self->{in_subset} = 1;
4276
4277 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4278 $self->{line_prev} = $self->{line};
4279 $self->{column_prev} = $self->{column};
4280 $self->{column}++;
4281 $self->{nc}
4282 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4283 } else {
4284 $self->{set_nc}->($self);
4285 }
4286
4287 return ($self->{ct}); # DOCTYPE
4288 redo A;
4289 } else {
4290 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after SYSTEM');
4291
4292 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4293
4294 $self->{ct}->{quirks} = 1;
4295 $self->{state} = BOGUS_DOCTYPE_STATE;
4296 } else {
4297
4298 $self->{state} = BOGUS_MD_STATE;
4299 }
4300
4301
4302 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4303 $self->{line_prev} = $self->{line};
4304 $self->{column_prev} = $self->{column};
4305 $self->{column}++;
4306 $self->{nc}
4307 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4308 } else {
4309 $self->{set_nc}->($self);
4310 }
4311
4312 redo A;
4313 }
4314 } elsif ($self->{state} == DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE) {
4315 if ($self->{nc} == 0x0022) { # "
4316
4317 $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
4318
4319 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4320 $self->{line_prev} = $self->{line};
4321 $self->{column_prev} = $self->{column};
4322 $self->{column}++;
4323 $self->{nc}
4324 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4325 } else {
4326 $self->{set_nc}->($self);
4327 }
4328
4329 redo A;
4330 } elsif (not $self->{is_xml} and $self->{nc} == 0x003E) { # >
4331 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');
4332
4333 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4334
4335 $self->{state} = DATA_STATE;
4336 $self->{s_kwd} = '';
4337 $self->{ct}->{quirks} = 1;
4338 } else {
4339
4340 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4341 }
4342
4343
4344 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4345 $self->{line_prev} = $self->{line};
4346 $self->{column_prev} = $self->{column};
4347 $self->{column}++;
4348 $self->{nc}
4349 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4350 } else {
4351 $self->{set_nc}->($self);
4352 }
4353
4354 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
4355 redo A;
4356 } elsif ($self->{nc} == -1) {
4357 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');
4358
4359 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4360
4361 $self->{state} = DATA_STATE;
4362 $self->{s_kwd} = '';
4363 $self->{ct}->{quirks} = 1;
4364 } else {
4365
4366 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4367 }
4368
4369 ## reconsume
4370 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
4371 redo A;
4372 } else {
4373
4374 $self->{ct}->{sysid} .= chr $self->{nc}; # DOCTYPE/ENTITY/NOTATION
4375 $self->{read_until}->($self->{ct}->{sysid}, q[">],
4376 length $self->{ct}->{sysid});
4377
4378 ## Stay in the state
4379
4380 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4381 $self->{line_prev} = $self->{line};
4382 $self->{column_prev} = $self->{column};
4383 $self->{column}++;
4384 $self->{nc}
4385 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4386 } else {
4387 $self->{set_nc}->($self);
4388 }
4389
4390 redo A;
4391 }
4392 } elsif ($self->{state} == DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE) {
4393 if ($self->{nc} == 0x0027) { # '
4394
4395 $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
4396
4397 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4398 $self->{line_prev} = $self->{line};
4399 $self->{column_prev} = $self->{column};
4400 $self->{column}++;
4401 $self->{nc}
4402 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4403 } else {
4404 $self->{set_nc}->($self);
4405 }
4406
4407 redo A;
4408 } elsif (not $self->{is_xml} and $self->{nc} == 0x003E) { # >
4409
4410 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');
4411
4412 $self->{state} = DATA_STATE;
4413 $self->{s_kwd} = '';
4414
4415 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4416 $self->{line_prev} = $self->{line};
4417 $self->{column_prev} = $self->{column};
4418 $self->{column}++;
4419 $self->{nc}
4420 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4421 } else {
4422 $self->{set_nc}->($self);
4423 }
4424
4425
4426 $self->{ct}->{quirks} = 1;
4427 return ($self->{ct}); # DOCTYPE
4428
4429 redo A;
4430 } elsif ($self->{nc} == -1) {
4431 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');
4432
4433 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4434
4435 $self->{state} = DATA_STATE;
4436 $self->{s_kwd} = '';
4437 $self->{ct}->{quirks} = 1;
4438 } else {
4439
4440 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4441 }
4442
4443 ## reconsume
4444 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
4445 redo A;
4446 } else {
4447
4448 $self->{ct}->{sysid} .= chr $self->{nc}; # DOCTYPE/ENTITY/NOTATION
4449 $self->{read_until}->($self->{ct}->{sysid}, q['>],
4450 length $self->{ct}->{sysid});
4451
4452 ## Stay in the state
4453
4454 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4455 $self->{line_prev} = $self->{line};
4456 $self->{column_prev} = $self->{column};
4457 $self->{column}++;
4458 $self->{nc}
4459 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4460 } else {
4461 $self->{set_nc}->($self);
4462 }
4463
4464 redo A;
4465 }
4466 } elsif ($self->{state} == AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
4467 if ($is_space->{$self->{nc}}) {
4468 if ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN) {
4469
4470 $self->{state} = BEFORE_NDATA_STATE;
4471 } else {
4472
4473 ## Stay in the state
4474 }
4475
4476 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4477 $self->{line_prev} = $self->{line};
4478 $self->{column_prev} = $self->{column};
4479 $self->{column}++;
4480 $self->{nc}
4481 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4482 } else {
4483 $self->{set_nc}->($self);
4484 }
4485
4486 redo A;
4487 } elsif ($self->{nc} == 0x003E) { # >
4488 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4489
4490 $self->{state} = DATA_STATE;
4491 $self->{s_kwd} = '';
4492 } else {
4493
4494 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4495 }
4496
4497
4498 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4499 $self->{line_prev} = $self->{line};
4500 $self->{column_prev} = $self->{column};
4501 $self->{column}++;
4502 $self->{nc}
4503 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4504 } else {
4505 $self->{set_nc}->($self);
4506 }
4507
4508 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
4509 redo A;
4510 } elsif ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN and
4511 ($self->{nc} == 0x004E or # N
4512 $self->{nc} == 0x006E)) { # n
4513
4514 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before NDATA'); ## TODO: type
4515 $self->{state} = NDATA_STATE;
4516 $self->{kwd} = chr $self->{nc};
4517
4518 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4519 $self->{line_prev} = $self->{line};
4520 $self->{column_prev} = $self->{column};
4521 $self->{column}++;
4522 $self->{nc}
4523 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4524 } else {
4525 $self->{set_nc}->($self);
4526 }
4527
4528 redo A;
4529 } elsif ($self->{nc} == -1) {
4530 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4531
4532 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
4533 $self->{state} = DATA_STATE;
4534 $self->{s_kwd} = '';
4535 $self->{ct}->{quirks} = 1;
4536 } else {
4537
4538 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
4539 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4540 }
4541
4542 ## reconsume
4543 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
4544 redo A;
4545 } elsif ($self->{is_xml} and
4546 $self->{ct}->{type} == DOCTYPE_TOKEN and
4547 $self->{nc} == 0x005B) { # [
4548
4549 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4550 $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
4551 $self->{in_subset} = 1;
4552
4553 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4554 $self->{line_prev} = $self->{line};
4555 $self->{column_prev} = $self->{column};
4556 $self->{column}++;
4557 $self->{nc}
4558 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4559 } else {
4560 $self->{set_nc}->($self);
4561 }
4562
4563 return ($self->{ct}); # DOCTYPE
4564 redo A;
4565 } else {
4566 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after SYSTEM literal');
4567
4568 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4569
4570 #$self->{ct}->{quirks} = 1;
4571 $self->{state} = BOGUS_DOCTYPE_STATE;
4572 } else {
4573
4574 $self->{state} = BOGUS_MD_STATE;
4575 }
4576
4577
4578 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4579 $self->{line_prev} = $self->{line};
4580 $self->{column_prev} = $self->{column};
4581 $self->{column}++;
4582 $self->{nc}
4583 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4584 } else {
4585 $self->{set_nc}->($self);
4586 }
4587
4588 redo A;
4589 }
4590 } elsif ($self->{state} == BEFORE_NDATA_STATE) {
4591 if ($is_space->{$self->{nc}}) {
4592
4593 ## Stay in the state.
4594
4595 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4596 $self->{line_prev} = $self->{line};
4597 $self->{column_prev} = $self->{column};
4598 $self->{column}++;
4599 $self->{nc}
4600 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4601 } else {
4602 $self->{set_nc}->($self);
4603 }
4604
4605 redo A;
4606 } elsif ($self->{nc} == 0x003E) { # >
4607
4608 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4609
4610 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4611 $self->{line_prev} = $self->{line};
4612 $self->{column_prev} = $self->{column};
4613 $self->{column}++;
4614 $self->{nc}
4615 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4616 } else {
4617 $self->{set_nc}->($self);
4618 }
4619
4620 return ($self->{ct}); # ENTITY
4621 redo A;
4622 } elsif ($self->{nc} == 0x004E or # N
4623 $self->{nc} == 0x006E) { # n
4624
4625 $self->{state} = NDATA_STATE;
4626 $self->{kwd} = chr $self->{nc};
4627
4628 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4629 $self->{line_prev} = $self->{line};
4630 $self->{column_prev} = $self->{column};
4631 $self->{column}++;
4632 $self->{nc}
4633 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4634 } else {
4635 $self->{set_nc}->($self);
4636 }
4637
4638 redo A;
4639 } elsif ($self->{nc} == -1) {
4640
4641 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
4642 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4643 ## reconsume
4644 return ($self->{ct}); # ENTITY
4645 redo A;
4646 } else {
4647
4648 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after SYSTEM literal');
4649 $self->{state} = BOGUS_MD_STATE;
4650
4651 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4652 $self->{line_prev} = $self->{line};
4653 $self->{column_prev} = $self->{column};
4654 $self->{column}++;
4655 $self->{nc}
4656 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4657 } else {
4658 $self->{set_nc}->($self);
4659 }
4660
4661 redo A;
4662 }
4663 } elsif ($self->{state} == BOGUS_DOCTYPE_STATE) {
4664 if ($self->{nc} == 0x003E) { # >
4665
4666 $self->{state} = DATA_STATE;
4667 $self->{s_kwd} = '';
4668
4669 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4670 $self->{line_prev} = $self->{line};
4671 $self->{column_prev} = $self->{column};
4672 $self->{column}++;
4673 $self->{nc}
4674 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4675 } else {
4676 $self->{set_nc}->($self);
4677 }
4678
4679
4680 return ($self->{ct}); # DOCTYPE
4681
4682 redo A;
4683 } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
4684
4685 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4686 $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
4687 $self->{in_subset} = 1;
4688
4689 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4690 $self->{line_prev} = $self->{line};
4691 $self->{column_prev} = $self->{column};
4692 $self->{column}++;
4693 $self->{nc}
4694 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4695 } else {
4696 $self->{set_nc}->($self);
4697 }
4698
4699 return ($self->{ct}); # DOCTYPE
4700 redo A;
4701 } elsif ($self->{nc} == -1) {
4702
4703 $self->{state} = DATA_STATE;
4704 $self->{s_kwd} = '';
4705 ## reconsume
4706
4707 return ($self->{ct}); # DOCTYPE
4708
4709 redo A;
4710 } else {
4711
4712 my $s = '';
4713 $self->{read_until}->($s, q{>[}, 0);
4714
4715 ## Stay in the state
4716
4717 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4718 $self->{line_prev} = $self->{line};
4719 $self->{column_prev} = $self->{column};
4720 $self->{column}++;
4721 $self->{nc}
4722 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4723 } else {
4724 $self->{set_nc}->($self);
4725 }
4726
4727 redo A;
4728 }
4729 } elsif ($self->{state} == CDATA_SECTION_STATE) {
4730 ## NOTE: "CDATA section state" in the spec is jointly implemented
4731 ## by three states, |CDATA_SECTION_STATE|, |CDATA_SECTION_MSE1_STATE|,
4732 ## and |CDATA_SECTION_MSE2_STATE|.
4733
4734 ## XML5: "CDATA state".
4735
4736 if ($self->{nc} == 0x005D) { # ]
4737
4738 $self->{state} = CDATA_SECTION_MSE1_STATE;
4739
4740 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4741 $self->{line_prev} = $self->{line};
4742 $self->{column_prev} = $self->{column};
4743 $self->{column}++;
4744 $self->{nc}
4745 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4746 } else {
4747 $self->{set_nc}->($self);
4748 }
4749
4750 redo A;
4751 } elsif ($self->{nc} == -1) {
4752 if ($self->{is_xml}) {
4753
4754 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no mse'); ## TODO: type
4755 } else {
4756
4757 }
4758
4759 $self->{state} = DATA_STATE;
4760 $self->{s_kwd} = '';
4761 ## Reconsume.
4762 if (length $self->{ct}->{data}) { # character
4763
4764 return ($self->{ct}); # character
4765 } else {
4766
4767 ## No token to emit. $self->{ct} is discarded.
4768 }
4769 redo A;
4770 } else {
4771
4772 $self->{ct}->{data} .= chr $self->{nc};
4773 $self->{read_until}->($self->{ct}->{data},
4774 q<]>,
4775 length $self->{ct}->{data});
4776
4777 ## Stay in the state.
4778
4779 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4780 $self->{line_prev} = $self->{line};
4781 $self->{column_prev} = $self->{column};
4782 $self->{column}++;
4783 $self->{nc}
4784 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4785 } else {
4786 $self->{set_nc}->($self);
4787 }
4788
4789 redo A;
4790 }
4791
4792 ## ISSUE: "text tokens" in spec.
4793 } elsif ($self->{state} == CDATA_SECTION_MSE1_STATE) {
4794 ## XML5: "CDATA bracket state".
4795
4796 if ($self->{nc} == 0x005D) { # ]
4797
4798 $self->{state} = CDATA_SECTION_MSE2_STATE;
4799
4800 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4801 $self->{line_prev} = $self->{line};
4802 $self->{column_prev} = $self->{column};
4803 $self->{column}++;
4804 $self->{nc}
4805 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4806 } else {
4807 $self->{set_nc}->($self);
4808 }
4809
4810 redo A;
4811 } else {
4812
4813 ## XML5: If EOF, "]" is not appended and changed to the data state.
4814 $self->{ct}->{data} .= ']';
4815 $self->{state} = CDATA_SECTION_STATE; ## XML5: Stay in the state.
4816 ## Reconsume.
4817 redo A;
4818 }
4819 } elsif ($self->{state} == CDATA_SECTION_MSE2_STATE) {
4820 ## XML5: "CDATA end state".
4821
4822 if ($self->{nc} == 0x003E) { # >
4823 $self->{state} = DATA_STATE;
4824 $self->{s_kwd} = '';
4825
4826 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4827 $self->{line_prev} = $self->{line};
4828 $self->{column_prev} = $self->{column};
4829 $self->{column}++;
4830 $self->{nc}
4831 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4832 } else {
4833 $self->{set_nc}->($self);
4834 }
4835
4836 if (length $self->{ct}->{data}) { # character
4837
4838 return ($self->{ct}); # character
4839 } else {
4840
4841 ## No token to emit. $self->{ct} is discarded.
4842 }
4843 redo A;
4844 } elsif ($self->{nc} == 0x005D) { # ]
4845 # character
4846 $self->{ct}->{data} .= ']'; ## Add first "]" of "]]]".
4847 ## Stay in the state.
4848
4849 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4850 $self->{line_prev} = $self->{line};
4851 $self->{column_prev} = $self->{column};
4852 $self->{column}++;
4853 $self->{nc}
4854 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4855 } else {
4856 $self->{set_nc}->($self);
4857 }
4858
4859 redo A;
4860 } else {
4861
4862 $self->{ct}->{data} .= ']]'; # character
4863 $self->{state} = CDATA_SECTION_STATE;
4864 ## Reconsume. ## XML5: Emit.
4865 redo A;
4866 }
4867 } elsif ($self->{state} == ENTITY_STATE) {
4868 if ($is_space->{$self->{nc}} or
4869 {
4870 0x003C => 1, 0x0026 => 1, -1 => 1, # <, &
4871 $self->{entity_add} => 1,
4872 }->{$self->{nc}}) {
4873 if ($self->{is_xml}) {
4874
4875 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare ero',
4876 line => $self->{line_prev},
4877 column => $self->{column_prev}
4878 + ($self->{nc} == -1 ? 1 : 0));
4879 } else {
4880
4881 ## No error
4882 }
4883 ## Don't consume
4884 ## Return nothing.
4885 #
4886 } elsif ($self->{nc} == 0x0023) { # #
4887
4888 $self->{state} = ENTITY_HASH_STATE;
4889 $self->{kwd} = '#';
4890
4891 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4892 $self->{line_prev} = $self->{line};
4893 $self->{column_prev} = $self->{column};
4894 $self->{column}++;
4895 $self->{nc}
4896 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4897 } else {
4898 $self->{set_nc}->($self);
4899 }
4900
4901 redo A;
4902 } elsif ($self->{is_xml} or
4903 (0x0041 <= $self->{nc} and
4904 $self->{nc} <= 0x005A) or # A..Z
4905 (0x0061 <= $self->{nc} and
4906 $self->{nc} <= 0x007A)) { # a..z
4907
4908 require Whatpm::_NamedEntityList;
4909 $self->{state} = ENTITY_NAME_STATE;
4910 $self->{kwd} = chr $self->{nc};
4911 $self->{entity__value} = $self->{kwd};
4912 $self->{entity__match} = 0;
4913
4914 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4915 $self->{line_prev} = $self->{line};
4916 $self->{column_prev} = $self->{column};
4917 $self->{column}++;
4918 $self->{nc}
4919 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4920 } else {
4921 $self->{set_nc}->($self);
4922 }
4923
4924 redo A;
4925 } else {
4926
4927 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare ero');
4928 ## Return nothing.
4929 #
4930 }
4931
4932 ## NOTE: No character is consumed by the "consume a character
4933 ## reference" algorithm. In other words, there is an "&" character
4934 ## that does not introduce a character reference, which would be
4935 ## appended to the parent element or the attribute value in later
4936 ## processing by the tokenizer.
4937
4938 if ($self->{prev_state} == DATA_STATE) {
4939
4940 $self->{state} = $self->{prev_state};
4941 $self->{s_kwd} = '';
4942 ## Reconsume.
4943 return ({type => CHARACTER_TOKEN, data => '&',
4944 line => $self->{line_prev},
4945 column => $self->{column_prev},
4946 });
4947 redo A;
4948 } else {
4949
4950 $self->{ca}->{value} .= '&';
4951 $self->{state} = $self->{prev_state};
4952 $self->{s_kwd} = '';
4953 ## Reconsume.
4954 redo A;
4955 }
4956 } elsif ($self->{state} == ENTITY_HASH_STATE) {
4957 if ($self->{nc} == 0x0078) { # x
4958
4959 $self->{state} = HEXREF_X_STATE;
4960 $self->{kwd} .= chr $self->{nc};
4961
4962 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4963 $self->{line_prev} = $self->{line};
4964 $self->{column_prev} = $self->{column};
4965 $self->{column}++;
4966 $self->{nc}
4967 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4968 } else {
4969 $self->{set_nc}->($self);
4970 }
4971
4972 redo A;
4973 } elsif ($self->{nc} == 0x0058) { # X
4974
4975 if ($self->{is_xml}) {
4976 $self->{parse_error}->(level => $self->{level}->{must}, type => 'uppercase hcro'); ## TODO: type
4977 }
4978 $self->{state} = HEXREF_X_STATE;
4979 $self->{kwd} .= chr $self->{nc};
4980
4981 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4982 $self->{line_prev} = $self->{line};
4983 $self->{column_prev} = $self->{column};
4984 $self->{column}++;
4985 $self->{nc}
4986 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4987 } else {
4988 $self->{set_nc}->($self);
4989 }
4990
4991 redo A;
4992 } elsif (0x0030 <= $self->{nc} and
4993 $self->{nc} <= 0x0039) { # 0..9
4994
4995 $self->{state} = NCR_NUM_STATE;
4996 $self->{kwd} = $self->{nc} - 0x0030;
4997
4998 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4999 $self->{line_prev} = $self->{line};
5000 $self->{column_prev} = $self->{column};
5001 $self->{column}++;
5002 $self->{nc}
5003 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5004 } else {
5005 $self->{set_nc}->($self);
5006 }
5007
5008 redo A;
5009 } else {
5010 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare nero',
5011 line => $self->{line_prev},
5012 column => $self->{column_prev} - 1);
5013
5014 ## NOTE: According to the spec algorithm, nothing is returned,
5015 ## and then "&#" is appended to the parent element or the attribute
5016 ## value in the later processing.
5017
5018 if ($self->{prev_state} == DATA_STATE) {
5019
5020 $self->{state} = $self->{prev_state};
5021 $self->{s_kwd} = '';
5022 ## Reconsume.
5023 return ({type => CHARACTER_TOKEN,
5024 data => '&#',
5025 line => $self->{line_prev},
5026 column => $self->{column_prev} - 1,
5027 });
5028 redo A;
5029 } else {
5030
5031 $self->{ca}->{value} .= '&#';
5032 $self->{state} = $self->{prev_state};
5033 $self->{s_kwd} = '';
5034 ## Reconsume.
5035 redo A;
5036 }
5037 }
5038 } elsif ($self->{state} == NCR_NUM_STATE) {
5039 if (0x0030 <= $self->{nc} and
5040 $self->{nc} <= 0x0039) { # 0..9
5041
5042 $self->{kwd} *= 10;
5043 $self->{kwd} += $self->{nc} - 0x0030;
5044
5045 ## Stay in the state.
5046
5047 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5048 $self->{line_prev} = $self->{line};
5049 $self->{column_prev} = $self->{column};
5050 $self->{column}++;
5051 $self->{nc}
5052 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5053 } else {
5054 $self->{set_nc}->($self);
5055 }
5056
5057 redo A;
5058 } elsif ($self->{nc} == 0x003B) { # ;
5059
5060
5061 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5062 $self->{line_prev} = $self->{line};
5063 $self->{column_prev} = $self->{column};
5064 $self->{column}++;
5065 $self->{nc}
5066 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5067 } else {
5068 $self->{set_nc}->($self);
5069 }
5070
5071 #
5072 } else {
5073
5074 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no refc');
5075 ## Reconsume.
5076 #
5077 }
5078
5079 my $code = $self->{kwd};
5080 my $l = $self->{line_prev};
5081 my $c = $self->{column_prev};
5082 if ((not $self->{is_xml} and $charref_map->{$code}) or
5083 ($self->{is_xml} and 0xD800 <= $code and $code <= 0xDFFF) or
5084 ($self->{is_xml} and $code == 0x0000)) {
5085
5086 $self->{parse_error}->(level => $self->{level}->{must}, type => 'invalid character reference',
5087 text => (sprintf 'U+%04X', $code),
5088 line => $l, column => $c);
5089 $code = $charref_map->{$code};
5090 } elsif ($code > 0x10FFFF) {
5091
5092 $self->{parse_error}->(level => $self->{level}->{must}, type => 'invalid character reference',
5093 text => (sprintf 'U-%08X', $code),
5094 line => $l, column => $c);
5095 $code = 0xFFFD;
5096 }
5097
5098 if ($self->{prev_state} == DATA_STATE) {
5099
5100 $self->{state} = $self->{prev_state};
5101 $self->{s_kwd} = '';
5102 ## Reconsume.
5103 return ({type => CHARACTER_TOKEN, data => chr $code,
5104 has_reference => 1,
5105 line => $l, column => $c,
5106 });
5107 redo A;
5108 } else {
5109
5110 $self->{ca}->{value} .= chr $code;
5111 $self->{ca}->{has_reference} = 1;
5112 $self->{state} = $self->{prev_state};
5113 $self->{s_kwd} = '';
5114 ## Reconsume.
5115 redo A;
5116 }
5117 } elsif ($self->{state} == HEXREF_X_STATE) {
5118 if ((0x0030 <= $self->{nc} and $self->{nc} <= 0x0039) or
5119 (0x0041 <= $self->{nc} and $self->{nc} <= 0x0046) or
5120 (0x0061 <= $self->{nc} and $self->{nc} <= 0x0066)) {
5121 # 0..9, A..F, a..f
5122
5123 $self->{state} = HEXREF_HEX_STATE;
5124 $self->{kwd} = 0;
5125 ## Reconsume.
5126 redo A;
5127 } else {
5128 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare hcro',
5129 line => $self->{line_prev},
5130 column => $self->{column_prev} - 2);
5131
5132 ## NOTE: According to the spec algorithm, nothing is returned,
5133 ## and then "&#" followed by "X" or "x" is appended to the parent
5134 ## element or the attribute value in the later processing.
5135
5136 if ($self->{prev_state} == DATA_STATE) {
5137
5138 $self->{state} = $self->{prev_state};
5139 $self->{s_kwd} = '';
5140 ## Reconsume.
5141 return ({type => CHARACTER_TOKEN,
5142 data => '&' . $self->{kwd},
5143 line => $self->{line_prev},
5144 column => $self->{column_prev} - length $self->{kwd},
5145 });
5146 redo A;
5147 } else {
5148
5149 $self->{ca}->{value} .= '&' . $self->{kwd};
5150 $self->{state} = $self->{prev_state};
5151 $self->{s_kwd} = '';
5152 ## Reconsume.
5153 redo A;
5154 }
5155 }
5156 } elsif ($self->{state} == HEXREF_HEX_STATE) {
5157 if (0x0030 <= $self->{nc} and $self->{nc} <= 0x0039) {
5158 # 0..9
5159
5160 $self->{kwd} *= 0x10;
5161 $self->{kwd} += $self->{nc} - 0x0030;
5162 ## Stay in the state.
5163
5164 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5165 $self->{line_prev} = $self->{line};
5166 $self->{column_prev} = $self->{column};
5167 $self->{column}++;
5168 $self->{nc}
5169 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5170 } else {
5171 $self->{set_nc}->($self);
5172 }
5173
5174 redo A;
5175 } elsif (0x0061 <= $self->{nc} and
5176 $self->{nc} <= 0x0066) { # a..f
5177
5178 $self->{kwd} *= 0x10;
5179 $self->{kwd} += $self->{nc} - 0x0060 + 9;
5180 ## Stay in the state.
5181
5182 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5183 $self->{line_prev} = $self->{line};
5184 $self->{column_prev} = $self->{column};
5185 $self->{column}++;
5186 $self->{nc}
5187 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5188 } else {
5189 $self->{set_nc}->($self);
5190 }
5191
5192 redo A;
5193 } elsif (0x0041 <= $self->{nc} and
5194 $self->{nc} <= 0x0046) { # A..F
5195
5196 $self->{kwd} *= 0x10;
5197 $self->{kwd} += $self->{nc} - 0x0040 + 9;
5198 ## Stay in the state.
5199
5200 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5201 $self->{line_prev} = $self->{line};
5202 $self->{column_prev} = $self->{column};
5203 $self->{column}++;
5204 $self->{nc}
5205 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5206 } else {
5207 $self->{set_nc}->($self);
5208 }
5209
5210 redo A;
5211 } elsif ($self->{nc} == 0x003B) { # ;
5212
5213
5214 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5215 $self->{line_prev} = $self->{line};
5216 $self->{column_prev} = $self->{column};
5217 $self->{column}++;
5218 $self->{nc}
5219 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5220 } else {
5221 $self->{set_nc}->($self);
5222 }
5223
5224 #
5225 } else {
5226
5227 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no refc',
5228 line => $self->{line},
5229 column => $self->{column});
5230 ## Reconsume.
5231 #
5232 }
5233
5234 my $code = $self->{kwd};
5235 my $l = $self->{line_prev};
5236 my $c = $self->{column_prev};
5237 if ((not $self->{is_xml} and $charref_map->{$code}) or
5238 ($self->{is_xml} and 0xD800 <= $code and $code <= 0xDFFF) or
5239 ($self->{is_xml} and $code == 0x0000)) {
5240
5241 $self->{parse_error}->(level => $self->{level}->{must}, type => 'invalid character reference',
5242 text => (sprintf 'U+%04X', $code),
5243 line => $l, column => $c);
5244 $code = $charref_map->{$code};
5245 } elsif ($code > 0x10FFFF) {
5246
5247 $self->{parse_error}->(level => $self->{level}->{must}, type => 'invalid character reference',
5248 text => (sprintf 'U-%08X', $code),
5249 line => $l, column => $c);
5250 $code = 0xFFFD;
5251 }
5252
5253 if ($self->{prev_state} == DATA_STATE) {
5254
5255 $self->{state} = $self->{prev_state};
5256 $self->{s_kwd} = '';
5257 ## Reconsume.
5258 return ({type => CHARACTER_TOKEN, data => chr $code,
5259 has_reference => 1,
5260 line => $l, column => $c,
5261 });
5262 redo A;
5263 } else {
5264
5265 $self->{ca}->{value} .= chr $code;
5266 $self->{ca}->{has_reference} = 1;
5267 $self->{state} = $self->{prev_state};
5268 $self->{s_kwd} = '';
5269 ## Reconsume.
5270 redo A;
5271 }
5272 } elsif ($self->{state} == ENTITY_NAME_STATE) {
5273 if ((0x0041 <= $self->{nc} and # A
5274 $self->{nc} <= 0x005A) or # Z
5275 (0x0061 <= $self->{nc} and # a
5276 $self->{nc} <= 0x007A) or # z
5277 (0x0030 <= $self->{nc} and # 0
5278 $self->{nc} <= 0x0039) or # 9
5279 $self->{nc} == 0x003B or # ;
5280 ($self->{is_xml} and
5281 not ($is_space->{$self->{nc}} or
5282 {
5283 0x003C => 1, 0x0026 => 1, -1 => 1, # <, &
5284 $self->{entity_add} => 1,
5285 }->{$self->{nc}}))) {
5286 our $EntityChar;
5287 $self->{kwd} .= chr $self->{nc};
5288 if (defined $EntityChar->{$self->{kwd}} or
5289 $self->{ge}->{$self->{kwd}}) {
5290 if ($self->{nc} == 0x003B) { # ;
5291 if (defined $self->{ge}->{$self->{kwd}}) {
5292 if ($self->{ge}->{$self->{kwd}}->{only_text}) {
5293
5294 $self->{entity__value} = $self->{ge}->{$self->{kwd}}->{value};
5295 } else {
5296 if (defined $self->{ge}->{$self->{kwd}}->{notation}) {
5297
5298 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unparsed entity', ## TODO: type
5299 value => $self->{kwd});
5300 } else {
5301
5302 }
5303 $self->{entity__value} = '&' . $self->{kwd}; ## TODO: expand
5304 }
5305 } else {
5306 if ($self->{is_xml}) {
5307
5308 $self->{parse_error}->(level => $self->{level}->{must}, type => 'entity not declared', ## TODO: type
5309 value => $self->{kwd},
5310 level => {
5311 'amp;' => $self->{level}->{warn},
5312 'quot;' => $self->{level}->{warn},
5313 'lt;' => $self->{level}->{warn},
5314 'gt;' => $self->{level}->{warn},
5315 'apos;' => $self->{level}->{warn},
5316 }->{$self->{kwd}} ||
5317 $self->{level}->{must});
5318 } else {
5319
5320 }
5321 $self->{entity__value} = $EntityChar->{$self->{kwd}};
5322 }
5323 $self->{entity__match} = 1;
5324
5325 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5326 $self->{line_prev} = $self->{line};
5327 $self->{column_prev} = $self->{column};
5328 $self->{column}++;
5329 $self->{nc}
5330 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5331 } else {
5332 $self->{set_nc}->($self);
5333 }
5334
5335 #
5336 } else {
5337
5338 $self->{entity__value} = $EntityChar->{$self->{kwd}};
5339 $self->{entity__match} = -1;
5340 ## Stay in the state.
5341
5342 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5343 $self->{line_prev} = $self->{line};
5344 $self->{column_prev} = $self->{column};
5345 $self->{column}++;
5346 $self->{nc}
5347 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5348 } else {
5349 $self->{set_nc}->($self);
5350 }
5351
5352 redo A;
5353 }
5354 } else {
5355
5356 $self->{entity__value} .= chr $self->{nc};
5357 $self->{entity__match} *= 2;
5358 ## Stay in the state.
5359
5360 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5361 $self->{line_prev} = $self->{line};
5362 $self->{column_prev} = $self->{column};
5363 $self->{column}++;
5364 $self->{nc}
5365 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5366 } else {
5367 $self->{set_nc}->($self);
5368 }
5369
5370 redo A;
5371 }
5372 }
5373
5374 my $data;
5375 my $has_ref;
5376 if ($self->{entity__match} > 0) {
5377
5378 $data = $self->{entity__value};
5379 $has_ref = 1;
5380 #
5381 } elsif ($self->{entity__match} < 0) {
5382 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no refc');
5383 if ($self->{prev_state} != DATA_STATE and # in attribute
5384 $self->{entity__match} < -1) {
5385
5386 $data = '&' . $self->{kwd};
5387 #
5388 } else {
5389
5390 $data = $self->{entity__value};
5391 $has_ref = 1;
5392 #
5393 }
5394 } else {
5395
5396 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare ero',
5397 line => $self->{line_prev},
5398 column => $self->{column_prev} - length $self->{kwd});
5399 $data = '&' . $self->{kwd};
5400 #
5401 }
5402
5403 ## NOTE: In these cases, when a character reference is found,
5404 ## it is consumed and a character token is returned, or, otherwise,
5405 ## nothing is consumed and returned, according to the spec algorithm.
5406 ## In this implementation, anything that has been examined by the
5407 ## tokenizer is appended to the parent element or the attribute value
5408 ## as string, either literal string when no character reference or
5409 ## entity-replaced string otherwise, in this stage, since any characters
5410 ## that would not be consumed are appended in the data state or in an
5411 ## appropriate attribute value state anyway.
5412
5413 if ($self->{prev_state} == DATA_STATE) {
5414
5415 $self->{state} = $self->{prev_state};
5416 $self->{s_kwd} = '';
5417 ## Reconsume.
5418 return ({type => CHARACTER_TOKEN,
5419 data => $data,
5420 has_reference => $has_ref,
5421 line => $self->{line_prev},
5422 column => $self->{column_prev} + 1 - length $self->{kwd},
5423 });
5424 redo A;
5425 } else {
5426
5427 $self->{ca}->{value} .= $data;
5428 $self->{ca}->{has_reference} = 1 if $has_ref;
5429 $self->{state} = $self->{prev_state};
5430 $self->{s_kwd} = '';
5431 ## Reconsume.
5432 redo A;
5433 }
5434
5435 ## XML-only states
5436
5437 } elsif ($self->{state} == PI_STATE) {
5438 ## XML5: "Pi state" and "DOCTYPE pi state".
5439
5440 if ($is_space->{$self->{nc}} or
5441 $self->{nc} == 0x003F or # ?
5442 $self->{nc} == -1) {
5443 ## XML5: U+003F: "pi state": Same as "Anything else"; "DOCTYPE
5444 ## pi state": Switch to the "DOCTYPE pi after state". EOF:
5445 ## "DOCTYPE pi state": Parse error, switch to the "data
5446 ## state".
5447 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare pio', ## TODO: type
5448 line => $self->{line_prev},
5449 column => $self->{column_prev}
5450 - 1 * ($self->{nc} != -1));
5451 $self->{state} = BOGUS_COMMENT_STATE;
5452 ## Reconsume.
5453 $self->{ct} = {type => COMMENT_TOKEN,
5454 data => '?',
5455 line => $self->{line_prev},
5456 column => $self->{column_prev}
5457 - 1 * ($self->{nc} != -1),
5458 };
5459 redo A;
5460 } else {
5461 ## XML5: "DOCTYPE pi state": Stay in the state.
5462 $self->{ct} = {type => PI_TOKEN,
5463 target => chr $self->{nc},
5464 data => '',
5465 line => $self->{line_prev},
5466 column => $self->{column_prev} - 1,
5467 };
5468 $self->{state} = PI_TARGET_STATE;
5469
5470 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5471 $self->{line_prev} = $self->{line};
5472 $self->{column_prev} = $self->{column};
5473 $self->{column}++;
5474 $self->{nc}
5475 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5476 } else {
5477 $self->{set_nc}->($self);
5478 }
5479
5480 redo A;
5481 }
5482 } elsif ($self->{state} == PI_TARGET_STATE) {
5483 if ($is_space->{$self->{nc}}) {
5484 $self->{state} = PI_TARGET_AFTER_STATE;
5485
5486 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5487 $self->{line_prev} = $self->{line};
5488 $self->{column_prev} = $self->{column};
5489 $self->{column}++;
5490 $self->{nc}
5491 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5492 } else {
5493 $self->{set_nc}->($self);
5494 }
5495
5496 redo A;
5497 } elsif ($self->{nc} == -1) {
5498 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no pic'); ## TODO: type
5499 if ($self->{in_subset}) {
5500 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5501 } else {
5502 $self->{state} = DATA_STATE;
5503 $self->{s_kwd} = '';
5504 }
5505 ## Reconsume.
5506 return ($self->{ct}); # pi
5507 redo A;
5508 } elsif ($self->{nc} == 0x003F) { # ?
5509 $self->{state} = PI_AFTER_STATE;
5510
5511 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5512 $self->{line_prev} = $self->{line};
5513 $self->{column_prev} = $self->{column};
5514 $self->{column}++;
5515 $self->{nc}
5516 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5517 } else {
5518 $self->{set_nc}->($self);
5519 }
5520
5521 redo A;
5522 } else {
5523 ## XML5: typo ("tag name" -> "target")
5524 $self->{ct}->{target} .= chr $self->{nc}; # pi
5525
5526 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5527 $self->{line_prev} = $self->{line};
5528 $self->{column_prev} = $self->{column};
5529 $self->{column}++;
5530 $self->{nc}
5531 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5532 } else {
5533 $self->{set_nc}->($self);
5534 }
5535
5536 redo A;
5537 }
5538 } elsif ($self->{state} == PI_TARGET_AFTER_STATE) {
5539 if ($is_space->{$self->{nc}}) {
5540 ## Stay in the state.
5541
5542 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5543 $self->{line_prev} = $self->{line};
5544 $self->{column_prev} = $self->{column};
5545 $self->{column}++;
5546 $self->{nc}
5547 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5548 } else {
5549 $self->{set_nc}->($self);
5550 }
5551
5552 redo A;
5553 } else {
5554 $self->{state} = PI_DATA_STATE;
5555 ## Reprocess.
5556 redo A;
5557 }
5558 } elsif ($self->{state} == PI_DATA_STATE) {
5559 if ($self->{nc} == 0x003F) { # ?
5560 $self->{state} = PI_DATA_AFTER_STATE;
5561
5562 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5563 $self->{line_prev} = $self->{line};
5564 $self->{column_prev} = $self->{column};
5565 $self->{column}++;
5566 $self->{nc}
5567 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5568 } else {
5569 $self->{set_nc}->($self);
5570 }
5571
5572 redo A;
5573 } elsif ($self->{nc} == -1) {
5574 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no pic'); ## TODO: type
5575 if ($self->{in_subset}) {
5576 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state"
5577 } else {
5578 $self->{state} = DATA_STATE;
5579 $self->{s_kwd} = '';
5580 }
5581 ## Reprocess.
5582 return ($self->{ct}); # pi
5583 redo A;
5584 } else {
5585 $self->{ct}->{data} .= chr $self->{nc}; # pi
5586 $self->{read_until}->($self->{ct}->{data}, q[?],
5587 length $self->{ct}->{data});
5588 ## Stay in the state.
5589
5590 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5591 $self->{line_prev} = $self->{line};
5592 $self->{column_prev} = $self->{column};
5593 $self->{column}++;
5594 $self->{nc}
5595 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5596 } else {
5597 $self->{set_nc}->($self);
5598 }
5599
5600 ## Reprocess.
5601 redo A;
5602 }
5603 } elsif ($self->{state} == PI_AFTER_STATE) {
5604 ## XML5: Part of "Pi after state".
5605
5606 if ($self->{nc} == 0x003E) { # >
5607 if ($self->{in_subset}) {
5608 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5609 } else {
5610 $self->{state} = DATA_STATE;
5611 $self->{s_kwd} = '';
5612 }
5613
5614 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5615 $self->{line_prev} = $self->{line};
5616 $self->{column_prev} = $self->{column};
5617 $self->{column}++;
5618 $self->{nc}
5619 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5620 } else {
5621 $self->{set_nc}->($self);
5622 }
5623
5624 return ($self->{ct}); # pi
5625 redo A;
5626 } elsif ($self->{nc} == 0x003F) { # ?
5627 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no s after target', ## TODO: type
5628 line => $self->{line_prev},
5629 column => $self->{column_prev}); ## XML5: no error
5630 $self->{ct}->{data} .= '?';
5631 $self->{state} = PI_DATA_AFTER_STATE;
5632
5633 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5634 $self->{line_prev} = $self->{line};
5635 $self->{column_prev} = $self->{column};
5636 $self->{column}++;
5637 $self->{nc}
5638 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5639 } else {
5640 $self->{set_nc}->($self);
5641 }
5642
5643 redo A;
5644 } else {
5645 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no s after target', ## TODO: type
5646 line => $self->{line_prev},
5647 column => $self->{column_prev}
5648 + 1 * ($self->{nc} == -1)); ## XML5: no error
5649 $self->{ct}->{data} .= '?'; ## XML5: not appended
5650 $self->{state} = PI_DATA_STATE;
5651 ## Reprocess.
5652 redo A;
5653 }
5654 } elsif ($self->{state} == PI_DATA_AFTER_STATE) {
5655 ## XML5: Same as "pi after state" and "DOCTYPE pi after state".
5656
5657 if ($self->{nc} == 0x003E) { # >
5658 if ($self->{in_subset}) {
5659 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5660 } else {
5661 $self->{state} = DATA_STATE;
5662 $self->{s_kwd} = '';
5663 }
5664
5665 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5666 $self->{line_prev} = $self->{line};
5667 $self->{column_prev} = $self->{column};
5668 $self->{column}++;
5669 $self->{nc}
5670 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5671 } else {
5672 $self->{set_nc}->($self);
5673 }
5674
5675 return ($self->{ct}); # pi
5676 redo A;
5677 } elsif ($self->{nc} == 0x003F) { # ?
5678 $self->{ct}->{data} .= '?';
5679 ## Stay in the state.
5680
5681 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5682 $self->{line_prev} = $self->{line};
5683 $self->{column_prev} = $self->{column};
5684 $self->{column}++;
5685 $self->{nc}
5686 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5687 } else {
5688 $self->{set_nc}->($self);
5689 }
5690
5691 redo A;
5692 } else {
5693 $self->{ct}->{data} .= '?'; ## XML5: not appended
5694 $self->{state} = PI_DATA_STATE;
5695 ## Reprocess.
5696 redo A;
5697 }
5698
5699 } elsif ($self->{state} == DOCTYPE_INTERNAL_SUBSET_STATE) {
5700 if ($self->{nc} == 0x003C) { # <
5701 $self->{state} = DOCTYPE_TAG_STATE;
5702
5703 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5704 $self->{line_prev} = $self->{line};
5705 $self->{column_prev} = $self->{column};
5706 $self->{column}++;
5707 $self->{nc}
5708 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5709 } else {
5710 $self->{set_nc}->($self);
5711 }
5712
5713 redo A;
5714 } elsif ($self->{nc} == 0x0025) { # %
5715 ## XML5: Not defined yet.
5716
5717 ## TODO:
5718
5719 if (not $self->{stop_processing} and
5720 not $self->{document}->xml_standalone) {
5721 $self->{parse_error}->(level => $self->{level}->{must}, type => 'stop processing', ## TODO: type
5722 level => $self->{level}->{info});
5723 $self->{stop_processing} = 1;
5724 }
5725
5726
5727 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5728 $self->{line_prev} = $self->{line};
5729 $self->{column_prev} = $self->{column};
5730 $self->{column}++;
5731 $self->{nc}
5732 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5733 } else {
5734 $self->{set_nc}->($self);
5735 }
5736
5737 redo A;
5738 } elsif ($self->{nc} == 0x005D) { # ]
5739 delete $self->{in_subset};
5740 $self->{state} = DOCTYPE_INTERNAL_SUBSET_AFTER_STATE;
5741
5742 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5743 $self->{line_prev} = $self->{line};
5744 $self->{column_prev} = $self->{column};
5745 $self->{column}++;
5746 $self->{nc}
5747 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5748 } else {
5749 $self->{set_nc}->($self);
5750 }
5751
5752 redo A;
5753 } elsif ($is_space->{$self->{nc}}) {
5754 ## Stay in the state.
5755
5756 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5757 $self->{line_prev} = $self->{line};
5758 $self->{column_prev} = $self->{column};
5759 $self->{column}++;
5760 $self->{nc}
5761 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5762 } else {
5763 $self->{set_nc}->($self);
5764 }
5765
5766 redo A;
5767 } elsif ($self->{nc} == -1) {
5768 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed internal subset'); ## TODO: type
5769 delete $self->{in_subset};
5770 $self->{state} = DATA_STATE;
5771 $self->{s_kwd} = '';
5772 ## Reconsume.
5773 return ({type => END_OF_DOCTYPE_TOKEN});
5774 redo A;
5775 } else {
5776 unless ($self->{internal_subset_tainted}) {
5777 ## XML5: No parse error.
5778 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string in internal subset');
5779 $self->{internal_subset_tainted} = 1;
5780 }
5781 ## Stay in the state.
5782
5783 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5784 $self->{line_prev} = $self->{line};
5785 $self->{column_prev} = $self->{column};
5786 $self->{column}++;
5787 $self->{nc}
5788 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5789 } else {
5790 $self->{set_nc}->($self);
5791 }
5792
5793 redo A;
5794 }
5795 } elsif ($self->{state} == DOCTYPE_INTERNAL_SUBSET_AFTER_STATE) {
5796 if ($self->{nc} == 0x003E) { # >
5797 $self->{state} = DATA_STATE;
5798 $self->{s_kwd} = '';
5799
5800 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5801 $self->{line_prev} = $self->{line};
5802 $self->{column_prev} = $self->{column};
5803 $self->{column}++;
5804 $self->{nc}
5805 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5806 } else {
5807 $self->{set_nc}->($self);
5808 }
5809
5810 return ({type => END_OF_DOCTYPE_TOKEN});
5811 redo A;
5812 } elsif ($self->{nc} == -1) {
5813 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
5814 $self->{state} = DATA_STATE;
5815 $self->{s_kwd} = '';
5816 ## Reconsume.
5817 return ({type => END_OF_DOCTYPE_TOKEN});
5818 redo A;
5819 } else {
5820 ## XML5: No parse error and stay in the state.
5821 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after internal subset'); ## TODO: type
5822
5823 $self->{state} = BOGUS_DOCTYPE_INTERNAL_SUBSET_AFTER_STATE;
5824
5825 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5826 $self->{line_prev} = $self->{line};
5827 $self->{column_prev} = $self->{column};
5828 $self->{column}++;
5829 $self->{nc}
5830 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5831 } else {
5832 $self->{set_nc}->($self);
5833 }
5834
5835 redo A;
5836 }
5837 } elsif ($self->{state} == BOGUS_DOCTYPE_INTERNAL_SUBSET_AFTER_STATE) {
5838 if ($self->{nc} == 0x003E) { # >
5839 $self->{state} = DATA_STATE;
5840 $self->{s_kwd} = '';
5841
5842 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5843 $self->{line_prev} = $self->{line};
5844 $self->{column_prev} = $self->{column};
5845 $self->{column}++;
5846 $self->{nc}
5847 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5848 } else {
5849 $self->{set_nc}->($self);
5850 }
5851
5852 return ({type => END_OF_DOCTYPE_TOKEN});
5853 redo A;
5854 } elsif ($self->{nc} == -1) {
5855 $self->{state} = DATA_STATE;
5856 $self->{s_kwd} = '';
5857 ## Reconsume.
5858 return ({type => END_OF_DOCTYPE_TOKEN});
5859 redo A;
5860 } else {
5861 ## Stay in the state.
5862
5863 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5864 $self->{line_prev} = $self->{line};
5865 $self->{column_prev} = $self->{column};
5866 $self->{column}++;
5867 $self->{nc}
5868 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5869 } else {
5870 $self->{set_nc}->($self);
5871 }
5872
5873 redo A;
5874 }
5875 } elsif ($self->{state} == DOCTYPE_TAG_STATE) {
5876 if ($self->{nc} == 0x0021) { # !
5877 $self->{state} = DOCTYPE_MARKUP_DECLARATION_OPEN_STATE;
5878
5879 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5880 $self->{line_prev} = $self->{line};
5881 $self->{column_prev} = $self->{column};
5882 $self->{column}++;
5883 $self->{nc}
5884 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5885 } else {
5886 $self->{set_nc}->($self);
5887 }
5888
5889 redo A;
5890 } elsif ($self->{nc} == 0x003F) { # ?
5891 $self->{state} = PI_STATE;
5892
5893 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5894 $self->{line_prev} = $self->{line};
5895 $self->{column_prev} = $self->{column};
5896 $self->{column}++;
5897 $self->{nc}
5898 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5899 } else {
5900 $self->{set_nc}->($self);
5901 }
5902
5903 redo A;
5904 } elsif ($self->{nc} == -1) {
5905 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare stago');
5906 $self->{state} = DATA_STATE;
5907 $self->{s_kwd} = '';
5908 ## Reconsume.
5909 redo A;
5910 } else {
5911 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare stago', ## XML5: Not a parse error.
5912 line => $self->{line_prev},
5913 column => $self->{column_prev});
5914 $self->{state} = BOGUS_COMMENT_STATE;
5915 $self->{ct} = {type => COMMENT_TOKEN,
5916 data => '',
5917 }; ## NOTE: Will be discarded.
5918
5919 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5920 $self->{line_prev} = $self->{line};
5921 $self->{column_prev} = $self->{column};
5922 $self->{column}++;
5923 $self->{nc}
5924 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5925 } else {
5926 $self->{set_nc}->($self);
5927 }
5928
5929 redo A;
5930 }
5931 } elsif ($self->{state} == DOCTYPE_MARKUP_DECLARATION_OPEN_STATE) {
5932 ## XML5: "DOCTYPE markup declaration state".
5933
5934 if ($self->{nc} == 0x002D) { # -
5935 $self->{state} = MD_HYPHEN_STATE;
5936
5937 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5938 $self->{line_prev} = $self->{line};
5939 $self->{column_prev} = $self->{column};
5940 $self->{column}++;
5941 $self->{nc}
5942 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5943 } else {
5944 $self->{set_nc}->($self);
5945 }
5946
5947 redo A;
5948 } elsif ($self->{nc} == 0x0045 or # E
5949 $self->{nc} == 0x0065) { # e
5950 $self->{state} = MD_E_STATE;
5951 $self->{kwd} = chr $self->{nc};
5952
5953 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5954 $self->{line_prev} = $self->{line};
5955 $self->{column_prev} = $self->{column};
5956 $self->{column}++;
5957 $self->{nc}
5958 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5959 } else {
5960 $self->{set_nc}->($self);
5961 }
5962
5963 redo A;
5964 } elsif ($self->{nc} == 0x0041 or # A
5965 $self->{nc} == 0x0061) { # a
5966 $self->{state} = MD_ATTLIST_STATE;
5967 $self->{kwd} = chr $self->{nc};
5968
5969 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5970 $self->{line_prev} = $self->{line};
5971 $self->{column_prev} = $self->{column};
5972 $self->{column}++;
5973 $self->{nc}
5974 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5975 } else {
5976 $self->{set_nc}->($self);
5977 }
5978
5979 redo A;
5980 } elsif ($self->{nc} == 0x004E or # N
5981 $self->{nc} == 0x006E) { # n
5982 $self->{state} = MD_NOTATION_STATE;
5983 $self->{kwd} = chr $self->{nc};
5984
5985 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5986 $self->{line_prev} = $self->{line};
5987 $self->{column_prev} = $self->{column};
5988 $self->{column}++;
5989 $self->{nc}
5990 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5991 } else {
5992 $self->{set_nc}->($self);
5993 }
5994
5995 redo A;
5996 } else {
5997 #
5998 }
5999
6000 ## XML5: No parse error.
6001 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
6002 line => $self->{line_prev},
6003 column => $self->{column_prev} - 1);
6004 ## Reconsume.
6005 $self->{state} = BOGUS_COMMENT_STATE;
6006 $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded.
6007 redo A;
6008 } elsif ($self->{state} == MD_E_STATE) {
6009 if ($self->{nc} == 0x004E or # N
6010 $self->{nc} == 0x006E) { # n
6011 $self->{state} = MD_ENTITY_STATE;
6012 $self->{kwd} .= chr $self->{nc};
6013
6014 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6015 $self->{line_prev} = $self->{line};
6016 $self->{column_prev} = $self->{column};
6017 $self->{column}++;
6018 $self->{nc}
6019 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6020 } else {
6021 $self->{set_nc}->($self);
6022 }
6023
6024 redo A;
6025 } elsif ($self->{nc} == 0x004C or # L
6026 $self->{nc} == 0x006C) { # l
6027 ## XML5: <!ELEMENT> not supported.
6028 $self->{state} = MD_ELEMENT_STATE;
6029 $self->{kwd} .= chr $self->{nc};
6030
6031 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6032 $self->{line_prev} = $self->{line};
6033 $self->{column_prev} = $self->{column};
6034 $self->{column}++;
6035 $self->{nc}
6036 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6037 } else {
6038 $self->{set_nc}->($self);
6039 }
6040
6041 redo A;
6042 } else {
6043 ## XML5: No parse error.
6044 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
6045 line => $self->{line_prev},
6046 column => $self->{column_prev} - 2
6047 + 1 * ($self->{nc} == -1));
6048 ## Reconsume.
6049 $self->{state} = BOGUS_COMMENT_STATE;
6050 $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
6051 redo A;
6052 }
6053 } elsif ($self->{state} == MD_ENTITY_STATE) {
6054 if ($self->{nc} == [
6055 undef,
6056 undef,
6057 0x0054, # T
6058 0x0049, # I
6059 0x0054, # T
6060 ]->[length $self->{kwd}] or
6061 $self->{nc} == [
6062 undef,
6063 undef,
6064 0x0074, # t
6065 0x0069, # i
6066 0x0074, # t
6067 ]->[length $self->{kwd}]) {
6068 ## Stay in the state.
6069 $self->{kwd} .= chr $self->{nc};
6070
6071 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6072 $self->{line_prev} = $self->{line};
6073 $self->{column_prev} = $self->{column};
6074 $self->{column}++;
6075 $self->{nc}
6076 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6077 } else {
6078 $self->{set_nc}->($self);
6079 }
6080
6081 redo A;
6082 } elsif ((length $self->{kwd}) == 5 and
6083 ($self->{nc} == 0x0059 or # Y
6084 $self->{nc} == 0x0079)) { # y
6085 if ($self->{kwd} ne 'ENTIT' or $self->{nc} == 0x0079) {
6086 $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
6087 text => 'ENTITY',
6088 line => $self->{line_prev},
6089 column => $self->{column_prev} - 4);
6090 }
6091 $self->{ct} = {type => GENERAL_ENTITY_TOKEN, name => '',
6092 line => $self->{line_prev},
6093 column => $self->{column_prev} - 6};
6094 $self->{state} = DOCTYPE_MD_STATE;
6095
6096 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6097 $self->{line_prev} = $self->{line};
6098 $self->{column_prev} = $self->{column};
6099 $self->{column}++;
6100 $self->{nc}
6101 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6102 } else {
6103 $self->{set_nc}->($self);
6104 }
6105
6106 redo A;
6107 } else {
6108 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
6109 line => $self->{line_prev},
6110 column => $self->{column_prev} - 1
6111 - (length $self->{kwd})
6112 + 1 * ($self->{nc} == -1));
6113 $self->{state} = BOGUS_COMMENT_STATE;
6114 ## Reconsume.
6115 $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
6116 redo A;
6117 }
6118 } elsif ($self->{state} == MD_ELEMENT_STATE) {
6119 if ($self->{nc} == [
6120 undef,
6121 undef,
6122 0x0045, # E
6123 0x004D, # M
6124 0x0045, # E
6125 0x004E, # N
6126 ]->[length $self->{kwd}] or
6127 $self->{nc} == [
6128 undef,
6129 undef,
6130 0x0065, # e
6131 0x006D, # m
6132 0x0065, # e
6133 0x006E, # n
6134 ]->[length $self->{kwd}]) {
6135 ## Stay in the state.
6136 $self->{kwd} .= chr $self->{nc};
6137
6138 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6139 $self->{line_prev} = $self->{line};
6140 $self->{column_prev} = $self->{column};
6141 $self->{column}++;
6142 $self->{nc}
6143 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6144 } else {
6145 $self->{set_nc}->($self);
6146 }
6147
6148 redo A;
6149 } elsif ((length $self->{kwd}) == 6 and
6150 ($self->{nc} == 0x0054 or # T
6151 $self->{nc} == 0x0074)) { # t
6152 if ($self->{kwd} ne 'ELEMEN' or $self->{nc} == 0x0074) {
6153 $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
6154 text => 'ELEMENT',
6155 line => $self->{line_prev},
6156 column => $self->{column_prev} - 5);
6157 }
6158 $self->{ct} = {type => ELEMENT_TOKEN, name => '',
6159 line => $self->{line_prev},
6160 column => $self->{column_prev} - 7};
6161 $self->{state} = DOCTYPE_MD_STATE;
6162
6163 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6164 $self->{line_prev} = $self->{line};
6165 $self->{column_prev} = $self->{column};
6166 $self->{column}++;
6167 $self->{nc}
6168 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6169 } else {
6170 $self->{set_nc}->($self);
6171 }
6172
6173 redo A;
6174 } else {
6175 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
6176 line => $self->{line_prev},
6177 column => $self->{column_prev} - 1
6178 - (length $self->{kwd})
6179 + 1 * ($self->{nc} == -1));
6180 $self->{state} = BOGUS_COMMENT_STATE;
6181 ## Reconsume.
6182 $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
6183 redo A;
6184 }
6185 } elsif ($self->{state} == MD_ATTLIST_STATE) {
6186 if ($self->{nc} == [
6187 undef,
6188 0x0054, # T
6189 0x0054, # T
6190 0x004C, # L
6191 0x0049, # I
6192 0x0053, # S
6193 ]->[length $self->{kwd}] or
6194 $self->{nc} == [
6195 undef,
6196 0x0074, # t
6197 0x0074, # t
6198 0x006C, # l
6199 0x0069, # i
6200 0x0073, # s
6201 ]->[length $self->{kwd}]) {
6202 ## Stay in the state.
6203 $self->{kwd} .= chr $self->{nc};
6204
6205 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6206 $self->{line_prev} = $self->{line};
6207 $self->{column_prev} = $self->{column};
6208 $self->{column}++;
6209 $self->{nc}
6210 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6211 } else {
6212 $self->{set_nc}->($self);
6213 }
6214
6215 redo A;
6216 } elsif ((length $self->{kwd}) == 6 and
6217 ($self->{nc} == 0x0054 or # T
6218 $self->{nc} == 0x0074)) { # t
6219 if ($self->{kwd} ne 'ATTLIS' or $self->{nc} == 0x0074) {
6220 $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
6221 text => 'ATTLIST',
6222 line => $self->{line_prev},
6223 column => $self->{column_prev} - 5);
6224 }
6225 $self->{ct} = {type => ATTLIST_TOKEN, name => '',
6226 attrdefs => [],
6227 line => $self->{line_prev},
6228 column => $self->{column_prev} - 7};
6229 $self->{state} = DOCTYPE_MD_STATE;
6230
6231 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6232 $self->{line_prev} = $self->{line};
6233 $self->{column_prev} = $self->{column};
6234 $self->{column}++;
6235 $self->{nc}
6236 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6237 } else {
6238 $self->{set_nc}->($self);
6239 }
6240
6241 redo A;
6242 } else {
6243 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
6244 line => $self->{line_prev},
6245 column => $self->{column_prev} - 1
6246 - (length $self->{kwd})
6247 + 1 * ($self->{nc} == -1));
6248 $self->{state} = BOGUS_COMMENT_STATE;
6249 ## Reconsume.
6250 $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
6251 redo A;
6252 }
6253 } elsif ($self->{state} == MD_NOTATION_STATE) {
6254 if ($self->{nc} == [
6255 undef,
6256 0x004F, # O
6257 0x0054, # T
6258 0x0041, # A
6259 0x0054, # T
6260 0x0049, # I
6261 0x004F, # O
6262 ]->[length $self->{kwd}] or
6263 $self->{nc} == [
6264 undef,
6265 0x006F, # o
6266 0x0074, # t
6267 0x0061, # a
6268 0x0074, # t
6269 0x0069, # i
6270 0x006F, # o
6271 ]->[length $self->{kwd}]) {
6272 ## Stay in the state.
6273 $self->{kwd} .= chr $self->{nc};
6274
6275 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6276 $self->{line_prev} = $self->{line};
6277 $self->{column_prev} = $self->{column};
6278 $self->{column}++;
6279 $self->{nc}
6280 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6281 } else {
6282 $self->{set_nc}->($self);
6283 }
6284
6285 redo A;
6286 } elsif ((length $self->{kwd}) == 7 and
6287 ($self->{nc} == 0x004E or # N
6288 $self->{nc} == 0x006E)) { # n
6289 if ($self->{kwd} ne 'NOTATIO' or $self->{nc} == 0x006E) {
6290 $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
6291 text => 'NOTATION',
6292 line => $self->{line_prev},
6293 column => $self->{column_prev} - 6);
6294 }
6295 $self->{ct} = {type => NOTATION_TOKEN, name => '',
6296 line => $self->{line_prev},
6297 column => $self->{column_prev} - 8};
6298 $self->{state} = DOCTYPE_MD_STATE;
6299
6300 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6301 $self->{line_prev} = $self->{line};
6302 $self->{column_prev} = $self->{column};
6303 $self->{column}++;
6304 $self->{nc}
6305 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6306 } else {
6307 $self->{set_nc}->($self);
6308 }
6309
6310 redo A;
6311 } else {
6312 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
6313 line => $self->{line_prev},
6314 column => $self->{column_prev} - 1
6315 - (length $self->{kwd})
6316 + 1 * ($self->{nc} == -1));
6317 $self->{state} = BOGUS_COMMENT_STATE;
6318 ## Reconsume.
6319 $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
6320 redo A;
6321 }
6322 } elsif ($self->{state} == DOCTYPE_MD_STATE) {
6323 ## XML5: "DOCTYPE ENTITY state", "DOCTYPE ATTLIST state", and
6324 ## "DOCTYPE NOTATION state".
6325
6326 if ($is_space->{$self->{nc}}) {
6327 ## XML5: [NOTATION] Switch to the "DOCTYPE NOTATION identifier state".
6328 $self->{state} = BEFORE_MD_NAME_STATE;
6329
6330 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6331 $self->{line_prev} = $self->{line};
6332 $self->{column_prev} = $self->{column};
6333 $self->{column}++;
6334 $self->{nc}
6335 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6336 } else {
6337 $self->{set_nc}->($self);
6338 }
6339
6340 redo A;
6341 } elsif ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN and
6342 $self->{nc} == 0x0025) { # %
6343 ## XML5: Switch to the "DOCTYPE bogus comment state".
6344 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before md name'); ## TODO: type
6345 $self->{state} = DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE;
6346
6347 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6348 $self->{line_prev} = $self->{line};
6349 $self->{column_prev} = $self->{column};
6350 $self->{column}++;
6351 $self->{nc}
6352 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6353 } else {
6354 $self->{set_nc}->($self);
6355 }
6356
6357 redo A;
6358 } elsif ($self->{nc} == -1) {
6359 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6360 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6361 ## Reconsume.
6362 redo A;
6363 } elsif ($self->{nc} == 0x003E) { # >
6364 ## XML5: Switch to the "DOCTYPE bogus comment state".
6365 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no md name'); ## TODO: type
6366 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6367
6368 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6369 $self->{line_prev} = $self->{line};
6370 $self->{column_prev} = $self->{column};
6371 $self->{column}++;
6372 $self->{nc}
6373 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6374 } else {
6375 $self->{set_nc}->($self);
6376 }
6377
6378 redo A;
6379 } else {
6380 ## XML5: Switch to the "DOCTYPE bogus comment state".
6381 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before md name'); ## TODO: type
6382 $self->{state} = BEFORE_MD_NAME_STATE;
6383 redo A;
6384 }
6385 } elsif ($self->{state} == BEFORE_MD_NAME_STATE) {
6386 ## XML5: "DOCTYPE ENTITY parameter state", "DOCTYPE ENTITY type
6387 ## before state", "DOCTYPE ATTLIST name before state".
6388
6389 if ($is_space->{$self->{nc}}) {
6390 ## Stay in the state.
6391
6392 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6393 $self->{line_prev} = $self->{line};
6394 $self->{column_prev} = $self->{column};
6395 $self->{column}++;
6396 $self->{nc}
6397 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6398 } else {
6399 $self->{set_nc}->($self);
6400 }
6401
6402 redo A;
6403 } elsif ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN and
6404 $self->{nc} == 0x0025) { # %
6405 $self->{state} = DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE;
6406
6407 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6408 $self->{line_prev} = $self->{line};
6409 $self->{column_prev} = $self->{column};
6410 $self->{column}++;
6411 $self->{nc}
6412 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6413 } else {
6414 $self->{set_nc}->($self);
6415 }
6416
6417 redo A;
6418 } elsif ($self->{nc} == 0x003E) { # >
6419 ## XML5: Same as "Anything else".
6420 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no md name'); ## TODO: type
6421 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6422
6423 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6424 $self->{line_prev} = $self->{line};
6425 $self->{column_prev} = $self->{column};
6426 $self->{column}++;
6427 $self->{nc}
6428 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6429 } else {
6430 $self->{set_nc}->($self);
6431 }
6432
6433 redo A;
6434 } elsif ($self->{nc} == -1) {
6435 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6436 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6437 ## Reconsume.
6438 redo A;
6439 } else {
6440 ## XML5: [ATTLIST] Not defined yet.
6441 $self->{ct}->{name} .= chr $self->{nc};
6442 $self->{state} = MD_NAME_STATE;
6443
6444 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6445 $self->{line_prev} = $self->{line};
6446 $self->{column_prev} = $self->{column};
6447 $self->{column}++;
6448 $self->{nc}
6449 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6450 } else {
6451 $self->{set_nc}->($self);
6452 }
6453
6454 redo A;
6455 }
6456 } elsif ($self->{state} == DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE) {
6457 if ($is_space->{$self->{nc}}) {
6458 ## XML5: Switch to the "DOCTYPE ENTITY parameter state".
6459 $self->{ct}->{type} = PARAMETER_ENTITY_TOKEN;
6460 $self->{state} = BEFORE_MD_NAME_STATE;
6461
6462 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6463 $self->{line_prev} = $self->{line};
6464 $self->{column_prev} = $self->{column};
6465 $self->{column}++;
6466 $self->{nc}
6467 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6468 } else {
6469 $self->{set_nc}->($self);
6470 }
6471
6472 redo A;
6473 } elsif ($self->{nc} == 0x003E) { # >
6474 ## XML5: Same as "Anything else".
6475 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no md name'); ## TODO: type
6476 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6477
6478 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6479 $self->{line_prev} = $self->{line};
6480 $self->{column_prev} = $self->{column};
6481 $self->{column}++;
6482 $self->{nc}
6483 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6484 } else {
6485 $self->{set_nc}->($self);
6486 }
6487
6488 redo A;
6489 } elsif ($self->{nc} == -1) {
6490 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md');
6491 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6492 ## Reconsume.
6493 redo A;
6494 } else {
6495 ## XML5: No parse error.
6496 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space after ENTITY percent'); ## TODO: type
6497 $self->{state} = BOGUS_COMMENT_STATE;
6498 $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
6499 ## Reconsume.
6500 redo A;
6501 }
6502 } elsif ($self->{state} == MD_NAME_STATE) {
6503 ## XML5: "DOCTYPE ENTITY name state" and "DOCTYPE ATTLIST name state".
6504
6505 if ($is_space->{$self->{nc}}) {
6506 if ($self->{ct}->{type} == ATTLIST_TOKEN) {
6507 $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
6508 } elsif ($self->{ct}->{type} == ELEMENT_TOKEN) {
6509 $self->{state} = AFTER_ELEMENT_NAME_STATE;
6510 } else { # ENTITY/NOTATION
6511 $self->{state} = AFTER_DOCTYPE_NAME_STATE;
6512 }
6513
6514 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6515 $self->{line_prev} = $self->{line};
6516 $self->{column_prev} = $self->{column};
6517 $self->{column}++;
6518 $self->{nc}
6519 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6520 } else {
6521 $self->{set_nc}->($self);
6522 }
6523
6524 redo A;
6525 } elsif ($self->{nc} == 0x003E) { # >
6526 if ($self->{ct}->{type} == ATTLIST_TOKEN) {
6527 #
6528 } else {
6529 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no md def'); ## TODO: type
6530 }
6531 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6532
6533 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6534 $self->{line_prev} = $self->{line};
6535 $self->{column_prev} = $self->{column};
6536 $self->{column}++;
6537 $self->{nc}
6538 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6539 } else {
6540 $self->{set_nc}->($self);
6541 }
6542
6543 return ($self->{ct}); # ELEMENT/ENTITY/ATTLIST/NOTATION
6544 redo A;
6545 } elsif ($self->{nc} == -1) {
6546 ## XML5: [ATTLIST] No parse error.
6547 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md');
6548 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6549 ## Reconsume.
6550 return ($self->{ct}); # ELEMENT/ENTITY/ATTLIST/NOTATION
6551 redo A;
6552 } else {
6553 ## XML5: [ATTLIST] Not defined yet.
6554 $self->{ct}->{name} .= chr $self->{nc};
6555 ## Stay in the state.
6556
6557 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6558 $self->{line_prev} = $self->{line};
6559 $self->{column_prev} = $self->{column};
6560 $self->{column}++;
6561 $self->{nc}
6562 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6563 } else {
6564 $self->{set_nc}->($self);
6565 }
6566
6567 redo A;
6568 }
6569 } elsif ($self->{state} == DOCTYPE_ATTLIST_NAME_AFTER_STATE) {
6570 if ($is_space->{$self->{nc}}) {
6571 ## Stay in the state.
6572
6573 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6574 $self->{line_prev} = $self->{line};
6575 $self->{column_prev} = $self->{column};
6576 $self->{column}++;
6577 $self->{nc}
6578 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6579 } else {
6580 $self->{set_nc}->($self);
6581 }
6582
6583 redo A;
6584 } elsif ($self->{nc} == 0x003E) { # >
6585 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6586
6587 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6588 $self->{line_prev} = $self->{line};
6589 $self->{column_prev} = $self->{column};
6590 $self->{column}++;
6591 $self->{nc}
6592 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6593 } else {
6594 $self->{set_nc}->($self);
6595 }
6596
6597 return ($self->{ct}); # ATTLIST
6598 redo A;
6599 } elsif ($self->{nc} == -1) {
6600 ## XML5: No parse error.
6601 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6602 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6603 return ($self->{ct});
6604 redo A;
6605 } else {
6606 ## XML5: Not defined yet.
6607 $self->{ca} = {name => chr ($self->{nc}), # attrdef
6608 tokens => [],
6609 line => $self->{line}, column => $self->{column}};
6610 $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_NAME_STATE;
6611
6612 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6613 $self->{line_prev} = $self->{line};
6614 $self->{column_prev} = $self->{column};
6615 $self->{column}++;
6616 $self->{nc}
6617 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6618 } else {
6619 $self->{set_nc}->($self);
6620 }
6621
6622 redo A;
6623 }
6624 } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_NAME_STATE) {
6625 if ($is_space->{$self->{nc}}) {
6626 $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_NAME_AFTER_STATE;
6627
6628 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6629 $self->{line_prev} = $self->{line};
6630 $self->{column_prev} = $self->{column};
6631 $self->{column}++;
6632 $self->{nc}
6633 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6634 } else {
6635 $self->{set_nc}->($self);
6636 }
6637
6638 redo A;
6639 } elsif ($self->{nc} == 0x003E) { # >
6640 ## XML5: Same as "anything else".
6641 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr type'); ## TODO: type
6642 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6643
6644 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6645 $self->{line_prev} = $self->{line};
6646 $self->{column_prev} = $self->{column};
6647 $self->{column}++;
6648 $self->{nc}
6649 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6650 } else {
6651 $self->{set_nc}->($self);
6652 }
6653
6654 return ($self->{ct}); # ATTLIST
6655 redo A;
6656 } elsif ($self->{nc} == 0x0028) { # (
6657 ## XML5: Same as "anything else".
6658 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before paren'); ## TODO: type
6659 $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
6660
6661 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6662 $self->{line_prev} = $self->{line};
6663 $self->{column_prev} = $self->{column};
6664 $self->{column}++;
6665 $self->{nc}
6666 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6667 } else {
6668 $self->{set_nc}->($self);
6669 }
6670
6671 redo A;
6672 } elsif ($self->{nc} == -1) {
6673 ## XML5: No parse error.
6674 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6675 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6676
6677 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6678 $self->{line_prev} = $self->{line};
6679 $self->{column_prev} = $self->{column};
6680 $self->{column}++;
6681 $self->{nc}
6682 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6683 } else {
6684 $self->{set_nc}->($self);
6685 }
6686
6687 return ($self->{ct}); # ATTLIST
6688 redo A;
6689 } else {
6690 ## XML5: Not defined yet.
6691 $self->{ca}->{name} .= chr $self->{nc};
6692 ## Stay in the state.
6693
6694 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6695 $self->{line_prev} = $self->{line};
6696 $self->{column_prev} = $self->{column};
6697 $self->{column}++;
6698 $self->{nc}
6699 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6700 } else {
6701 $self->{set_nc}->($self);
6702 }
6703
6704 redo A;
6705 }
6706 } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_NAME_AFTER_STATE) {
6707 if ($is_space->{$self->{nc}}) {
6708 ## Stay in the state.
6709
6710 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6711 $self->{line_prev} = $self->{line};
6712 $self->{column_prev} = $self->{column};
6713 $self->{column}++;
6714 $self->{nc}
6715 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6716 } else {
6717 $self->{set_nc}->($self);
6718 }
6719
6720 redo A;
6721 } elsif ($self->{nc} == 0x003E) { # >
6722 ## XML5: Same as "anything else".
6723 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr type'); ## TODO: type
6724 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6725
6726 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6727 $self->{line_prev} = $self->{line};
6728 $self->{column_prev} = $self->{column};
6729 $self->{column}++;
6730 $self->{nc}
6731 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6732 } else {
6733 $self->{set_nc}->($self);
6734 }
6735
6736 return ($self->{ct}); # ATTLIST
6737 redo A;
6738 } elsif ($self->{nc} == 0x0028) { # (
6739 ## XML5: Same as "anything else".
6740 $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
6741
6742 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6743 $self->{line_prev} = $self->{line};
6744 $self->{column_prev} = $self->{column};
6745 $self->{column}++;
6746 $self->{nc}
6747 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6748 } else {
6749 $self->{set_nc}->($self);
6750 }
6751
6752 redo A;
6753 } elsif ($self->{nc} == -1) {
6754 ## XML5: No parse error.
6755 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6756 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6757
6758 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6759 $self->{line_prev} = $self->{line};
6760 $self->{column_prev} = $self->{column};
6761 $self->{column}++;
6762 $self->{nc}
6763 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6764 } else {
6765 $self->{set_nc}->($self);
6766 }
6767
6768 return ($self->{ct});
6769 redo A;
6770 } else {
6771 ## XML5: Not defined yet.
6772 $self->{ca}->{type} = chr $self->{nc};
6773 $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_STATE;
6774
6775 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6776 $self->{line_prev} = $self->{line};
6777 $self->{column_prev} = $self->{column};
6778 $self->{column}++;
6779 $self->{nc}
6780 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6781 } else {
6782 $self->{set_nc}->($self);
6783 }
6784
6785 redo A;
6786 }
6787 } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_STATE) {
6788 if ($is_space->{$self->{nc}}) {
6789 $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_AFTER_STATE;
6790
6791 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6792 $self->{line_prev} = $self->{line};
6793 $self->{column_prev} = $self->{column};
6794 $self->{column}++;
6795 $self->{nc}
6796 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6797 } else {
6798 $self->{set_nc}->($self);
6799 }
6800
6801 redo A;
6802 } elsif ($self->{nc} == 0x0023) { # #
6803 ## XML5: Same as "anything else".
6804 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
6805 $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE;
6806
6807 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6808 $self->{line_prev} = $self->{line};
6809 $self->{column_prev} = $self->{column};
6810 $self->{column}++;
6811 $self->{nc}
6812 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6813 } else {
6814 $self->{set_nc}->($self);
6815 }
6816
6817 redo A;
6818 } elsif ($self->{nc} == 0x0022) { # "
6819 ## XML5: Same as "anything else".
6820 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
6821 $self->{ca}->{value} = '';
6822 $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
6823
6824 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6825 $self->{line_prev} = $self->{line};
6826 $self->{column_prev} = $self->{column};
6827 $self->{column}++;
6828 $self->{nc}
6829 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6830 } else {
6831 $self->{set_nc}->($self);
6832 }
6833
6834 redo A;
6835 } elsif ($self->{nc} == 0x0027) { # '
6836 ## XML5: Same as "anything else".
6837 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
6838 $self->{ca}->{value} = '';
6839 $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
6840
6841 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6842 $self->{line_prev} = $self->{line};
6843 $self->{column_prev} = $self->{column};
6844 $self->{column}++;
6845 $self->{nc}
6846 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6847 } else {
6848 $self->{set_nc}->($self);
6849 }
6850
6851 redo A;
6852 } elsif ($self->{nc} == 0x003E) { # >
6853 ## XML5: Same as "anything else".
6854 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr default'); ## TODO: type
6855 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6856
6857 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6858 $self->{line_prev} = $self->{line};
6859 $self->{column_prev} = $self->{column};
6860 $self->{column}++;
6861 $self->{nc}
6862 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6863 } else {
6864 $self->{set_nc}->($self);
6865 }
6866
6867 return ($self->{ct}); # ATTLIST
6868 redo A;
6869 } elsif ($self->{nc} == 0x0028) { # (
6870 ## XML5: Same as "anything else".
6871 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before paren'); ## TODO: type
6872 $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
6873
6874 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6875 $self->{line_prev} = $self->{line};
6876 $self->{column_prev} = $self->{column};
6877 $self->{column}++;
6878 $self->{nc}
6879 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6880 } else {
6881 $self->{set_nc}->($self);
6882 }
6883
6884 redo A;
6885 } elsif ($self->{nc} == -1) {
6886 ## XML5: No parse error.
6887 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6888 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6889
6890 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6891 $self->{line_prev} = $self->{line};
6892 $self->{column_prev} = $self->{column};
6893 $self->{column}++;
6894 $self->{nc}
6895 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6896 } else {
6897 $self->{set_nc}->($self);
6898 }
6899
6900 return ($self->{ct});
6901 redo A;
6902 } else {
6903 ## XML5: Not defined yet.
6904 $self->{ca}->{type} .= chr $self->{nc};
6905 ## Stay in the state.
6906
6907 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6908 $self->{line_prev} = $self->{line};
6909 $self->{column_prev} = $self->{column};
6910 $self->{column}++;
6911 $self->{nc}
6912 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6913 } else {
6914 $self->{set_nc}->($self);
6915 }
6916
6917 redo A;
6918 }
6919 } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_AFTER_STATE) {
6920 if ($is_space->{$self->{nc}}) {
6921 ## Stay in the state.
6922
6923 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6924 $self->{line_prev} = $self->{line};
6925 $self->{column_prev} = $self->{column};
6926 $self->{column}++;
6927 $self->{nc}
6928 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6929 } else {
6930 $self->{set_nc}->($self);
6931 }
6932
6933 redo A;
6934 } elsif ($self->{nc} == 0x0028) { # (
6935 ## XML5: Same as "anything else".
6936 $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
6937
6938 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6939 $self->{line_prev} = $self->{line};
6940 $self->{column_prev} = $self->{column};
6941 $self->{column}++;
6942 $self->{nc}
6943 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6944 } else {
6945 $self->{set_nc}->($self);
6946 }
6947
6948 redo A;
6949 } elsif ($self->{nc} == 0x0023) { # #
6950 $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE;
6951
6952 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6953 $self->{line_prev} = $self->{line};
6954 $self->{column_prev} = $self->{column};
6955 $self->{column}++;
6956 $self->{nc}
6957 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6958 } else {
6959 $self->{set_nc}->($self);
6960 }
6961
6962 redo A;
6963 } elsif ($self->{nc} == 0x0022) { # "
6964 ## XML5: Same as "anything else".
6965 $self->{ca}->{value} = '';
6966 $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
6967
6968 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6969 $self->{line_prev} = $self->{line};
6970 $self->{column_prev} = $self->{column};
6971 $self->{column}++;
6972 $self->{nc}
6973 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6974 } else {
6975 $self->{set_nc}->($self);
6976 }
6977
6978 redo A;
6979 } elsif ($self->{nc} == 0x0027) { # '
6980 ## XML5: Same as "anything else".
6981 $self->{ca}->{value} = '';
6982 $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
6983
6984 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6985 $self->{line_prev} = $self->{line};
6986 $self->{column_prev} = $self->{column};
6987 $self->{column}++;
6988 $self->{nc}
6989 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6990 } else {
6991 $self->{set_nc}->($self);
6992 }
6993
6994 redo A;
6995 } elsif ($self->{nc} == 0x003E) { # >
6996 ## XML5: Same as "anything else".
6997 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr default'); ## TODO: type
6998 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6999
7000 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7001 $self->{line_prev} = $self->{line};
7002 $self->{column_prev} = $self->{column};
7003 $self->{column}++;
7004 $self->{nc}
7005 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7006 } else {
7007 $self->{set_nc}->($self);
7008 }
7009
7010 return ($self->{ct}); # ATTLIST
7011 redo A;
7012 } elsif ($self->{nc} == -1) {
7013 ## XML5: No parse error.
7014 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7015 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
7016
7017 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7018 $self->{line_prev} = $self->{line};
7019 $self->{column_prev} = $self->{column};
7020 $self->{column}++;
7021 $self->{nc}
7022 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7023 } else {
7024 $self->{set_nc}->($self);
7025 }
7026
7027 return ($self->{ct});
7028 redo A;
7029 } else {
7030 ## XML5: Switch to the "DOCTYPE bogus comment state".
7031 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unquoted attr value'); ## TODO: type
7032 $self->{ca}->{value} = '';
7033 $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
7034 ## Reconsume.
7035 redo A;
7036 }
7037 } elsif ($self->{state} == BEFORE_ALLOWED_TOKEN_STATE) {
7038 if ($is_space->{$self->{nc}}) {
7039 ## Stay in the state.
7040
7041 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7042 $self->{line_prev} = $self->{line};
7043 $self->{column_prev} = $self->{column};
7044 $self->{column}++;
7045 $self->{nc}
7046 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7047 } else {
7048 $self->{set_nc}->($self);
7049 }
7050
7051 redo A;
7052 } elsif ($self->{nc} == 0x007C) { # |
7053 $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty allowed token'); ## TODO: type
7054 ## Stay in the state.
7055
7056 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7057 $self->{line_prev} = $self->{line};
7058 $self->{column_prev} = $self->{column};
7059 $self->{column}++;
7060 $self->{nc}
7061 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7062 } else {
7063 $self->{set_nc}->($self);
7064 }
7065
7066 redo A;
7067 } elsif ($self->{nc} == 0x0029) { # )
7068 $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty allowed token'); ## TODO: type
7069 $self->{state} = AFTER_ALLOWED_TOKENS_STATE;
7070
7071 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7072 $self->{line_prev} = $self->{line};
7073 $self->{column_prev} = $self->{column};
7074 $self->{column}++;
7075 $self->{nc}
7076 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7077 } else {
7078 $self->{set_nc}->($self);
7079 }
7080
7081 redo A;
7082 } elsif ($self->{nc} == 0x003E) { # >
7083 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed allowed tokens'); ## TODO: type
7084 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7085
7086 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7087 $self->{line_prev} = $self->{line};
7088 $self->{column_prev} = $self->{column};
7089 $self->{column}++;
7090 $self->{nc}
7091 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7092 } else {
7093 $self->{set_nc}->($self);
7094 }
7095
7096 return ($self->{ct}); # ATTLIST
7097 redo A;
7098 } elsif ($self->{nc} == -1) {
7099 ## XML5: No parse error.
7100 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7101 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
7102
7103 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7104 $self->{line_prev} = $self->{line};
7105 $self->{column_prev} = $self->{column};
7106 $self->{column}++;
7107 $self->{nc}
7108 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7109 } else {
7110 $self->{set_nc}->($self);
7111 }
7112
7113 return ($self->{ct});
7114 redo A;
7115 } else {
7116 push @{$self->{ca}->{tokens}}, chr $self->{nc};
7117 $self->{state} = ALLOWED_TOKEN_STATE;
7118
7119 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7120 $self->{line_prev} = $self->{line};
7121 $self->{column_prev} = $self->{column};
7122 $self->{column}++;
7123 $self->{nc}
7124 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7125 } else {
7126 $self->{set_nc}->($self);
7127 }
7128
7129 redo A;
7130 }
7131 } elsif ($self->{state} == ALLOWED_TOKEN_STATE) {
7132 if ($is_space->{$self->{nc}}) {
7133 $self->{state} = AFTER_ALLOWED_TOKEN_STATE;
7134
7135 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7136 $self->{line_prev} = $self->{line};
7137 $self->{column_prev} = $self->{column};
7138 $self->{column}++;
7139 $self->{nc}
7140 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7141 } else {
7142 $self->{set_nc}->($self);
7143 }
7144
7145 redo A;
7146 } elsif ($self->{nc} == 0x007C) { # |
7147 $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
7148
7149 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7150 $self->{line_prev} = $self->{line};
7151 $self->{column_prev} = $self->{column};
7152 $self->{column}++;
7153 $self->{nc}
7154 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7155 } else {
7156 $self->{set_nc}->($self);
7157 }
7158
7159 redo A;
7160 } elsif ($self->{nc} == 0x0029) { # )
7161 $self->{state} = AFTER_ALLOWED_TOKENS_STATE;
7162
7163 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7164 $self->{line_prev} = $self->{line};
7165 $self->{column_prev} = $self->{column};
7166 $self->{column}++;
7167 $self->{nc}
7168 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7169 } else {
7170 $self->{set_nc}->($self);
7171 }
7172
7173 redo A;
7174 } elsif ($self->{nc} == 0x003E) { # >
7175 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed allowed tokens'); ## TODO: type
7176 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7177
7178 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7179 $self->{line_prev} = $self->{line};
7180 $self->{column_prev} = $self->{column};
7181 $self->{column}++;
7182 $self->{nc}
7183 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7184 } else {
7185 $self->{set_nc}->($self);
7186 }
7187
7188 return ($self->{ct}); # ATTLIST
7189 redo A;
7190 } elsif ($self->{nc} == -1) {
7191 ## XML5: No parse error.
7192 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7193 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
7194
7195 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7196 $self->{line_prev} = $self->{line};
7197 $self->{column_prev} = $self->{column};
7198 $self->{column}++;
7199 $self->{nc}
7200 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7201 } else {
7202 $self->{set_nc}->($self);
7203 }
7204
7205 return ($self->{ct});
7206 redo A;
7207 } else {
7208 $self->{ca}->{tokens}->[-1] .= chr $self->{nc};
7209 ## Stay in the state.
7210
7211 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7212 $self->{line_prev} = $self->{line};
7213 $self->{column_prev} = $self->{column};
7214 $self->{column}++;
7215 $self->{nc}
7216 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7217 } else {
7218 $self->{set_nc}->($self);
7219 }
7220
7221 redo A;
7222 }
7223 } elsif ($self->{state} == AFTER_ALLOWED_TOKEN_STATE) {
7224 if ($is_space->{$self->{nc}}) {
7225 ## Stay in the state.
7226
7227 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7228 $self->{line_prev} = $self->{line};
7229 $self->{column_prev} = $self->{column};
7230 $self->{column}++;
7231 $self->{nc}
7232 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7233 } else {
7234 $self->{set_nc}->($self);
7235 }
7236
7237 redo A;
7238 } elsif ($self->{nc} == 0x007C) { # |
7239 $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
7240
7241 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7242 $self->{line_prev} = $self->{line};
7243 $self->{column_prev} = $self->{column};
7244 $self->{column}++;
7245 $self->{nc}
7246 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7247 } else {
7248 $self->{set_nc}->($self);
7249 }
7250
7251 redo A;
7252 } elsif ($self->{nc} == 0x0029) { # )
7253 $self->{state} = AFTER_ALLOWED_TOKENS_STATE;
7254
7255 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7256 $self->{line_prev} = $self->{line};
7257 $self->{column_prev} = $self->{column};
7258 $self->{column}++;
7259 $self->{nc}
7260 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7261 } else {
7262 $self->{set_nc}->($self);
7263 }
7264
7265 redo A;
7266 } elsif ($self->{nc} == 0x003E) { # >
7267 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed allowed tokens'); ## TODO: type
7268 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7269
7270 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7271 $self->{line_prev} = $self->{line};
7272 $self->{column_prev} = $self->{column};
7273 $self->{column}++;
7274 $self->{nc}
7275 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7276 } else {
7277 $self->{set_nc}->($self);
7278 }
7279
7280 return ($self->{ct}); # ATTLIST
7281 redo A;
7282 } elsif ($self->{nc} == -1) {
7283 ## XML5: No parse error.
7284 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7285 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
7286
7287 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7288 $self->{line_prev} = $self->{line};
7289 $self->{column_prev} = $self->{column};
7290 $self->{column}++;
7291 $self->{nc}
7292 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7293 } else {
7294 $self->{set_nc}->($self);
7295 }
7296
7297 return ($self->{ct});
7298 redo A;
7299 } else {
7300 $self->{parse_error}->(level => $self->{level}->{must}, type => 'space in allowed token', ## TODO: type
7301 line => $self->{line_prev},
7302 column => $self->{column_prev});
7303 $self->{ca}->{tokens}->[-1] .= ' ' . chr $self->{nc};
7304 $self->{state} = ALLOWED_TOKEN_STATE;
7305
7306 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7307 $self->{line_prev} = $self->{line};
7308 $self->{column_prev} = $self->{column};
7309 $self->{column}++;
7310 $self->{nc}
7311 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7312 } else {
7313 $self->{set_nc}->($self);
7314 }
7315
7316 redo A;
7317 }
7318 } elsif ($self->{state} == AFTER_ALLOWED_TOKENS_STATE) {
7319 if ($is_space->{$self->{nc}}) {
7320 $self->{state} = BEFORE_ATTR_DEFAULT_STATE;
7321
7322 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7323 $self->{line_prev} = $self->{line};
7324 $self->{column_prev} = $self->{column};
7325 $self->{column}++;
7326 $self->{nc}
7327 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7328 } else {
7329 $self->{set_nc}->($self);
7330 }
7331
7332 redo A;
7333 } elsif ($self->{nc} == 0x0023) { # #
7334 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
7335 $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE;
7336
7337 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7338 $self->{line_prev} = $self->{line};
7339 $self->{column_prev} = $self->{column};
7340 $self->{column}++;
7341 $self->{nc}
7342 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7343 } else {
7344 $self->{set_nc}->($self);
7345 }
7346
7347 redo A;
7348 } elsif ($self->{nc} == 0x0022) { # "
7349 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
7350 $self->{ca}->{value} = '';
7351 $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
7352
7353 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7354 $self->{line_prev} = $self->{line};
7355 $self->{column_prev} = $self->{column};
7356 $self->{column}++;
7357 $self->{nc}
7358 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7359 } else {
7360 $self->{set_nc}->($self);
7361 }
7362
7363 redo A;
7364 } elsif ($self->{nc} == 0x0027) { # '
7365 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
7366 $self->{ca}->{value} = '';
7367 $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
7368
7369 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7370 $self->{line_prev} = $self->{line};
7371 $self->{column_prev} = $self->{column};
7372 $self->{column}++;
7373 $self->{nc}
7374 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7375 } else {
7376 $self->{set_nc}->($self);
7377 }
7378
7379 redo A;
7380 } elsif ($self->{nc} == 0x003E) { # >
7381 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr default'); ## TODO: type
7382 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7383
7384 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7385 $self->{line_prev} = $self->{line};
7386 $self->{column_prev} = $self->{column};
7387 $self->{column}++;
7388 $self->{nc}
7389 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7390 } else {
7391 $self->{set_nc}->($self);
7392 }
7393
7394 return ($self->{ct}); # ATTLIST
7395 redo A;
7396 } elsif ($self->{nc} == -1) {
7397 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7398 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7399
7400 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7401 $self->{line_prev} = $self->{line};
7402 $self->{column_prev} = $self->{column};
7403 $self->{column}++;
7404 $self->{nc}
7405 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7406 } else {
7407 $self->{set_nc}->($self);
7408 }
7409
7410 return ($self->{ct});
7411 redo A;
7412 } else {
7413 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unquoted attr value'); ## TODO: type
7414 $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
7415 ## Reconsume.
7416 redo A;
7417 }
7418 } elsif ($self->{state} == BEFORE_ATTR_DEFAULT_STATE) {
7419 if ($is_space->{$self->{nc}}) {
7420 ## Stay in the state.
7421
7422 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7423 $self->{line_prev} = $self->{line};
7424 $self->{column_prev} = $self->{column};
7425 $self->{column}++;
7426 $self->{nc}
7427 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7428 } else {
7429 $self->{set_nc}->($self);
7430 }
7431
7432 redo A;
7433 } elsif ($self->{nc} == 0x0023) { # #
7434 $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE;
7435
7436 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7437 $self->{line_prev} = $self->{line};
7438 $self->{column_prev} = $self->{column};
7439 $self->{column}++;
7440 $self->{nc}
7441 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7442 } else {
7443 $self->{set_nc}->($self);
7444 }
7445
7446 redo A;
7447 } elsif ($self->{nc} == 0x0022) { # "
7448 $self->{ca}->{value} = '';
7449 $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
7450
7451 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7452 $self->{line_prev} = $self->{line};
7453 $self->{column_prev} = $self->{column};
7454 $self->{column}++;
7455 $self->{nc}
7456 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7457 } else {
7458 $self->{set_nc}->($self);
7459 }
7460
7461 redo A;
7462 } elsif ($self->{nc} == 0x0027) { # '
7463 $self->{ca}->{value} = '';
7464 $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
7465
7466 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7467 $self->{line_prev} = $self->{line};
7468 $self->{column_prev} = $self->{column};
7469 $self->{column}++;
7470 $self->{nc}
7471 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7472 } else {
7473 $self->{set_nc}->($self);
7474 }
7475
7476 redo A;
7477 } elsif ($self->{nc} == 0x003E) { # >
7478 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr default'); ## TODO: type
7479 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7480
7481 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7482 $self->{line_prev} = $self->{line};
7483 $self->{column_prev} = $self->{column};
7484 $self->{column}++;
7485 $self->{nc}
7486 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7487 } else {
7488 $self->{set_nc}->($self);
7489 }
7490
7491 return ($self->{ct}); # ATTLIST
7492 redo A;
7493 } elsif ($self->{nc} == -1) {
7494 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7495 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7496
7497 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7498 $self->{line_prev} = $self->{line};
7499 $self->{column_prev} = $self->{column};
7500 $self->{column}++;
7501 $self->{nc}
7502 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7503 } else {
7504 $self->{set_nc}->($self);
7505 }
7506
7507 return ($self->{ct});
7508 redo A;
7509 } else {
7510 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unquoted attr value'); ## TODO: type
7511 $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
7512 ## Reconsume.
7513 redo A;
7514 }
7515 } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE) {
7516 if ($is_space->{$self->{nc}}) {
7517 ## XML5: No parse error.
7518 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no default type'); ## TODO: type
7519 $self->{state} = BOGUS_MD_STATE;
7520 ## Reconsume.
7521 redo A;
7522 } elsif ($self->{nc} == 0x0022) { # "
7523 ## XML5: Same as "anything else".
7524 $self->{ca}->{value} = '';
7525 $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
7526
7527 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7528 $self->{line_prev} = $self->{line};
7529 $self->{column_prev} = $self->{column};
7530 $self->{column}++;
7531 $self->{nc}
7532 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7533 } else {
7534 $self->{set_nc}->($self);
7535 }
7536
7537 redo A;
7538 } elsif ($self->{nc} == 0x0027) { # '
7539 ## XML5: Same as "anything else".
7540 $self->{ca}->{value} = '';
7541 $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
7542
7543 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7544 $self->{line_prev} = $self->{line};
7545 $self->{column_prev} = $self->{column};
7546 $self->{column}++;
7547 $self->{nc}
7548 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7549 } else {
7550 $self->{set_nc}->($self);
7551 }
7552
7553 redo A;
7554 } elsif ($self->{nc} == 0x003E) { # >
7555 ## XML5: Same as "anything else".
7556 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr default'); ## TODO: type
7557 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7558
7559 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7560 $self->{line_prev} = $self->{line};
7561 $self->{column_prev} = $self->{column};
7562 $self->{column}++;
7563 $self->{nc}
7564 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7565 } else {
7566 $self->{set_nc}->($self);
7567 }
7568
7569 return ($self->{ct}); # ATTLIST
7570 redo A;
7571 } elsif ($self->{nc} == -1) {
7572 ## XML5: No parse error.
7573 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7574 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
7575
7576 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7577 $self->{line_prev} = $self->{line};
7578 $self->{column_prev} = $self->{column};
7579 $self->{column}++;
7580 $self->{nc}
7581 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7582 } else {
7583 $self->{set_nc}->($self);
7584 }
7585
7586 return ($self->{ct});
7587 redo A;
7588 } else {
7589 $self->{ca}->{default} = chr $self->{nc};
7590 $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_STATE;
7591
7592 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7593 $self->{line_prev} = $self->{line};
7594 $self->{column_prev} = $self->{column};
7595 $self->{column}++;
7596 $self->{nc}
7597 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7598 } else {
7599 $self->{set_nc}->($self);
7600 }
7601
7602 redo A;
7603 }
7604 } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_STATE) {
7605 if ($is_space->{$self->{nc}}) {
7606 $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_AFTER_STATE;
7607
7608 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7609 $self->{line_prev} = $self->{line};
7610 $self->{column_prev} = $self->{column};
7611 $self->{column}++;
7612 $self->{nc}
7613 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7614 } else {
7615 $self->{set_nc}->($self);
7616 }
7617
7618 redo A;
7619 } elsif ($self->{nc} == 0x0022) { # "
7620 ## XML5: Same as "anything else".
7621 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
7622 $self->{ca}->{value} = '';
7623 $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
7624
7625 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7626 $self->{line_prev} = $self->{line};
7627 $self->{column_prev} = $self->{column};
7628 $self->{column}++;
7629 $self->{nc}
7630 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7631 } else {
7632 $self->{set_nc}->($self);
7633 }
7634
7635 redo A;
7636 } elsif ($self->{nc} == 0x0027) { # '
7637 ## XML5: Same as "anything else".
7638 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
7639 $self->{ca}->{value} = '';
7640 $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
7641
7642 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7643 $self->{line_prev} = $self->{line};
7644 $self->{column_prev} = $self->{column};
7645 $self->{column}++;
7646 $self->{nc}
7647 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7648 } else {
7649 $self->{set_nc}->($self);
7650 }
7651
7652 redo A;
7653 } elsif ($self->{nc} == 0x003E) { # >
7654 ## XML5: Same as "anything else".
7655 push @{$self->{ct}->{attrdefs}}, $self->{ca};
7656 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7657
7658 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7659 $self->{line_prev} = $self->{line};
7660 $self->{column_prev} = $self->{column};
7661 $self->{column}++;
7662 $self->{nc}
7663 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7664 } else {
7665 $self->{set_nc}->($self);
7666 }
7667
7668 return ($self->{ct}); # ATTLIST
7669 redo A;
7670 } elsif ($self->{nc} == -1) {
7671 ## XML5: No parse error.
7672 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7673 push @{$self->{ct}->{attrdefs}}, $self->{ca};
7674 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
7675
7676 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7677 $self->{line_prev} = $self->{line};
7678 $self->{column_prev} = $self->{column};
7679 $self->{column}++;
7680 $self->{nc}
7681 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7682 } else {
7683 $self->{set_nc}->($self);
7684 }
7685
7686 return ($self->{ct});
7687 redo A;
7688 } else {
7689 $self->{ca}->{default} .= chr $self->{nc};
7690 ## Stay in the state.
7691
7692 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7693 $self->{line_prev} = $self->{line};
7694 $self->{column_prev} = $self->{column};
7695 $self->{column}++;
7696 $self->{nc}
7697 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7698 } else {
7699 $self->{set_nc}->($self);
7700 }
7701
7702 redo A;
7703 }
7704 } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_AFTER_STATE) {
7705 if ($is_space->{$self->{nc}}) {
7706 ## Stay in the state.
7707
7708 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7709 $self->{line_prev} = $self->{line};
7710 $self->{column_prev} = $self->{column};
7711 $self->{column}++;
7712 $self->{nc}
7713 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7714 } else {
7715 $self->{set_nc}->($self);
7716 }
7717
7718 redo A;
7719 } elsif ($self->{nc} == 0x0022) { # "
7720 $self->{ca}->{value} = '';
7721 $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
7722
7723 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7724 $self->{line_prev} = $self->{line};
7725 $self->{column_prev} = $self->{column};
7726 $self->{column}++;
7727 $self->{nc}
7728 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7729 } else {
7730 $self->{set_nc}->($self);
7731 }
7732
7733 redo A;
7734 } elsif ($self->{nc} == 0x0027) { # '
7735 $self->{ca}->{value} = '';
7736 $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
7737
7738 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7739 $self->{line_prev} = $self->{line};
7740 $self->{column_prev} = $self->{column};
7741 $self->{column}++;
7742 $self->{nc}
7743 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7744 } else {
7745 $self->{set_nc}->($self);
7746 }
7747
7748 redo A;
7749 } elsif ($self->{nc} == 0x003E) { # >
7750 push @{$self->{ct}->{attrdefs}}, $self->{ca};
7751 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7752
7753 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7754 $self->{line_prev} = $self->{line};
7755 $self->{column_prev} = $self->{column};
7756 $self->{column}++;
7757 $self->{nc}
7758 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7759 } else {
7760 $self->{set_nc}->($self);
7761 }
7762
7763 return ($self->{ct}); # ATTLIST
7764 redo A;
7765 } elsif ($self->{nc} == -1) {
7766 ## XML5: No parse error.
7767 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7768 push @{$self->{ct}->{attrdefs}}, $self->{ca};
7769 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
7770
7771 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7772 $self->{line_prev} = $self->{line};
7773 $self->{column_prev} = $self->{column};
7774 $self->{column}++;
7775 $self->{nc}
7776 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7777 } else {
7778 $self->{set_nc}->($self);
7779 }
7780
7781 return ($self->{ct});
7782 redo A;
7783 } else {
7784 ## XML5: Not defined yet.
7785 if ($self->{ca}->{default} eq 'FIXED') {
7786 $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
7787 } else {
7788 push @{$self->{ct}->{attrdefs}}, $self->{ca};
7789 $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
7790 }
7791 ## Reconsume.
7792 redo A;
7793 }
7794 } elsif ($self->{state} == AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE) {
7795 if ($is_space->{$self->{nc}} or
7796 $self->{nc} == -1 or
7797 $self->{nc} == 0x003E) { # >
7798 $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
7799 ## Reconsume.
7800 redo A;
7801 } else {
7802 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before attr name'); ## TODO: type
7803 $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
7804 ## Reconsume.
7805 redo A;
7806 }
7807 } elsif ($self->{state} == NDATA_STATE) {
7808 ## ASCII case-insensitive
7809 if ($self->{nc} == [
7810 undef,
7811 0x0044, # D
7812 0x0041, # A
7813 0x0054, # T
7814 ]->[length $self->{kwd}] or
7815 $self->{nc} == [
7816 undef,
7817 0x0064, # d
7818 0x0061, # a
7819 0x0074, # t
7820 ]->[length $self->{kwd}]) {
7821
7822 ## Stay in the state.
7823 $self->{kwd} .= chr $self->{nc};
7824
7825 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7826 $self->{line_prev} = $self->{line};
7827 $self->{column_prev} = $self->{column};
7828 $self->{column}++;
7829 $self->{nc}
7830 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7831 } else {
7832 $self->{set_nc}->($self);
7833 }
7834
7835 redo A;
7836 } elsif ((length $self->{kwd}) == 4 and
7837 ($self->{nc} == 0x0041 or # A
7838 $self->{nc} == 0x0061)) { # a
7839 if ($self->{kwd} ne 'NDAT' or $self->{nc} == 0x0061) { # a
7840
7841 $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
7842 text => 'NDATA',
7843 line => $self->{line_prev},
7844 column => $self->{column_prev} - 4);
7845 } else {
7846
7847 }
7848 $self->{state} = AFTER_NDATA_STATE;
7849
7850 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7851 $self->{line_prev} = $self->{line};
7852 $self->{column_prev} = $self->{column};
7853 $self->{column}++;
7854 $self->{nc}
7855 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7856 } else {
7857 $self->{set_nc}->($self);
7858 }
7859
7860 redo A;
7861 } else {
7862 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after literal', ## TODO: type
7863 line => $self->{line_prev},
7864 column => $self->{column_prev} + 1
7865 - length $self->{kwd});
7866
7867 $self->{state} = BOGUS_MD_STATE;
7868 ## Reconsume.
7869 redo A;
7870 }
7871 } elsif ($self->{state} == AFTER_NDATA_STATE) {
7872 if ($is_space->{$self->{nc}}) {
7873 $self->{state} = BEFORE_NOTATION_NAME_STATE;
7874
7875 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7876 $self->{line_prev} = $self->{line};
7877 $self->{column_prev} = $self->{column};
7878 $self->{column}++;
7879 $self->{nc}
7880 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7881 } else {
7882 $self->{set_nc}->($self);
7883 }
7884
7885 redo A;
7886 } elsif ($self->{nc} == 0x003E) { # >
7887 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no notation name'); ## TODO: type
7888 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7889
7890 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7891 $self->{line_prev} = $self->{line};
7892 $self->{column_prev} = $self->{column};
7893 $self->{column}++;
7894 $self->{nc}
7895 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7896 } else {
7897 $self->{set_nc}->($self);
7898 }
7899
7900 return ($self->{ct}); # ENTITY
7901 redo A;
7902 } elsif ($self->{nc} == -1) {
7903 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7904 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7905
7906 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7907 $self->{line_prev} = $self->{line};
7908 $self->{column_prev} = $self->{column};
7909 $self->{column}++;
7910 $self->{nc}
7911 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7912 } else {
7913 $self->{set_nc}->($self);
7914 }
7915
7916 return ($self->{ct}); # ENTITY
7917 redo A;
7918 } else {
7919 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after literal', ## TODO: type
7920 line => $self->{line_prev},
7921 column => $self->{column_prev} + 1
7922 - length $self->{kwd});
7923 $self->{state} = BOGUS_MD_STATE;
7924 ## Reconsume.
7925 redo A;
7926 }
7927 } elsif ($self->{state} == BEFORE_NOTATION_NAME_STATE) {
7928 if ($is_space->{$self->{nc}}) {
7929 ## Stay in the state.
7930
7931 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7932 $self->{line_prev} = $self->{line};
7933 $self->{column_prev} = $self->{column};
7934 $self->{column}++;
7935 $self->{nc}
7936 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7937 } else {
7938 $self->{set_nc}->($self);
7939 }
7940
7941 redo A;
7942 } elsif ($self->{nc} == 0x003E) { # >
7943 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no notation name'); ## TODO: type
7944 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7945
7946 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7947 $self->{line_prev} = $self->{line};
7948 $self->{column_prev} = $self->{column};
7949 $self->{column}++;
7950 $self->{nc}
7951 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7952 } else {
7953 $self->{set_nc}->($self);
7954 }
7955
7956 return ($self->{ct}); # ENTITY
7957 redo A;
7958 } elsif ($self->{nc} == -1) {
7959 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7960 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7961
7962 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7963 $self->{line_prev} = $self->{line};
7964 $self->{column_prev} = $self->{column};
7965 $self->{column}++;
7966 $self->{nc}
7967 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7968 } else {
7969 $self->{set_nc}->($self);
7970 }
7971
7972 return ($self->{ct}); # ENTITY
7973 redo A;
7974 } else {
7975 $self->{ct}->{notation} = chr $self->{nc}; # ENTITY
7976 $self->{state} = NOTATION_NAME_STATE;
7977
7978 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7979 $self->{line_prev} = $self->{line};
7980 $self->{column_prev} = $self->{column};
7981 $self->{column}++;
7982 $self->{nc}
7983 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7984 } else {
7985 $self->{set_nc}->($self);
7986 }
7987
7988 redo A;
7989 }
7990 } elsif ($self->{state} == NOTATION_NAME_STATE) {
7991 if ($is_space->{$self->{nc}}) {
7992 $self->{state} = AFTER_MD_DEF_STATE;
7993
7994 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7995 $self->{line_prev} = $self->{line};
7996 $self->{column_prev} = $self->{column};
7997 $self->{column}++;
7998 $self->{nc}
7999 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8000 } else {
8001 $self->{set_nc}->($self);
8002 }
8003
8004 redo A;
8005 } elsif ($self->{nc} == 0x003E) { # >
8006 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8007
8008 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8009 $self->{line_prev} = $self->{line};
8010 $self->{column_prev} = $self->{column};
8011 $self->{column}++;
8012 $self->{nc}
8013 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8014 } else {
8015 $self->{set_nc}->($self);
8016 }
8017
8018 return ($self->{ct}); # ENTITY
8019 redo A;
8020 } elsif ($self->{nc} == -1) {
8021 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
8022 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8023
8024 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8025 $self->{line_prev} = $self->{line};
8026 $self->{column_prev} = $self->{column};
8027 $self->{column}++;
8028 $self->{nc}
8029 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8030 } else {
8031 $self->{set_nc}->($self);
8032 }
8033
8034 return ($self->{ct}); # ENTITY
8035 redo A;
8036 } else {
8037 $self->{ct}->{notation} .= chr $self->{nc}; # ENTITY
8038 ## Stay in the state.
8039
8040 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8041 $self->{line_prev} = $self->{line};
8042 $self->{column_prev} = $self->{column};
8043 $self->{column}++;
8044 $self->{nc}
8045 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8046 } else {
8047 $self->{set_nc}->($self);
8048 }
8049
8050 redo A;
8051 }
8052 } elsif ($self->{state} == DOCTYPE_ENTITY_VALUE_DOUBLE_QUOTED_STATE) {
8053 if ($self->{nc} == 0x0022) { # "
8054 $self->{state} = AFTER_MD_DEF_STATE;
8055
8056 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8057 $self->{line_prev} = $self->{line};
8058 $self->{column_prev} = $self->{column};
8059 $self->{column}++;
8060 $self->{nc}
8061 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8062 } else {
8063 $self->{set_nc}->($self);
8064 }
8065
8066 redo A;
8067 } elsif ($self->{nc} == 0x0026) { # &
8068 $self->{prev_state} = $self->{state};
8069 $self->{state} = ENTITY_VALUE_ENTITY_STATE;
8070 $self->{entity_add} = 0x0022; # "
8071
8072 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8073 $self->{line_prev} = $self->{line};
8074 $self->{column_prev} = $self->{column};
8075 $self->{column}++;
8076 $self->{nc}
8077 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8078 } else {
8079 $self->{set_nc}->($self);
8080 }
8081
8082 redo A;
8083 ## TODO: %
8084 } elsif ($self->{nc} == -1) {
8085 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed entity value'); ## TODO: type
8086 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8087 ## Reconsume.
8088 return ($self->{ct}); # ENTITY
8089 redo A;
8090 } else {
8091 $self->{ct}->{value} .= chr $self->{nc}; # ENTITY
8092
8093 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8094 $self->{line_prev} = $self->{line};
8095 $self->{column_prev} = $self->{column};
8096 $self->{column}++;
8097 $self->{nc}
8098 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8099 } else {
8100 $self->{set_nc}->($self);
8101 }
8102
8103 redo A;
8104 }
8105 } elsif ($self->{state} == DOCTYPE_ENTITY_VALUE_SINGLE_QUOTED_STATE) {
8106 if ($self->{nc} == 0x0027) { # '
8107 $self->{state} = AFTER_MD_DEF_STATE;
8108
8109 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8110 $self->{line_prev} = $self->{line};
8111 $self->{column_prev} = $self->{column};
8112 $self->{column}++;
8113 $self->{nc}
8114 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8115 } else {
8116 $self->{set_nc}->($self);
8117 }
8118
8119 redo A;
8120 } elsif ($self->{nc} == 0x0026) { # &
8121 $self->{prev_state} = $self->{state};
8122 $self->{state} = ENTITY_VALUE_ENTITY_STATE;
8123 $self->{entity_add} = 0x0027; # '
8124
8125 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8126 $self->{line_prev} = $self->{line};
8127 $self->{column_prev} = $self->{column};
8128 $self->{column}++;
8129 $self->{nc}
8130 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8131 } else {
8132 $self->{set_nc}->($self);
8133 }
8134
8135 redo A;
8136 ## TODO: %
8137 } elsif ($self->{nc} == -1) {
8138 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed entity value'); ## TODO: type
8139 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8140 ## Reconsume.
8141 return ($self->{ct}); # ENTITY
8142 redo A;
8143 } else {
8144 $self->{ct}->{value} .= chr $self->{nc}; # ENTITY
8145
8146 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8147 $self->{line_prev} = $self->{line};
8148 $self->{column_prev} = $self->{column};
8149 $self->{column}++;
8150 $self->{nc}
8151 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8152 } else {
8153 $self->{set_nc}->($self);
8154 }
8155
8156 redo A;
8157 }
8158 } elsif ($self->{state} == ENTITY_VALUE_ENTITY_STATE) {
8159 if ($is_space->{$self->{nc}} or
8160 {
8161 0x003C => 1, 0x0026 => 1, -1 => 1, # <, &
8162 $self->{entity_add} => 1,
8163 }->{$self->{nc}}) {
8164 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare ero',
8165 line => $self->{line_prev},
8166 column => $self->{column_prev}
8167 + ($self->{nc} == -1 ? 1 : 0));
8168 ## Don't consume
8169 ## Return nothing.
8170 #
8171 } elsif ($self->{nc} == 0x0023) { # #
8172 $self->{ca} = $self->{ct};
8173 $self->{state} = ENTITY_HASH_STATE;
8174 $self->{kwd} = '#';
8175
8176 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8177 $self->{line_prev} = $self->{line};
8178 $self->{column_prev} = $self->{column};
8179 $self->{column}++;
8180 $self->{nc}
8181 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8182 } else {
8183 $self->{set_nc}->($self);
8184 }
8185
8186 redo A;
8187 } else {
8188 #
8189 }
8190
8191 $self->{ct}->{value} .= '&';
8192 $self->{state} = $self->{prev_state};
8193 ## Reconsume.
8194 redo A;
8195 } elsif ($self->{state} == AFTER_ELEMENT_NAME_STATE) {
8196 if ($is_space->{$self->{nc}}) {
8197 $self->{state} = BEFORE_ELEMENT_CONTENT_STATE;
8198
8199 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8200 $self->{line_prev} = $self->{line};
8201 $self->{column_prev} = $self->{column};
8202 $self->{column}++;
8203 $self->{nc}
8204 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8205 } else {
8206 $self->{set_nc}->($self);
8207 }
8208
8209 redo A;
8210 } elsif ($self->{nc} == 0x0028) { # (
8211 $self->{state} = AFTER_CM_GROUP_OPEN_STATE;
8212 $self->{ct}->{content} = ['('];
8213 $self->{group_depth} = 1;
8214
8215 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8216 $self->{line_prev} = $self->{line};
8217 $self->{column_prev} = $self->{column};
8218 $self->{column}++;
8219 $self->{nc}
8220 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8221 } else {
8222 $self->{set_nc}->($self);
8223 }
8224
8225 redo A;
8226 } elsif ($self->{nc} == 0x003E) { # >
8227 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no md def'); ## TODO: type
8228 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8229
8230 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8231 $self->{line_prev} = $self->{line};
8232 $self->{column_prev} = $self->{column};
8233 $self->{column}++;
8234 $self->{nc}
8235 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8236 } else {
8237 $self->{set_nc}->($self);
8238 }
8239
8240 return ($self->{ct}); # ELEMENT
8241 redo A;
8242 } elsif ($self->{nc} == -1) {
8243 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
8244 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8245
8246 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8247 $self->{line_prev} = $self->{line};
8248 $self->{column_prev} = $self->{column};
8249 $self->{column}++;
8250 $self->{nc}
8251 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8252 } else {
8253 $self->{set_nc}->($self);
8254 }
8255
8256 return ($self->{ct}); # ELEMENT
8257 redo A;
8258 } else {
8259 $self->{ct}->{content} = [chr $self->{nc}];
8260 $self->{state} = CONTENT_KEYWORD_STATE;
8261
8262 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8263 $self->{line_prev} = $self->{line};
8264 $self->{column_prev} = $self->{column};
8265 $self->{column}++;
8266 $self->{nc}
8267 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8268 } else {
8269 $self->{set_nc}->($self);
8270 }
8271
8272 redo A;
8273 }
8274 } elsif ($self->{state} == CONTENT_KEYWORD_STATE) {
8275 if ($is_space->{$self->{nc}}) {
8276 $self->{state} = AFTER_MD_DEF_STATE;
8277
8278 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8279 $self->{line_prev} = $self->{line};
8280 $self->{column_prev} = $self->{column};
8281 $self->{column}++;
8282 $self->{nc}
8283 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8284 } else {
8285 $self->{set_nc}->($self);
8286 }
8287
8288 redo A;
8289 } elsif ($self->{nc} == 0x003E) { # >
8290 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8291
8292 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8293 $self->{line_prev} = $self->{line};
8294 $self->{column_prev} = $self->{column};
8295 $self->{column}++;
8296 $self->{nc}
8297 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8298 } else {
8299 $self->{set_nc}->($self);
8300 }
8301
8302 return ($self->{ct}); # ELEMENT
8303 redo A;
8304 } elsif ($self->{nc} == -1) {
8305 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
8306 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8307
8308 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8309 $self->{line_prev} = $self->{line};
8310 $self->{column_prev} = $self->{column};
8311 $self->{column}++;
8312 $self->{nc}
8313 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8314 } else {
8315 $self->{set_nc}->($self);
8316 }
8317
8318 return ($self->{ct}); # ELEMENT
8319 redo A;
8320 } else {
8321 $self->{ct}->{content}->[-1] .= chr $self->{nc}; # ELEMENT
8322 ## Stay in the state.
8323
8324 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8325 $self->{line_prev} = $self->{line};
8326 $self->{column_prev} = $self->{column};
8327 $self->{column}++;
8328 $self->{nc}
8329 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8330 } else {
8331 $self->{set_nc}->($self);
8332 }
8333
8334 redo A;
8335 }
8336 } elsif ($self->{state} == AFTER_CM_GROUP_OPEN_STATE) {
8337 if ($is_space->{$self->{nc}}) {
8338 ## Stay in the state.
8339
8340 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8341 $self->{line_prev} = $self->{line};
8342 $self->{column_prev} = $self->{column};
8343 $self->{column}++;
8344 $self->{nc}
8345 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8346 } else {
8347 $self->{set_nc}->($self);
8348 }
8349
8350 redo A;
8351 } elsif ($self->{nc} == 0x0028) { # (
8352 $self->{group_depth}++;
8353 push @{$self->{ct}->{content}}, chr $self->{nc};
8354 ## Stay in the state.
8355
8356 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8357 $self->{line_prev} = $self->{line};
8358 $self->{column_prev} = $self->{column};
8359 $self->{column}++;
8360 $self->{nc}
8361 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8362 } else {
8363 $self->{set_nc}->($self);
8364 }
8365
8366 redo A;
8367 } elsif ($self->{nc} == 0x007C or # |
8368 $self->{nc} == 0x002C) { # ,
8369 $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty element name'); ## TODO: type
8370 ## Stay in the state.
8371
8372 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8373 $self->{line_prev} = $self->{line};
8374 $self->{column_prev} = $self->{column};
8375 $self->{column}++;
8376 $self->{nc}
8377 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8378 } else {
8379 $self->{set_nc}->($self);
8380 }
8381
8382 redo A;
8383 } elsif ($self->{nc} == 0x0029) { # )
8384 $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty element name'); ## TODO: type
8385 push @{$self->{ct}->{content}}, chr $self->{nc};
8386 $self->{group_depth}--;
8387 $self->{state} = AFTER_CM_GROUP_CLOSE_STATE;
8388
8389 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8390 $self->{line_prev} = $self->{line};
8391 $self->{column_prev} = $self->{column};
8392 $self->{column}++;
8393 $self->{nc}
8394 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8395 } else {
8396 $self->{set_nc}->($self);
8397 }
8398
8399 redo A;
8400 } elsif ($self->{nc} == 0x003E) { # >
8401 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed cm group'); ## TODO: type
8402 push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8403 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8404
8405 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8406 $self->{line_prev} = $self->{line};
8407 $self->{column_prev} = $self->{column};
8408 $self->{column}++;
8409 $self->{nc}
8410 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8411 } else {
8412 $self->{set_nc}->($self);
8413 }
8414
8415 return ($self->{ct}); # ELEMENT
8416 redo A;
8417 } elsif ($self->{nc} == -1) {
8418 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
8419 push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8420 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8421
8422 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8423 $self->{line_prev} = $self->{line};
8424 $self->{column_prev} = $self->{column};
8425 $self->{column}++;
8426 $self->{nc}
8427 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8428 } else {
8429 $self->{set_nc}->($self);
8430 }
8431
8432 return ($self->{ct}); # ELEMENT
8433 redo A;
8434 } else {
8435 push @{$self->{ct}->{content}}, chr $self->{nc};
8436 $self->{state} = CM_ELEMENT_NAME_STATE;
8437
8438 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8439 $self->{line_prev} = $self->{line};
8440 $self->{column_prev} = $self->{column};
8441 $self->{column}++;
8442 $self->{nc}
8443 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8444 } else {
8445 $self->{set_nc}->($self);
8446 }
8447
8448 redo A;
8449 }
8450 } elsif ($self->{state} == CM_ELEMENT_NAME_STATE) {
8451 if ($is_space->{$self->{nc}}) {
8452 $self->{state} = AFTER_CM_ELEMENT_NAME_STATE;
8453
8454 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8455 $self->{line_prev} = $self->{line};
8456 $self->{column_prev} = $self->{column};
8457 $self->{column}++;
8458 $self->{nc}
8459 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8460 } else {
8461 $self->{set_nc}->($self);
8462 }
8463
8464 redo A;
8465 } elsif ($self->{nc} == 0x002A or # *
8466 $self->{nc} == 0x002B or # +
8467 $self->{nc} == 0x003F) { # ?
8468 push @{$self->{ct}->{content}}, chr $self->{nc};
8469 $self->{state} = AFTER_CM_ELEMENT_NAME_STATE;
8470
8471 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8472 $self->{line_prev} = $self->{line};
8473 $self->{column_prev} = $self->{column};
8474 $self->{column}++;
8475 $self->{nc}
8476 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8477 } else {
8478 $self->{set_nc}->($self);
8479 }
8480
8481 redo A;
8482 } elsif ($self->{nc} == 0x007C or # |
8483 $self->{nc} == 0x002C) { # ,
8484 push @{$self->{ct}->{content}}, $self->{nc} == 0x007C ? ' | ' : ', ';
8485 $self->{state} = AFTER_CM_GROUP_OPEN_STATE;
8486
8487 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8488 $self->{line_prev} = $self->{line};
8489 $self->{column_prev} = $self->{column};
8490 $self->{column}++;
8491 $self->{nc}
8492 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8493 } else {
8494 $self->{set_nc}->($self);
8495 }
8496
8497 redo A;
8498 } elsif ($self->{nc} == 0x0029) { # )
8499 $self->{group_depth}--;
8500 push @{$self->{ct}->{content}}, chr $self->{nc};
8501 $self->{state} = AFTER_CM_GROUP_CLOSE_STATE;
8502
8503 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8504 $self->{line_prev} = $self->{line};
8505 $self->{column_prev} = $self->{column};
8506 $self->{column}++;
8507 $self->{nc}
8508 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8509 } else {
8510 $self->{set_nc}->($self);
8511 }
8512
8513 redo A;
8514 } elsif ($self->{nc} == 0x003E) { # >
8515 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed cm group'); ## TODO: type
8516 push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8517 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8518
8519 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8520 $self->{line_prev} = $self->{line};
8521 $self->{column_prev} = $self->{column};
8522 $self->{column}++;
8523 $self->{nc}
8524 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8525 } else {
8526 $self->{set_nc}->($self);
8527 }
8528
8529 return ($self->{ct}); # ELEMENT
8530 redo A;
8531 } elsif ($self->{nc} == -1) {
8532 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
8533 push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8534 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8535
8536 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8537 $self->{line_prev} = $self->{line};
8538 $self->{column_prev} = $self->{column};
8539 $self->{column}++;
8540 $self->{nc}
8541 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8542 } else {
8543 $self->{set_nc}->($self);
8544 }
8545
8546 return ($self->{ct}); # ELEMENT
8547 redo A;
8548 } else {
8549 $self->{ct}->{content}->[-1] .= chr $self->{nc};
8550 ## Stay in the state.
8551
8552 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8553 $self->{line_prev} = $self->{line};
8554 $self->{column_prev} = $self->{column};
8555 $self->{column}++;
8556 $self->{nc}
8557 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8558 } else {
8559 $self->{set_nc}->($self);
8560 }
8561
8562 redo A;
8563 }
8564 } elsif ($self->{state} == AFTER_CM_ELEMENT_NAME_STATE) {
8565 if ($is_space->{$self->{nc}}) {
8566 ## Stay in the state.
8567
8568 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8569 $self->{line_prev} = $self->{line};
8570 $self->{column_prev} = $self->{column};
8571 $self->{column}++;
8572 $self->{nc}
8573 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8574 } else {
8575 $self->{set_nc}->($self);
8576 }
8577
8578 redo A;
8579 } elsif ($self->{nc} == 0x007C or # |
8580 $self->{nc} == 0x002C) { # ,
8581 push @{$self->{ct}->{content}}, $self->{nc} == 0x007C ? ' | ' : ', ';
8582 $self->{state} = AFTER_CM_GROUP_OPEN_STATE;
8583
8584 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8585 $self->{line_prev} = $self->{line};
8586 $self->{column_prev} = $self->{column};
8587 $self->{column}++;
8588 $self->{nc}
8589 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8590 } else {
8591 $self->{set_nc}->($self);
8592 }
8593
8594 redo A;
8595 } elsif ($self->{nc} == 0x0029) { # )
8596 $self->{group_depth}--;
8597 push @{$self->{ct}->{content}}, chr $self->{nc};
8598 $self->{state} = AFTER_CM_GROUP_CLOSE_STATE;
8599
8600 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8601 $self->{line_prev} = $self->{line};
8602 $self->{column_prev} = $self->{column};
8603 $self->{column}++;
8604 $self->{nc}
8605 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8606 } else {
8607 $self->{set_nc}->($self);
8608 }
8609
8610 redo A;
8611 } elsif ($self->{nc} == 0x003E) { # >
8612 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed cm group'); ## TODO: type
8613 push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8614 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8615
8616 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8617 $self->{line_prev} = $self->{line};
8618 $self->{column_prev} = $self->{column};
8619 $self->{column}++;
8620 $self->{nc}
8621 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8622 } else {
8623 $self->{set_nc}->($self);
8624 }
8625
8626 return ($self->{ct}); # ELEMENT
8627 redo A;
8628 } elsif ($self->{nc} == -1) {
8629 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
8630 push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8631 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8632
8633 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8634 $self->{line_prev} = $self->{line};
8635 $self->{column_prev} = $self->{column};
8636 $self->{column}++;
8637 $self->{nc}
8638 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8639 } else {
8640 $self->{set_nc}->($self);
8641 }
8642
8643 return ($self->{ct}); # ELEMENT
8644 redo A;
8645 } else {
8646 $self->{parse_error}->(level => $self->{level}->{must}, type => 'after element name'); ## TODO: type
8647 push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8648 $self->{state} = BOGUS_MD_STATE;
8649
8650 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8651 $self->{line_prev} = $self->{line};
8652 $self->{column_prev} = $self->{column};
8653 $self->{column}++;
8654 $self->{nc}
8655 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8656 } else {
8657 $self->{set_nc}->($self);
8658 }
8659
8660 redo A;
8661 }
8662 } elsif ($self->{state} == AFTER_CM_GROUP_CLOSE_STATE) {
8663 if ($is_space->{$self->{nc}}) {
8664 if ($self->{group_depth}) {
8665 $self->{state} = AFTER_CM_ELEMENT_NAME_STATE;
8666 } else {
8667 $self->{state} = AFTER_MD_DEF_STATE;
8668 }
8669
8670 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8671 $self->{line_prev} = $self->{line};
8672 $self->{column_prev} = $self->{column};
8673 $self->{column}++;
8674 $self->{nc}
8675 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8676 } else {
8677 $self->{set_nc}->($self);
8678 }
8679
8680 redo A;
8681 } elsif ($self->{nc} == 0x002A or # *
8682 $self->{nc} == 0x002B or # +
8683 $self->{nc} == 0x003F) { # ?
8684 push @{$self->{ct}->{content}}, chr $self->{nc};
8685 if ($self->{group_depth}) {
8686 $self->{state} = AFTER_CM_ELEMENT_NAME_STATE;
8687 } else {
8688 $self->{state} = AFTER_MD_DEF_STATE;
8689 }
8690
8691 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8692 $self->{line_prev} = $self->{line};
8693 $self->{column_prev} = $self->{column};
8694 $self->{column}++;
8695 $self->{nc}
8696 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8697 } else {
8698 $self->{set_nc}->($self);
8699 }
8700
8701 redo A;
8702 } elsif ($self->{nc} == 0x0029) { # )
8703 if ($self->{group_depth}) {
8704 $self->{group_depth}--;
8705 push @{$self->{ct}->{content}}, chr $self->{nc};
8706 ## Stay in the state.
8707
8708 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8709 $self->{line_prev} = $self->{line};
8710 $self->{column_prev} = $self->{column};
8711 $self->{column}++;
8712 $self->{nc}
8713 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8714 } else {
8715 $self->{set_nc}->($self);
8716 }
8717
8718 redo A;
8719 } else {
8720 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after md def'); ## TODO: type
8721 $self->{state} = BOGUS_MD_STATE;
8722 ## Reconsume.
8723 redo A;
8724 }
8725 } elsif ($self->{nc} == 0x003E) { # >
8726 if ($self->{group_depth}) {
8727 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed cm group'); ## TODO: type
8728 push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8729 }
8730 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8731
8732 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8733 $self->{line_prev} = $self->{line};
8734 $self->{column_prev} = $self->{column};
8735 $self->{column}++;
8736 $self->{nc}
8737 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8738 } else {
8739 $self->{set_nc}->($self);
8740 }
8741
8742 return ($self->{ct}); # ELEMENT
8743 redo A;
8744 } elsif ($self->{nc} == -1) {
8745 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
8746 push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8747 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8748
8749 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8750 $self->{line_prev} = $self->{line};
8751 $self->{column_prev} = $self->{column};
8752 $self->{column}++;
8753 $self->{nc}
8754 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8755 } else {
8756 $self->{set_nc}->($self);
8757 }
8758
8759 return ($self->{ct}); # ELEMENT
8760 redo A;
8761 } else {
8762 if ($self->{group_depth}) {
8763 $self->{state} = AFTER_CM_ELEMENT_NAME_STATE;
8764 } else {
8765 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after md def'); ## TODO: type
8766 $self->{state} = BOGUS_MD_STATE;
8767 }
8768 ## Reconsume.
8769 redo A;
8770 }
8771 } elsif ($self->{state} == AFTER_MD_DEF_STATE) {
8772 if ($is_space->{$self->{nc}}) {
8773 ## Stay in the state.
8774
8775 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8776 $self->{line_prev} = $self->{line};
8777 $self->{column_prev} = $self->{column};
8778 $self->{column}++;
8779 $self->{nc}
8780 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8781 } else {
8782 $self->{set_nc}->($self);
8783 }
8784
8785 redo A;
8786 } elsif ($self->{nc} == 0x003E) { # >
8787 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8788
8789 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8790 $self->{line_prev} = $self->{line};
8791 $self->{column_prev} = $self->{column};
8792 $self->{column}++;
8793 $self->{nc}
8794 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8795 } else {
8796 $self->{set_nc}->($self);
8797 }
8798
8799 return ($self->{ct}); # ENTITY/ELEMENT
8800 redo A;
8801 } elsif ($self->{nc} == -1) {
8802 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
8803 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8804
8805 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8806 $self->{line_prev} = $self->{line};
8807 $self->{column_prev} = $self->{column};
8808 $self->{column}++;
8809 $self->{nc}
8810 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8811 } else {
8812 $self->{set_nc}->($self);
8813 }
8814
8815 return ($self->{ct}); # ENTITY/ELEMENT
8816 redo A;
8817 } else {
8818 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after md def'); ## TODO: type
8819 $self->{state} = BOGUS_MD_STATE;
8820 ## Reconsume.
8821 redo A;
8822 }
8823 } elsif ($self->{state} == BOGUS_MD_STATE) {
8824 if ($self->{nc} == 0x003E) { # >
8825 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8826
8827 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8828 $self->{line_prev} = $self->{line};
8829 $self->{column_prev} = $self->{column};
8830 $self->{column}++;
8831 $self->{nc}
8832 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8833 } else {
8834 $self->{set_nc}->($self);
8835 }
8836
8837 return ($self->{ct}); # ATTLIST/ENTITY/NOTATION
8838 redo A;
8839 } elsif ($self->{nc} == -1) {
8840 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8841 ## Reconsume.
8842 return ($self->{ct}); # ATTLIST/ENTITY/NOTATION
8843 redo A;
8844 } else {
8845 ## Stay in the state.
8846
8847 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8848 $self->{line_prev} = $self->{line};
8849 $self->{column_prev} = $self->{column};
8850 $self->{column}++;
8851 $self->{nc}
8852 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8853 } else {
8854 $self->{set_nc}->($self);
8855 }
8856
8857 redo A;
8858 }
8859 } else {
8860 die "$0: $self->{state}: Unknown state";
8861 }
8862 } # A
8863
8864 die "$0: _get_next_token: unexpected case";
8865 } # _get_next_token
8866
8867 1;
8868 ## $Date: 2009/09/05 09:57:55 $
8869

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24