/[suikacvs]/markup/html/whatpm/Whatpm/HTML/Tokenizer.pm.src
Suika

Contents of /markup/html/whatpm/Whatpm/HTML/Tokenizer.pm.src

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.1 - (hide annotations) (download) (as text)
Tue Oct 14 02:27:58 2008 UTC (16 years ago) by wakaba
Branch: MAIN
File MIME type: application/x-wais-source
++ whatpm/Whatpm/ChangeLog	14 Oct 2008 02:26:16 -0000
2008-10-14  Wakaba  <wakaba@suika.fam.cx>

	* Makefile: New rule to make HTML/Tokenizer.pm is added.

	* HTML.pm.src: Tokenizer part moved to another file.

++ whatpm/Whatpm/HTML/ChangeLog	14 Oct 2008 02:25:46 -0000
2008-10-14  Wakaba  <wakaba@suika.fam.cx>

	* Tokenizer.pm.src: New file.

## Package header for the tokenizer part of the HTML parser.
## |$VERSION| is derived from the CVS |Revision| keyword: every run of
## digits is extracted and formatted as "%d" followed by ".%02d" for
## each remaining component.
1 wakaba 1.1 package Whatpm::HTML::Tokenizer;
2     use strict;
3     our $VERSION=do{my @r=(q$Revision: 1.207 $=~/\d+/g);sprintf "%d."."%02d" x $#r,@r};
4    
## The subs defined below (constants, |_initialize_tokenizer|,
## |_get_next_token|) are compiled into the |Whatpm::HTML| namespace,
## not into |Whatpm::HTML::Tokenizer|.
5     package Whatpm::HTML;
6    
7     ## Content model flags
8    
## Bit flags combined into |$self->{content_model}|.  The data state
## tests them with |&| to decide how "&" and "<" are treated:
##   CM_ENTITY         - "&" starts a character reference
##   CM_LIMITED_MARKUP - "<" is honored only outside an escaped
##                       ("<!--" ... "-->") span
##   CM_FULL_MARKUP    - "<" always opens markup
9     sub CM_ENTITY () { 0b001 } # & markup in data
10     sub CM_LIMITED_MARKUP () { 0b010 } # < markup in data (limited)
11     sub CM_FULL_MARKUP () { 0b100 } # < markup in data (any)
12    
## The four content models, each expressed as a combination of the
## |CM_*| flag bits:
##   PLAINTEXT - no flags (neither "&" nor "<" is markup)
##   CDATA     - limited "<" markup only
##   RCDATA    - character references plus limited "<" markup
##   PCDATA    - character references plus full "<" markup
13     sub PLAINTEXT_CONTENT_MODEL () { 0 }
14     sub CDATA_CONTENT_MODEL () { CM_LIMITED_MARKUP }
15     sub RCDATA_CONTENT_MODEL () { CM_ENTITY | CM_LIMITED_MARKUP }
16     sub PCDATA_CONTENT_MODEL () { CM_ENTITY | CM_FULL_MARKUP }
17    
18     ## Tokenizer states
19    
## Numeric identifiers for the tokenizer states; the current one is
## kept in |$self->{state}| and compared with |==| in
## |_get_next_token|.  Values 1 and 12 are retired (their constants are
## commented out); the character reference handling they stood for is
## implemented by the |ENTITY_STATE| .. |ENTITY_NAME_STATE| group
## below.  States 36-43 and 50 are implementation-specific splits of
## single spec states, as noted on each line.
20     sub DATA_STATE () { 0 }
21     #sub ENTITY_DATA_STATE () { 1 }
22     sub TAG_OPEN_STATE () { 2 }
23     sub CLOSE_TAG_OPEN_STATE () { 3 }
24     sub TAG_NAME_STATE () { 4 }
25     sub BEFORE_ATTRIBUTE_NAME_STATE () { 5 }
26     sub ATTRIBUTE_NAME_STATE () { 6 }
27     sub AFTER_ATTRIBUTE_NAME_STATE () { 7 }
28     sub BEFORE_ATTRIBUTE_VALUE_STATE () { 8 }
29     sub ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE () { 9 }
30     sub ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE () { 10 }
31     sub ATTRIBUTE_VALUE_UNQUOTED_STATE () { 11 }
32     #sub ENTITY_IN_ATTRIBUTE_VALUE_STATE () { 12 }
33     sub MARKUP_DECLARATION_OPEN_STATE () { 13 }
34     sub COMMENT_START_STATE () { 14 }
35     sub COMMENT_START_DASH_STATE () { 15 }
36     sub COMMENT_STATE () { 16 }
37     sub COMMENT_END_STATE () { 17 }
38     sub COMMENT_END_DASH_STATE () { 18 }
39     sub BOGUS_COMMENT_STATE () { 19 }
40     sub DOCTYPE_STATE () { 20 }
41     sub BEFORE_DOCTYPE_NAME_STATE () { 21 }
42     sub DOCTYPE_NAME_STATE () { 22 }
43     sub AFTER_DOCTYPE_NAME_STATE () { 23 }
44     sub BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE () { 24 }
45     sub DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE () { 25 }
46     sub DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE () { 26 }
47     sub AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE () { 27 }
48     sub BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE () { 28 }
49     sub DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE () { 29 }
50     sub DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE () { 30 }
51     sub AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE () { 31 }
52     sub BOGUS_DOCTYPE_STATE () { 32 }
53     sub AFTER_ATTRIBUTE_VALUE_QUOTED_STATE () { 33 }
54     sub SELF_CLOSING_START_TAG_STATE () { 34 }
55     sub CDATA_SECTION_STATE () { 35 }
56     sub MD_HYPHEN_STATE () { 36 } # "markup declaration open state" in the spec
57     sub MD_DOCTYPE_STATE () { 37 } # "markup declaration open state" in the spec
58     sub MD_CDATA_STATE () { 38 } # "markup declaration open state" in the spec
59     sub CDATA_RCDATA_CLOSE_TAG_STATE () { 39 } # "close tag open state" in the spec
60     sub CDATA_SECTION_MSE1_STATE () { 40 } # "CDATA section state" in the spec
61     sub CDATA_SECTION_MSE2_STATE () { 41 } # "CDATA section state" in the spec
62     sub PUBLIC_STATE () { 42 } # "after DOCTYPE name state" in the spec
63     sub SYSTEM_STATE () { 43 } # "after DOCTYPE name state" in the spec
64     ## NOTE: "Entity data state", "entity in attribute value state", and
65     ## "consume a character reference" algorithm are jointly implemented
66     ## using the following six states:
67     sub ENTITY_STATE () { 44 }
68     sub ENTITY_HASH_STATE () { 45 }
69     sub NCR_NUM_STATE () { 46 }
70     sub HEXREF_X_STATE () { 47 }
71     sub HEXREF_HEX_STATE () { 48 }
72     sub ENTITY_NAME_STATE () { 49 }
73     sub PCDATA_STATE () { 50 } # "data state" in the spec
74    
75     ## Token types
76    
## Values of the |type| field of a token hash emitted by the tokenizer.
77     sub DOCTYPE_TOKEN () { 1 }
78     sub COMMENT_TOKEN () { 2 }
79     sub START_TAG_TOKEN () { 3 }
80     sub END_TAG_TOKEN () { 4 }
81     sub END_OF_FILE_TOKEN () { 5 }
82     sub CHARACTER_TOKEN () { 6 }
83    
84     ## Tree constructor state constants (see Whatpm::HTML for the full
85     ## list and descriptions)
86    
## NOTE(review): both constants evaluate to the same number (bit 11
## set, i.e. 2048); the |_| in the second literal is only a digit
## separator.  Presumably they live in different flag spaces
## (insertion mode vs. element category) - confirm against
## Whatpm::HTML.
87     sub IN_FOREIGN_CONTENT_IM () { 0b100000000000 }
88     sub FOREIGN_EL () { 0b1_00000000000 }
89    
90     ## Character reference mappings
91    
## Replacement table keyed by code point: a key maps to the code point
## that is to be used instead of it.  Presumably consulted when a
## numeric character reference is resolved (the lookup is not in this
## part of the file - confirm).
##
## The literal entries map CR to LF and remap 0x80..0x9F; the targets
## appear to follow the Windows-1252 assignments, with the unassigned
## positions (0x81, 0x8D, 0x8F, 0x90, 0x9D) going to U+FFFD.
92     my $charref_map = {
93     0x0D => 0x000A,
94     0x80 => 0x20AC,
95     0x81 => 0xFFFD,
96     0x82 => 0x201A,
97     0x83 => 0x0192,
98     0x84 => 0x201E,
99     0x85 => 0x2026,
100     0x86 => 0x2020,
101     0x87 => 0x2021,
102     0x88 => 0x02C6,
103     0x89 => 0x2030,
104     0x8A => 0x0160,
105     0x8B => 0x2039,
106     0x8C => 0x0152,
107     0x8D => 0xFFFD,
108     0x8E => 0x017D,
109     0x8F => 0xFFFD,
110     0x90 => 0xFFFD,
111     0x91 => 0x2018,
112     0x92 => 0x2019,
113     0x93 => 0x201C,
114     0x94 => 0x201D,
115     0x95 => 0x2022,
116     0x96 => 0x2013,
117     0x97 => 0x2014,
118     0x98 => 0x02DC,
119     0x99 => 0x2122,
120     0x9A => 0x0161,
121     0x9B => 0x203A,
122     0x9C => 0x0153,
123     0x9D => 0xFFFD,
124     0x9E => 0x017E,
125     0x9F => 0x0178,
126     }; # $charref_map
## Everything listed below is replaced by U+FFFD: the control codes
## below U+0020 other than HT, LF, FF and CR; U+007F; the surrogate
## range; U+FDD0..U+FDDF; and the last two code points (xFFFE, xFFFF)
## of every plane up to U+10FFFF.
## NOTE(review): the Unicode noncharacter block runs through U+FDEF,
## but the range here stops at U+FDDF, which is what the ISSUE marker
## below refers to - confirm against the spec revision being
## implemented before widening it.
127     $charref_map->{$_} = 0xFFFD
128     for 0x0000..0x0008, 0x000B, 0x000E..0x001F, 0x007F,
129     0xD800..0xDFFF, 0xFDD0..0xFDDF, ## ISSUE: 0xFDEF
130     0xFFFE, 0xFFFF, 0x1FFFE, 0x1FFFF, 0x2FFFE, 0x2FFFF, 0x3FFFE, 0x3FFFF,
131     0x4FFFE, 0x4FFFF, 0x5FFFE, 0x5FFFF, 0x6FFFE, 0x6FFFF, 0x7FFFE,
132     0x7FFFF, 0x8FFFE, 0x8FFFF, 0x9FFFE, 0x9FFFF, 0xAFFFE, 0xAFFFF,
133     0xBFFFE, 0xBFFFF, 0xCFFFE, 0xCFFFF, 0xDFFFE, 0xDFFFF, 0xEFFFE,
134     0xEFFFF, 0xFFFFE, 0xFFFFF, 0x10FFFE, 0x10FFFF;
135    
136     ## Implementations MUST act as if they used the state machine in the spec.
137    
## Resets the tokenizer fields of the parser object |$self| (the only
## argument) to their start-of-input values: the state becomes
## |DATA_STATE|, the content model |PCDATA_CONTENT_MODEL|, the current
## token, current attribute and last start tag name are undefined, the
## |self_closing| flag is removed, the character buffer is emptied and
## the queue of pending tokens (|$self->{token}|) is cleared.
## |$self->{nc}| is first set to -1 (the value the tokenizer treats as
## end of input) and then the |!!!next-input-character| macro is
## invoked; presumably the macro, expanded by the build step, loads
## the first input character - confirm against the macro definition.
138     sub _initialize_tokenizer ($) {
139     my $self = shift;
140    
141     ## NOTE: Fields set by |new| constructor:
142     #$self->{level}
143     #$self->{set_nc}
144     #$self->{parse_error}
145    
146     $self->{state} = DATA_STATE; # MUST
147     #$self->{s_kwd}; # state keyword - initialized when used
148     #$self->{entity__value}; # initialized when used
149     #$self->{entity__match}; # initialized when used
150     $self->{content_model} = PCDATA_CONTENT_MODEL; # be
151     undef $self->{ct}; # current token
152     undef $self->{ca}; # current attribute
153     undef $self->{last_stag_name}; # last emitted start tag name
154     #$self->{prev_state}; # initialized when used
155     delete $self->{self_closing};
156     $self->{char_buffer} = '';
157     $self->{char_buffer_pos} = 0;
158     $self->{nc} = -1; # next input character
159     #$self->{next_nc}
160     !!!next-input-character;
161     $self->{token} = [];
162     # $self->{escape}
163     } # _initialize_tokenizer
164    
165     ## A token has:
166     ## ->{type} == DOCTYPE_TOKEN, START_TAG_TOKEN, END_TAG_TOKEN, COMMENT_TOKEN,
167     ## CHARACTER_TOKEN, or END_OF_FILE_TOKEN
168     ## ->{name} (DOCTYPE_TOKEN)
169     ## ->{tag_name} (START_TAG_TOKEN, END_TAG_TOKEN)
170     ## ->{pubid} (DOCTYPE_TOKEN)
171     ## ->{sysid} (DOCTYPE_TOKEN)
172     ## ->{quirks} == 1 or 0 (DOCTYPE_TOKEN): "force-quirks" flag
173     ## ->{attributes} isa HASH (START_TAG_TOKEN, END_TAG_TOKEN)
174     ## ->{name}
175     ## ->{value}
176     ## ->{has_reference} == 1 or 0
177     ## ->{data} (COMMENT_TOKEN, CHARACTER_TOKEN)
178     ## NOTE: The "self-closing flag" is held as |$self->{self_closing}|.
179     ## |->{self_closing}| is used to save the value of |$self->{self_closing}|
180     ## while the token is pushed back to the stack.
181    
182     ## Emitted token MUST immediately be handled by the tree construction state.
183    
184     ## Before each step, UA MAY check to see if either one of the scripts in
185     ## "list of scripts that will execute as soon as possible" or the first
186     ## script in the "list of scripts that will execute asynchronously",
187     ## has completed loading. If one has, then it MUST be executed
188     ## and removed from the list.
189    
190     ## TODO: Polytheistic slash SHOULD NOT be used. (Applied only to atheists.)
191     ## (This requirement was dropped from HTML5 spec, unfortunately.)
192    
## Set of code points the tokenizer treats as white space, looked up
## as |$is_space->{$self->{nc}}|.  VT and CR are deliberately left out
## (their entries are commented out).
## NOTE(review): presumably CR never reaches the tokenizer because
## newlines are normalized earlier - confirm against the input layer.
193     my $is_space = {
194     0x0009 => 1, # CHARACTER TABULATION (HT)
195     0x000A => 1, # LINE FEED (LF)
196     #0x000B => 0, # LINE TABULATION (VT)
197     0x000C => 1, # FORM FEED (FF)
198     #0x000D => 1, # CARRIAGE RETURN (CR)
199     0x0020 => 1, # SPACE (SP)
200     };
201    
202     sub _get_next_token ($) {
203     my $self = shift;
204    
205     if ($self->{self_closing}) {
206     !!!parse-error (type => 'nestc', token => $self->{ct});
207     ## NOTE: The |self_closing| flag is only set by start tag token.
208     ## In addition, when a start tag token is emitted, it is always set to
209     ## |ct|.
210     delete $self->{self_closing};
211     }
212    
213     if (@{$self->{token}}) {
214     $self->{self_closing} = $self->{token}->[0]->{self_closing};
215     return shift @{$self->{token}};
216     }
217    
218     A: {
219     if ($self->{state} == PCDATA_STATE) {
220     ## NOTE: Same as |DATA_STATE|, but only for |PCDATA| content model.
221    
222     if ($self->{nc} == 0x0026) { # &
223     !!!cp (0.1);
224     ## NOTE: In the spec, the tokenizer is switched to the
225     ## "entity data state". In this implementation, the tokenizer
226     ## is switched to the |ENTITY_STATE|, which is an implementation
227     ## of the "consume a character reference" algorithm.
228     $self->{entity_add} = -1;
229     $self->{prev_state} = DATA_STATE;
230     $self->{state} = ENTITY_STATE;
231     !!!next-input-character;
232     redo A;
233     } elsif ($self->{nc} == 0x003C) { # <
234     !!!cp (0.2);
235     $self->{state} = TAG_OPEN_STATE;
236     !!!next-input-character;
237     redo A;
238     } elsif ($self->{nc} == -1) {
239     !!!cp (0.3);
240     !!!emit ({type => END_OF_FILE_TOKEN,
241     line => $self->{line}, column => $self->{column}});
242     last A; ## TODO: ok?
243     } else {
244     !!!cp (0.4);
245     #
246     }
247    
248     # Anything else
249     my $token = {type => CHARACTER_TOKEN,
250     data => chr $self->{nc},
251     line => $self->{line}, column => $self->{column},
252     };
253     $self->{read_until}->($token->{data}, q[<&], length $token->{data});
254    
255     ## Stay in the state.
256     !!!next-input-character;
257     !!!emit ($token);
258     redo A;
259     } elsif ($self->{state} == DATA_STATE) {
260     $self->{s_kwd} = '' unless defined $self->{s_kwd};
261     if ($self->{nc} == 0x0026) { # &
262     $self->{s_kwd} = '';
263     if ($self->{content_model} & CM_ENTITY and # PCDATA | RCDATA
264     not $self->{escape}) {
265     !!!cp (1);
266     ## NOTE: In the spec, the tokenizer is switched to the
267     ## "entity data state". In this implementation, the tokenizer
268     ## is switched to the |ENTITY_STATE|, which is an implementation
269     ## of the "consume a character reference" algorithm.
270     $self->{entity_add} = -1;
271     $self->{prev_state} = DATA_STATE;
272     $self->{state} = ENTITY_STATE;
273     !!!next-input-character;
274     redo A;
275     } else {
276     !!!cp (2);
277     #
278     }
279     } elsif ($self->{nc} == 0x002D) { # -
280     if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
281     $self->{s_kwd} .= '-';
282    
283     if ($self->{s_kwd} eq '<!--') {
284     !!!cp (3);
285     $self->{escape} = 1; # unless $self->{escape};
286     $self->{s_kwd} = '--';
287     #
288     } elsif ($self->{s_kwd} eq '---') {
289     !!!cp (4);
290     $self->{s_kwd} = '--';
291     #
292     } else {
293     !!!cp (5);
294     #
295     }
296     }
297    
298     #
299     } elsif ($self->{nc} == 0x0021) { # !
300     if (length $self->{s_kwd}) {
301     !!!cp (5.1);
302     $self->{s_kwd} .= '!';
303     #
304     } else {
305     !!!cp (5.2);
306     #$self->{s_kwd} = '';
307     #
308     }
309     #
310     } elsif ($self->{nc} == 0x003C) { # <
311     if ($self->{content_model} & CM_FULL_MARKUP or # PCDATA
312     (($self->{content_model} & CM_LIMITED_MARKUP) and # CDATA | RCDATA
313     not $self->{escape})) {
314     !!!cp (6);
315     $self->{state} = TAG_OPEN_STATE;
316     !!!next-input-character;
317     redo A;
318     } else {
319     !!!cp (7);
320     $self->{s_kwd} = '';
321     #
322     }
323     } elsif ($self->{nc} == 0x003E) { # >
324     if ($self->{escape} and
325     ($self->{content_model} & CM_LIMITED_MARKUP)) { # RCDATA | CDATA
326     if ($self->{s_kwd} eq '--') {
327     !!!cp (8);
328     delete $self->{escape};
329     } else {
330     !!!cp (9);
331     }
332     } else {
333     !!!cp (10);
334     }
335    
336     $self->{s_kwd} = '';
337     #
338     } elsif ($self->{nc} == -1) {
339     !!!cp (11);
340     $self->{s_kwd} = '';
341     !!!emit ({type => END_OF_FILE_TOKEN,
342     line => $self->{line}, column => $self->{column}});
343     last A; ## TODO: ok?
344     } else {
345     !!!cp (12);
346     $self->{s_kwd} = '';
347     #
348     }
349    
350     # Anything else
351     my $token = {type => CHARACTER_TOKEN,
352     data => chr $self->{nc},
353     line => $self->{line}, column => $self->{column},
354     };
355     if ($self->{read_until}->($token->{data}, q[-!<>&],
356     length $token->{data})) {
357     $self->{s_kwd} = '';
358     }
359    
360     ## Stay in the data state.
361     if ($self->{content_model} == PCDATA_CONTENT_MODEL) {
362     !!!cp (13);
363     $self->{state} = PCDATA_STATE;
364     } else {
365     !!!cp (14);
366     ## Stay in the state.
367     }
368     !!!next-input-character;
369     !!!emit ($token);
370     redo A;
371     } elsif ($self->{state} == TAG_OPEN_STATE) {
372     if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
373     if ($self->{nc} == 0x002F) { # /
374     !!!cp (15);
375     !!!next-input-character;
376     $self->{state} = CLOSE_TAG_OPEN_STATE;
377     redo A;
378     } elsif ($self->{nc} == 0x0021) { # !
379     !!!cp (15.1);
380     $self->{s_kwd} = '<' unless $self->{escape};
381     #
382     } else {
383     !!!cp (16);
384     #
385     }
386    
387     ## reconsume
388     $self->{state} = DATA_STATE;
389     !!!emit ({type => CHARACTER_TOKEN, data => '<',
390     line => $self->{line_prev},
391     column => $self->{column_prev},
392     });
393     redo A;
394     } elsif ($self->{content_model} & CM_FULL_MARKUP) { # PCDATA
395     if ($self->{nc} == 0x0021) { # !
396     !!!cp (17);
397     $self->{state} = MARKUP_DECLARATION_OPEN_STATE;
398     !!!next-input-character;
399     redo A;
400     } elsif ($self->{nc} == 0x002F) { # /
401     !!!cp (18);
402     $self->{state} = CLOSE_TAG_OPEN_STATE;
403     !!!next-input-character;
404     redo A;
405     } elsif (0x0041 <= $self->{nc} and
406     $self->{nc} <= 0x005A) { # A..Z
407     !!!cp (19);
408     $self->{ct}
409     = {type => START_TAG_TOKEN,
410     tag_name => chr ($self->{nc} + 0x0020),
411     line => $self->{line_prev},
412     column => $self->{column_prev}};
413     $self->{state} = TAG_NAME_STATE;
414     !!!next-input-character;
415     redo A;
416     } elsif (0x0061 <= $self->{nc} and
417     $self->{nc} <= 0x007A) { # a..z
418     !!!cp (20);
419     $self->{ct} = {type => START_TAG_TOKEN,
420     tag_name => chr ($self->{nc}),
421     line => $self->{line_prev},
422     column => $self->{column_prev}};
423     $self->{state} = TAG_NAME_STATE;
424     !!!next-input-character;
425     redo A;
426     } elsif ($self->{nc} == 0x003E) { # >
427     !!!cp (21);
428     !!!parse-error (type => 'empty start tag',
429     line => $self->{line_prev},
430     column => $self->{column_prev});
431     $self->{state} = DATA_STATE;
432     !!!next-input-character;
433    
434     !!!emit ({type => CHARACTER_TOKEN, data => '<>',
435     line => $self->{line_prev},
436     column => $self->{column_prev},
437     });
438    
439     redo A;
440     } elsif ($self->{nc} == 0x003F) { # ?
441     !!!cp (22);
442     !!!parse-error (type => 'pio',
443     line => $self->{line_prev},
444     column => $self->{column_prev});
445     $self->{state} = BOGUS_COMMENT_STATE;
446     $self->{ct} = {type => COMMENT_TOKEN, data => '',
447     line => $self->{line_prev},
448     column => $self->{column_prev},
449     };
450     ## $self->{nc} is intentionally left as is
451     redo A;
452     } else {
453     !!!cp (23);
454     !!!parse-error (type => 'bare stago',
455     line => $self->{line_prev},
456     column => $self->{column_prev});
457     $self->{state} = DATA_STATE;
458     ## reconsume
459    
460     !!!emit ({type => CHARACTER_TOKEN, data => '<',
461     line => $self->{line_prev},
462     column => $self->{column_prev},
463     });
464    
465     redo A;
466     }
467     } else {
468     die "$0: $self->{content_model} in tag open";
469     }
470     } elsif ($self->{state} == CLOSE_TAG_OPEN_STATE) {
471     ## NOTE: The "close tag open state" in the spec is implemented as
472     ## |CLOSE_TAG_OPEN_STATE| and |CDATA_RCDATA_CLOSE_TAG_STATE|.
473    
474     my ($l, $c) = ($self->{line_prev}, $self->{column_prev} - 1); # "<"of"</"
475     if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
476     if (defined $self->{last_stag_name}) {
477     $self->{state} = CDATA_RCDATA_CLOSE_TAG_STATE;
478     $self->{s_kwd} = '';
479     ## Reconsume.
480     redo A;
481     } else {
482     ## No start tag token has ever been emitted
483     ## NOTE: See <http://krijnhoetmer.nl/irc-logs/whatwg/20070626#l-564>.
484     !!!cp (28);
485     $self->{state} = DATA_STATE;
486     ## Reconsume.
487     !!!emit ({type => CHARACTER_TOKEN, data => '</',
488     line => $l, column => $c,
489     });
490     redo A;
491     }
492     }
493    
494     if (0x0041 <= $self->{nc} and
495     $self->{nc} <= 0x005A) { # A..Z
496     !!!cp (29);
497     $self->{ct}
498     = {type => END_TAG_TOKEN,
499     tag_name => chr ($self->{nc} + 0x0020),
500     line => $l, column => $c};
501     $self->{state} = TAG_NAME_STATE;
502     !!!next-input-character;
503     redo A;
504     } elsif (0x0061 <= $self->{nc} and
505     $self->{nc} <= 0x007A) { # a..z
506     !!!cp (30);
507     $self->{ct} = {type => END_TAG_TOKEN,
508     tag_name => chr ($self->{nc}),
509     line => $l, column => $c};
510     $self->{state} = TAG_NAME_STATE;
511     !!!next-input-character;
512     redo A;
513     } elsif ($self->{nc} == 0x003E) { # >
514     !!!cp (31);
515     !!!parse-error (type => 'empty end tag',
516     line => $self->{line_prev}, ## "<" in "</>"
517     column => $self->{column_prev} - 1);
518     $self->{state} = DATA_STATE;
519     !!!next-input-character;
520     redo A;
521     } elsif ($self->{nc} == -1) {
522     !!!cp (32);
523     !!!parse-error (type => 'bare etago');
524     $self->{state} = DATA_STATE;
525     # reconsume
526    
527     !!!emit ({type => CHARACTER_TOKEN, data => '</',
528     line => $l, column => $c,
529     });
530    
531     redo A;
532     } else {
533     !!!cp (33);
534     !!!parse-error (type => 'bogus end tag');
535     $self->{state} = BOGUS_COMMENT_STATE;
536     $self->{ct} = {type => COMMENT_TOKEN, data => '',
537     line => $self->{line_prev}, # "<" of "</"
538     column => $self->{column_prev} - 1,
539     };
540     ## NOTE: $self->{nc} is intentionally left as is.
541     ## Although the "anything else" case of the spec does not explicitly
542     ## state that the next input character is to be reconsumed,
543     ## it will be included in the |data| of the comment token
544     ## generated from the bogus end tag, as defined in the
545     ## "bogus comment state" entry.
546     redo A;
547     }
548     } elsif ($self->{state} == CDATA_RCDATA_CLOSE_TAG_STATE) {
549     my $ch = substr $self->{last_stag_name}, length $self->{s_kwd}, 1;
550     if (length $ch) {
551     my $CH = $ch;
552     $ch =~ tr/a-z/A-Z/;
553     my $nch = chr $self->{nc};
554     if ($nch eq $ch or $nch eq $CH) {
555     !!!cp (24);
556     ## Stay in the state.
557     $self->{s_kwd} .= $nch;
558     !!!next-input-character;
559     redo A;
560     } else {
561     !!!cp (25);
562     $self->{state} = DATA_STATE;
563     ## Reconsume.
564     !!!emit ({type => CHARACTER_TOKEN,
565     data => '</' . $self->{s_kwd},
566     line => $self->{line_prev},
567     column => $self->{column_prev} - 1 - length $self->{s_kwd},
568     });
569     redo A;
570     }
571     } else { # after "<{tag-name}"
572     unless ($is_space->{$self->{nc}} or
573     {
574     0x003E => 1, # >
575     0x002F => 1, # /
576     -1 => 1, # EOF
577     }->{$self->{nc}}) {
578     !!!cp (26);
579     ## Reconsume.
580     $self->{state} = DATA_STATE;
581     !!!emit ({type => CHARACTER_TOKEN,
582     data => '</' . $self->{s_kwd},
583     line => $self->{line_prev},
584     column => $self->{column_prev} - 1 - length $self->{s_kwd},
585     });
586     redo A;
587     } else {
588     !!!cp (27);
589     $self->{ct}
590     = {type => END_TAG_TOKEN,
591     tag_name => $self->{last_stag_name},
592     line => $self->{line_prev},
593     column => $self->{column_prev} - 1 - length $self->{s_kwd}};
594     $self->{state} = TAG_NAME_STATE;
595     ## Reconsume.
596     redo A;
597     }
598     }
599     } elsif ($self->{state} == TAG_NAME_STATE) {
600     if ($is_space->{$self->{nc}}) {
601     !!!cp (34);
602     $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
603     !!!next-input-character;
604     redo A;
605     } elsif ($self->{nc} == 0x003E) { # >
606     if ($self->{ct}->{type} == START_TAG_TOKEN) {
607     !!!cp (35);
608     $self->{last_stag_name} = $self->{ct}->{tag_name};
609     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
610     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
611     #if ($self->{ct}->{attributes}) {
612     # ## NOTE: This should never be reached.
613     # !!! cp (36);
614     # !!! parse-error (type => 'end tag attribute');
615     #} else {
616     !!!cp (37);
617     #}
618     } else {
619     die "$0: $self->{ct}->{type}: Unknown token type";
620     }
621     $self->{state} = DATA_STATE;
622     !!!next-input-character;
623    
624     !!!emit ($self->{ct}); # start tag or end tag
625    
626     redo A;
627     } elsif (0x0041 <= $self->{nc} and
628     $self->{nc} <= 0x005A) { # A..Z
629     !!!cp (38);
630     $self->{ct}->{tag_name} .= chr ($self->{nc} + 0x0020);
631     # start tag or end tag
632     ## Stay in this state
633     !!!next-input-character;
634     redo A;
635     } elsif ($self->{nc} == -1) {
636     !!!parse-error (type => 'unclosed tag');
637     if ($self->{ct}->{type} == START_TAG_TOKEN) {
638     !!!cp (39);
639     $self->{last_stag_name} = $self->{ct}->{tag_name};
640     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
641     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
642     #if ($self->{ct}->{attributes}) {
643     # ## NOTE: This state should never be reached.
644     # !!! cp (40);
645     # !!! parse-error (type => 'end tag attribute');
646     #} else {
647     !!!cp (41);
648     #}
649     } else {
650     die "$0: $self->{ct}->{type}: Unknown token type";
651     }
652     $self->{state} = DATA_STATE;
653     # reconsume
654    
655     !!!emit ($self->{ct}); # start tag or end tag
656    
657     redo A;
658     } elsif ($self->{nc} == 0x002F) { # /
659     !!!cp (42);
660     $self->{state} = SELF_CLOSING_START_TAG_STATE;
661     !!!next-input-character;
662     redo A;
663     } else {
664     !!!cp (44);
665     $self->{ct}->{tag_name} .= chr $self->{nc};
666     # start tag or end tag
667     ## Stay in the state
668     !!!next-input-character;
669     redo A;
670     }
671     } elsif ($self->{state} == BEFORE_ATTRIBUTE_NAME_STATE) {
672     if ($is_space->{$self->{nc}}) {
673     !!!cp (45);
674     ## Stay in the state
675     !!!next-input-character;
676     redo A;
677     } elsif ($self->{nc} == 0x003E) { # >
678     if ($self->{ct}->{type} == START_TAG_TOKEN) {
679     !!!cp (46);
680     $self->{last_stag_name} = $self->{ct}->{tag_name};
681     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
682     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
683     if ($self->{ct}->{attributes}) {
684     !!!cp (47);
685     !!!parse-error (type => 'end tag attribute');
686     } else {
687     !!!cp (48);
688     }
689     } else {
690     die "$0: $self->{ct}->{type}: Unknown token type";
691     }
692     $self->{state} = DATA_STATE;
693     !!!next-input-character;
694    
695     !!!emit ($self->{ct}); # start tag or end tag
696    
697     redo A;
698     } elsif (0x0041 <= $self->{nc} and
699     $self->{nc} <= 0x005A) { # A..Z
700     !!!cp (49);
701     $self->{ca}
702     = {name => chr ($self->{nc} + 0x0020),
703     value => '',
704     line => $self->{line}, column => $self->{column}};
705     $self->{state} = ATTRIBUTE_NAME_STATE;
706     !!!next-input-character;
707     redo A;
708     } elsif ($self->{nc} == 0x002F) { # /
709     !!!cp (50);
710     $self->{state} = SELF_CLOSING_START_TAG_STATE;
711     !!!next-input-character;
712     redo A;
713     } elsif ($self->{nc} == -1) {
714     !!!parse-error (type => 'unclosed tag');
715     if ($self->{ct}->{type} == START_TAG_TOKEN) {
716     !!!cp (52);
717     $self->{last_stag_name} = $self->{ct}->{tag_name};
718     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
719     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
720     if ($self->{ct}->{attributes}) {
721     !!!cp (53);
722     !!!parse-error (type => 'end tag attribute');
723     } else {
724     !!!cp (54);
725     }
726     } else {
727     die "$0: $self->{ct}->{type}: Unknown token type";
728     }
729     $self->{state} = DATA_STATE;
730     # reconsume
731    
732     !!!emit ($self->{ct}); # start tag or end tag
733    
734     redo A;
735     } else {
736     if ({
737     0x0022 => 1, # "
738     0x0027 => 1, # '
739     0x003D => 1, # =
740     }->{$self->{nc}}) {
741     !!!cp (55);
742     !!!parse-error (type => 'bad attribute name');
743     } else {
744     !!!cp (56);
745     }
746     $self->{ca}
747     = {name => chr ($self->{nc}),
748     value => '',
749     line => $self->{line}, column => $self->{column}};
750     $self->{state} = ATTRIBUTE_NAME_STATE;
751     !!!next-input-character;
752     redo A;
753     }
754     } elsif ($self->{state} == ATTRIBUTE_NAME_STATE) {
755     my $before_leave = sub {
756     if (exists $self->{ct}->{attributes} # start tag or end tag
757     ->{$self->{ca}->{name}}) { # MUST
758     !!!cp (57);
759     !!!parse-error (type => 'duplicate attribute', text => $self->{ca}->{name}, line => $self->{ca}->{line}, column => $self->{ca}->{column});
760     ## Discard $self->{ca} # MUST
761     } else {
762     !!!cp (58);
763     $self->{ct}->{attributes}->{$self->{ca}->{name}}
764     = $self->{ca};
765     }
766     }; # $before_leave
767    
768     if ($is_space->{$self->{nc}}) {
769     !!!cp (59);
770     $before_leave->();
771     $self->{state} = AFTER_ATTRIBUTE_NAME_STATE;
772     !!!next-input-character;
773     redo A;
774     } elsif ($self->{nc} == 0x003D) { # =
775     !!!cp (60);
776     $before_leave->();
777     $self->{state} = BEFORE_ATTRIBUTE_VALUE_STATE;
778     !!!next-input-character;
779     redo A;
780     } elsif ($self->{nc} == 0x003E) { # >
781     $before_leave->();
782     if ($self->{ct}->{type} == START_TAG_TOKEN) {
783     !!!cp (61);
784     $self->{last_stag_name} = $self->{ct}->{tag_name};
785     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
786     !!!cp (62);
787     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
788     if ($self->{ct}->{attributes}) {
789     !!!parse-error (type => 'end tag attribute');
790     }
791     } else {
792     die "$0: $self->{ct}->{type}: Unknown token type";
793     }
794     $self->{state} = DATA_STATE;
795     !!!next-input-character;
796    
797     !!!emit ($self->{ct}); # start tag or end tag
798    
799     redo A;
800     } elsif (0x0041 <= $self->{nc} and
801     $self->{nc} <= 0x005A) { # A..Z
802     !!!cp (63);
803     $self->{ca}->{name} .= chr ($self->{nc} + 0x0020);
804     ## Stay in the state
805     !!!next-input-character;
806     redo A;
807     } elsif ($self->{nc} == 0x002F) { # /
808     !!!cp (64);
809     $before_leave->();
810     $self->{state} = SELF_CLOSING_START_TAG_STATE;
811     !!!next-input-character;
812     redo A;
813     } elsif ($self->{nc} == -1) {
814     !!!parse-error (type => 'unclosed tag');
815     $before_leave->();
816     if ($self->{ct}->{type} == START_TAG_TOKEN) {
817     !!!cp (66);
818     $self->{last_stag_name} = $self->{ct}->{tag_name};
819     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
820     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
821     if ($self->{ct}->{attributes}) {
822     !!!cp (67);
823     !!!parse-error (type => 'end tag attribute');
824     } else {
825     ## NOTE: This state should never be reached.
826     !!!cp (68);
827     }
828     } else {
829     die "$0: $self->{ct}->{type}: Unknown token type";
830     }
831     $self->{state} = DATA_STATE;
832     # reconsume
833    
834     !!!emit ($self->{ct}); # start tag or end tag
835    
836     redo A;
837     } else {
838     if ($self->{nc} == 0x0022 or # "
839     $self->{nc} == 0x0027) { # '
840     !!!cp (69);
841     !!!parse-error (type => 'bad attribute name');
842     } else {
843     !!!cp (70);
844     }
845     $self->{ca}->{name} .= chr ($self->{nc});
846     ## Stay in the state
847     !!!next-input-character;
848     redo A;
849     }
850     } elsif ($self->{state} == AFTER_ATTRIBUTE_NAME_STATE) {
851     if ($is_space->{$self->{nc}}) {
852     !!!cp (71);
853     ## Stay in the state
854     !!!next-input-character;
855     redo A;
856     } elsif ($self->{nc} == 0x003D) { # =
857     !!!cp (72);
858     $self->{state} = BEFORE_ATTRIBUTE_VALUE_STATE;
859     !!!next-input-character;
860     redo A;
861     } elsif ($self->{nc} == 0x003E) { # >
862     if ($self->{ct}->{type} == START_TAG_TOKEN) {
863     !!!cp (73);
864     $self->{last_stag_name} = $self->{ct}->{tag_name};
865     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
866     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
867     if ($self->{ct}->{attributes}) {
868     !!!cp (74);
869     !!!parse-error (type => 'end tag attribute');
870     } else {
871     ## NOTE: This state should never be reached.
872     !!!cp (75);
873     }
874     } else {
875     die "$0: $self->{ct}->{type}: Unknown token type";
876     }
877     $self->{state} = DATA_STATE;
878     !!!next-input-character;
879    
880     !!!emit ($self->{ct}); # start tag or end tag
881    
882     redo A;
883     } elsif (0x0041 <= $self->{nc} and
884     $self->{nc} <= 0x005A) { # A..Z
885     !!!cp (76);
886     $self->{ca}
887     = {name => chr ($self->{nc} + 0x0020),
888     value => '',
889     line => $self->{line}, column => $self->{column}};
890     $self->{state} = ATTRIBUTE_NAME_STATE;
891     !!!next-input-character;
892     redo A;
893     } elsif ($self->{nc} == 0x002F) { # /
894     !!!cp (77);
895     $self->{state} = SELF_CLOSING_START_TAG_STATE;
896     !!!next-input-character;
897     redo A;
898     } elsif ($self->{nc} == -1) {
899     !!!parse-error (type => 'unclosed tag');
900     if ($self->{ct}->{type} == START_TAG_TOKEN) {
901     !!!cp (79);
902     $self->{last_stag_name} = $self->{ct}->{tag_name};
903     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
904     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
905     if ($self->{ct}->{attributes}) {
906     !!!cp (80);
907     !!!parse-error (type => 'end tag attribute');
908     } else {
909     ## NOTE: This state should never be reached.
910     !!!cp (81);
911     }
912     } else {
913     die "$0: $self->{ct}->{type}: Unknown token type";
914     }
915     $self->{state} = DATA_STATE;
916     # reconsume
917    
918     !!!emit ($self->{ct}); # start tag or end tag
919    
920     redo A;
921     } else {
922     if ($self->{nc} == 0x0022 or # "
923     $self->{nc} == 0x0027) { # '
924     !!!cp (78);
925     !!!parse-error (type => 'bad attribute name');
926     } else {
927     !!!cp (82);
928     }
929     $self->{ca}
930     = {name => chr ($self->{nc}),
931     value => '',
932     line => $self->{line}, column => $self->{column}};
933     $self->{state} = ATTRIBUTE_NAME_STATE;
934     !!!next-input-character;
935     redo A;
936     }
937     } elsif ($self->{state} == BEFORE_ATTRIBUTE_VALUE_STATE) {
938     if ($is_space->{$self->{nc}}) {
939     !!!cp (83);
940     ## Stay in the state
941     !!!next-input-character;
942     redo A;
943     } elsif ($self->{nc} == 0x0022) { # "
944     !!!cp (84);
945     $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
946     !!!next-input-character;
947     redo A;
948     } elsif ($self->{nc} == 0x0026) { # &
949     !!!cp (85);
950     $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
951     ## reconsume
952     redo A;
953     } elsif ($self->{nc} == 0x0027) { # '
954     !!!cp (86);
955     $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
956     !!!next-input-character;
957     redo A;
958     } elsif ($self->{nc} == 0x003E) { # >
959     !!!parse-error (type => 'empty unquoted attribute value');
960     if ($self->{ct}->{type} == START_TAG_TOKEN) {
961     !!!cp (87);
962     $self->{last_stag_name} = $self->{ct}->{tag_name};
963     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
964     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
965     if ($self->{ct}->{attributes}) {
966     !!!cp (88);
967     !!!parse-error (type => 'end tag attribute');
968     } else {
969     ## NOTE: This state should never be reached.
970     !!!cp (89);
971     }
972     } else {
973     die "$0: $self->{ct}->{type}: Unknown token type";
974     }
975     $self->{state} = DATA_STATE;
976     !!!next-input-character;
977    
978     !!!emit ($self->{ct}); # start tag or end tag
979    
980     redo A;
981     } elsif ($self->{nc} == -1) {
982     !!!parse-error (type => 'unclosed tag');
983     if ($self->{ct}->{type} == START_TAG_TOKEN) {
984     !!!cp (90);
985     $self->{last_stag_name} = $self->{ct}->{tag_name};
986     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
987     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
988     if ($self->{ct}->{attributes}) {
989     !!!cp (91);
990     !!!parse-error (type => 'end tag attribute');
991     } else {
992     ## NOTE: This state should never be reached.
993     !!!cp (92);
994     }
995     } else {
996     die "$0: $self->{ct}->{type}: Unknown token type";
997     }
998     $self->{state} = DATA_STATE;
999     ## reconsume
1000    
1001     !!!emit ($self->{ct}); # start tag or end tag
1002    
1003     redo A;
1004     } else {
1005     if ($self->{nc} == 0x003D) { # =
1006     !!!cp (93);
1007     !!!parse-error (type => 'bad attribute value');
1008     } else {
1009     !!!cp (94);
1010     }
1011     $self->{ca}->{value} .= chr ($self->{nc});
1012     $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
1013     !!!next-input-character;
1014     redo A;
1015     }
1016     } elsif ($self->{state} == ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE) {
1017     if ($self->{nc} == 0x0022) { # "
1018     !!!cp (95);
1019     $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1020     !!!next-input-character;
1021     redo A;
1022     } elsif ($self->{nc} == 0x0026) { # &
1023     !!!cp (96);
1024     ## NOTE: In the spec, the tokenizer is switched to the
1025     ## "entity in attribute value state". In this implementation, the
1026     ## tokenizer is switched to the |ENTITY_STATE|, which is an
1027     ## implementation of the "consume a character reference" algorithm.
1028     $self->{prev_state} = $self->{state};
1029     $self->{entity_add} = 0x0022; # "
1030     $self->{state} = ENTITY_STATE;
1031     !!!next-input-character;
1032     redo A;
1033     } elsif ($self->{nc} == -1) {
1034     !!!parse-error (type => 'unclosed attribute value');
1035     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1036     !!!cp (97);
1037     $self->{last_stag_name} = $self->{ct}->{tag_name};
1038     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1039     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1040     if ($self->{ct}->{attributes}) {
1041     !!!cp (98);
1042     !!!parse-error (type => 'end tag attribute');
1043     } else {
1044     ## NOTE: This state should never be reached.
1045     !!!cp (99);
1046     }
1047     } else {
1048     die "$0: $self->{ct}->{type}: Unknown token type";
1049     }
1050     $self->{state} = DATA_STATE;
1051     ## reconsume
1052    
1053     !!!emit ($self->{ct}); # start tag or end tag
1054    
1055     redo A;
1056     } else {
1057     !!!cp (100);
1058     $self->{ca}->{value} .= chr ($self->{nc});
1059     $self->{read_until}->($self->{ca}->{value},
1060     q["&],
1061     length $self->{ca}->{value});
1062    
1063     ## Stay in the state
1064     !!!next-input-character;
1065     redo A;
1066     }
1067     } elsif ($self->{state} == ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE) {
1068     if ($self->{nc} == 0x0027) { # '
1069     !!!cp (101);
1070     $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1071     !!!next-input-character;
1072     redo A;
1073     } elsif ($self->{nc} == 0x0026) { # &
1074     !!!cp (102);
1075     ## NOTE: In the spec, the tokenizer is switched to the
1076     ## "entity in attribute value state". In this implementation, the
1077     ## tokenizer is switched to the |ENTITY_STATE|, which is an
1078     ## implementation of the "consume a character reference" algorithm.
1079     $self->{entity_add} = 0x0027; # '
1080     $self->{prev_state} = $self->{state};
1081     $self->{state} = ENTITY_STATE;
1082     !!!next-input-character;
1083     redo A;
1084     } elsif ($self->{nc} == -1) {
1085     !!!parse-error (type => 'unclosed attribute value');
1086     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1087     !!!cp (103);
1088     $self->{last_stag_name} = $self->{ct}->{tag_name};
1089     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1090     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1091     if ($self->{ct}->{attributes}) {
1092     !!!cp (104);
1093     !!!parse-error (type => 'end tag attribute');
1094     } else {
1095     ## NOTE: This state should never be reached.
1096     !!!cp (105);
1097     }
1098     } else {
1099     die "$0: $self->{ct}->{type}: Unknown token type";
1100     }
1101     $self->{state} = DATA_STATE;
1102     ## reconsume
1103    
1104     !!!emit ($self->{ct}); # start tag or end tag
1105    
1106     redo A;
1107     } else {
1108     !!!cp (106);
1109     $self->{ca}->{value} .= chr ($self->{nc});
1110     $self->{read_until}->($self->{ca}->{value},
1111     q['&],
1112     length $self->{ca}->{value});
1113    
1114     ## Stay in the state
1115     !!!next-input-character;
1116     redo A;
1117     }
1118     } elsif ($self->{state} == ATTRIBUTE_VALUE_UNQUOTED_STATE) {
         ## Attribute value (unquoted) state: characters are appended to
         ## |$self->{ca}->{value}| until a white space character, "&",
         ## ">", or the end of input (|nc| == -1) is seen.
1119     if ($is_space->{$self->{nc}}) {
1120     !!!cp (107);
         ## White space ends the value; more attributes may follow.
1121     $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1122     !!!next-input-character;
1123     redo A;
1124     } elsif ($self->{nc} == 0x0026) { # &
1125     !!!cp (108);
1126     ## NOTE: In the spec, the tokenizer is switched to the
1127     ## "entity in attribute value state". In this implementation, the
1128     ## tokenizer is switched to the |ENTITY_STATE|, which is an
1129     ## implementation of the "consume a character reference" algorithm.
         ## |prev_state| records the state to come back to.  The quoted
         ## value states store their quote character in |entity_add|; an
         ## unquoted value has none, hence -1.
1130     $self->{entity_add} = -1;
1131     $self->{prev_state} = $self->{state};
1132     $self->{state} = ENTITY_STATE;
1133     !!!next-input-character;
1134     redo A;
1135     } elsif ($self->{nc} == 0x003E) { # >
         ## End of the tag: finish the current token and emit it.
1136     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1137     !!!cp (109);
1138     $self->{last_stag_name} = $self->{ct}->{tag_name};
1139     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1140     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1141     if ($self->{ct}->{attributes}) {
1142     !!!cp (110);
1143     !!!parse-error (type => 'end tag attribute');
1144     } else {
1145     ## NOTE: This state should never be reached.
1146     !!!cp (111);
1147     }
1148     } else {
1149     die "$0: $self->{ct}->{type}: Unknown token type";
1150     }
1151     $self->{state} = DATA_STATE;
1152     !!!next-input-character;
1153    
1154     !!!emit ($self->{ct}); # start tag or end tag
1155    
1156     redo A;
1157     } elsif ($self->{nc} == -1) {
         ## End of input inside the tag: report it, emit the token as it
         ## stands, and leave the EOF to be handled again in the data state.
1158     !!!parse-error (type => 'unclosed tag');
1159     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1160     !!!cp (112);
1161     $self->{last_stag_name} = $self->{ct}->{tag_name};
1162     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1163     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1164     if ($self->{ct}->{attributes}) {
1165     !!!cp (113);
1166     !!!parse-error (type => 'end tag attribute');
1167     } else {
1168     ## NOTE: This state should never be reached.
1169     !!!cp (114);
1170     }
1171     } else {
1172     die "$0: $self->{ct}->{type}: Unknown token type";
1173     }
1174     $self->{state} = DATA_STATE;
1175     ## reconsume
1176    
1177     !!!emit ($self->{ct}); # start tag or end tag
1178    
1179     redo A;
1180     } else {
         ## Any other character belongs to the value.  Quotes and "=" are
         ## kept as well but are reported as an error.  (Plain comparisons
         ## are used here so that no hash is built for every character.)
1181     if (
1182     $self->{nc} == 0x0022 or # "
1183     $self->{nc} == 0x0027 or # '
1184     $self->{nc} == 0x003D # =
1185     ) {
1186     !!!cp (115);
1187     !!!parse-error (type => 'bad attribute value');
1188     } else {
1189     !!!cp (116);
1190     }
1191     $self->{ca}->{value} .= chr ($self->{nc});
         ## NOTE(review): |read_until| is defined outside this section.  It
         ## is assumed to append to its first argument a run of input
         ## characters none of which appears in its second argument --
         ## TODO confirm.  Under that assumption the stop list has to name
         ## every character this state treats specially.  The list used to
         ## contain only U+0020 among the white space characters, so a tab
         ## or form feed after the first character of a value could have
         ## been swallowed into the value.  Extra stop characters merely
         ## shorten the bulk read; they are then handled by the branches
         ## above on the next iteration.
1192     $self->{read_until}->($self->{ca}->{value},
1193     qq["'=& \x09\x0A\x0C\x0D>],
1194     length $self->{ca}->{value});
1195    
1196     ## Stay in the state
1197     !!!next-input-character;
1198     redo A;
1199     }
1200     } elsif ($self->{state} == AFTER_ATTRIBUTE_VALUE_QUOTED_STATE) {
         ## After a quoted attribute value: only white space, ">", "/" or
         ## the end of input is expected.  Anything else is reported as
         ## 'no space between attributes' and reprocessed as the start of
         ## the next attribute.
1201     if ($is_space->{$self->{nc}}) {
1202     !!!cp (118);
1203     $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1204     !!!next-input-character;
1205     redo A;
1206     } elsif ($self->{nc} == 0x003E) { # >
         ## End of the tag: finish the current token and emit it.
1207     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1208     !!!cp (119);
1209     $self->{last_stag_name} = $self->{ct}->{tag_name};
1210     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1211     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1212     if ($self->{ct}->{attributes}) {
1213     !!!cp (120);
1214     !!!parse-error (type => 'end tag attribute');
1215     } else {
1216     ## NOTE: This state should never be reached.
1217     !!!cp (121);
1218     }
1219     } else {
1220     die "$0: $self->{ct}->{type}: Unknown token type";
1221     }
1222     $self->{state} = DATA_STATE;
1223     !!!next-input-character;
1224    
1225     !!!emit ($self->{ct}); # start tag or end tag
1226    
1227     redo A;
1228     } elsif ($self->{nc} == 0x002F) { # /
1229     !!!cp (122);
1230     $self->{state} = SELF_CLOSING_START_TAG_STATE;
1231     !!!next-input-character;
1232     redo A;
1233     } elsif ($self->{nc} == -1) {
         ## End of input inside the tag: report it and emit the token as
         ## it stands.
1234     !!!parse-error (type => 'unclosed tag');
1235     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1236     !!!cp (122.3);
1237     $self->{last_stag_name} = $self->{ct}->{tag_name};
1238     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
         ## Emitting an end tag switches the content model back to PCDATA,
         ## as every other end tag emission in the attribute states does;
         ## this branch used to omit the assignment.
         $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1239     if ($self->{ct}->{attributes}) {
1240     !!!cp (122.1);
1241     !!!parse-error (type => 'end tag attribute');
1242     } else {
1243     ## NOTE: This state should never be reached.
1244     !!!cp (122.2);
1245     }
1246     } else {
1247     die "$0: $self->{ct}->{type}: Unknown token type";
1248     }
1249     $self->{state} = DATA_STATE;
1250     ## Reconsume.
1251     !!!emit ($self->{ct}); # start tag or end tag
1252     redo A;
1253     } else {
1254     !!!cp ('124.1');
1255     !!!parse-error (type => 'no space between attributes');
1256     $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1257     ## reconsume
1258     redo A;
1259     }
1260     } elsif ($self->{state} == SELF_CLOSING_START_TAG_STATE) {
         ## Entered after a "/" inside a tag.  ">" completes the tag, the
         ## end of input aborts it, and any other character means that the
         ## "/" was stray: it is reported and the character is reprocessed
         ## as the start of an attribute.
1261     if ($self->{nc} == 0x003E) { # >
1262     if ($self->{ct}->{type} == END_TAG_TOKEN) {
1263     !!!cp ('124.2');
         ## A trailing "/" is not allowed on an end tag.
1264     !!!parse-error (type => 'nestc', token => $self->{ct});
1265     ## TODO: Different type than slash in start tag
1266     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1267     if ($self->{ct}->{attributes}) {
         ## NOTE(review): Checkpoint id '124.4' is also used by the final
         ## |else| branch of this state.
1268     !!!cp ('124.4');
1269     !!!parse-error (type => 'end tag attribute');
1270     } else {
         ## NOTE(review): Checkpoint id '124.5' is also used by the EOF
         ## branch of this state.
1271     !!!cp ('124.5');
1272     }
1273     ## TODO: Test |<title></title/>|
1274     } else {
1275     !!!cp ('124.3');
1276     $self->{self_closing} = 1;
         ## Remember the tag name exactly as every other place that emits
         ## a start tag does (including the EOF branch below); this branch
         ## used to leave |last_stag_name| at the previous start tag.
         $self->{last_stag_name} = $self->{ct}->{tag_name};
1277     }
1278    
1279     $self->{state} = DATA_STATE;
1280     !!!next-input-character;
1281    
1282     !!!emit ($self->{ct}); # start tag or end tag
1283    
1284     redo A;
1285     } elsif ($self->{nc} == -1) {
         ## End of input inside the tag: report it and emit the token as
         ## it stands.
1286     !!!parse-error (type => 'unclosed tag');
1287     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1288     !!!cp (124.7);
1289     $self->{last_stag_name} = $self->{ct}->{tag_name};
1290     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
         ## Emitting an end tag switches the content model back to PCDATA,
         ## as the ">" branch above does; this branch used to omit the
         ## assignment.
         $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1291     if ($self->{ct}->{attributes}) {
1292     !!!cp (124.5);
1293     !!!parse-error (type => 'end tag attribute');
1294     } else {
1295     ## NOTE: This state should never be reached.
1296     !!!cp (124.6);
1297     }
1298     } else {
1299     die "$0: $self->{ct}->{type}: Unknown token type";
1300     }
1301     $self->{state} = DATA_STATE;
1302     ## Reconsume.
1303     !!!emit ($self->{ct}); # start tag or end tag
1304     redo A;
1305     } else {
1306     !!!cp ('124.4');
1307     !!!parse-error (type => 'nestc');
1308     ## TODO: This error type is wrong.
1309     $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1310     ## Reconsume.
1311     redo A;
1312     }
1313     } elsif ($self->{state} == BOGUS_COMMENT_STATE) {
1314     ## (This state is only entered in the PCDATA content model.)
         ## Collects everything up to the next ">" (or the end of input)
         ## into the data of the comment token in |$self->{ct}| and then
         ## emits that token.
1315    
1316     ## NOTE: Unlike the spec's "bogus comment state", this
1317     ## implementation does not consume the whole comment in one step;
         ## the state is re-entered for the current character each time.
         ## NOTE(review): |read_until| (defined outside this section)
         ## presumably appends a further run of characters other than ">"
         ## in a single call -- confirm.
1318    
1319     if ($self->{nc} == 0x003E) { # >
1320     !!!cp (124);
1321     $self->{state} = DATA_STATE;
1322     !!!next-input-character;
1323    
1324     !!!emit ($self->{ct}); # comment
1325     redo A;
1326     } elsif ($self->{nc} == -1) {
1327     !!!cp (125);
         ## End of input: emit what has been collected.  No parse error is
         ## raised in this branch.
1328     $self->{state} = DATA_STATE;
1329     ## reconsume
1330    
1331     !!!emit ($self->{ct}); # comment
1332     redo A;
1333     } else {
1334     !!!cp (126);
1335     $self->{ct}->{data} .= chr ($self->{nc}); # comment
         ## The third argument is the current length of the data, i.e. the
         ## offset of its end.
1336     $self->{read_until}->($self->{ct}->{data},
1337     q[>],
1338     length $self->{ct}->{data});
1339    
1340     ## Stay in the state.
1341     !!!next-input-character;
1342     redo A;
1343     }
1344     } elsif ($self->{state} == MARKUP_DECLARATION_OPEN_STATE) {
1345     ## (only happen if PCDATA state)
1346    
1347     if ($self->{nc} == 0x002D) { # -
1348     !!!cp (133);
1349     $self->{state} = MD_HYPHEN_STATE;
1350     !!!next-input-character;
1351     redo A;
1352     } elsif ($self->{nc} == 0x0044 or # D
1353     $self->{nc} == 0x0064) { # d
1354     ## ASCII case-insensitive.
1355     !!!cp (130);
1356     $self->{state} = MD_DOCTYPE_STATE;
1357     $self->{s_kwd} = chr $self->{nc};
1358     !!!next-input-character;
1359     redo A;
1360     } elsif ($self->{insertion_mode} & IN_FOREIGN_CONTENT_IM and
1361     $self->{open_elements}->[-1]->[1] & FOREIGN_EL and
1362     $self->{nc} == 0x005B) { # [
1363     !!!cp (135.4);
1364     $self->{state} = MD_CDATA_STATE;
1365     $self->{s_kwd} = '[';
1366     !!!next-input-character;
1367     redo A;
1368     } else {
1369     !!!cp (136);
1370     }
1371    
1372     !!!parse-error (type => 'bogus comment',
1373     line => $self->{line_prev},
1374     column => $self->{column_prev} - 1);
1375     ## Reconsume.
1376     $self->{state} = BOGUS_COMMENT_STATE;
1377     $self->{ct} = {type => COMMENT_TOKEN, data => '',
1378     line => $self->{line_prev},
1379     column => $self->{column_prev} - 1,
1380     };
1381     redo A;
1382     } elsif ($self->{state} == MD_HYPHEN_STATE) {
1383     if ($self->{nc} == 0x002D) { # -
1384     !!!cp (127);
1385     $self->{ct} = {type => COMMENT_TOKEN, data => '',
1386     line => $self->{line_prev},
1387     column => $self->{column_prev} - 2,
1388     };
1389     $self->{state} = COMMENT_START_STATE;
1390     !!!next-input-character;
1391     redo A;
1392     } else {
1393     !!!cp (128);
1394     !!!parse-error (type => 'bogus comment',
1395     line => $self->{line_prev},
1396     column => $self->{column_prev} - 2);
1397     $self->{state} = BOGUS_COMMENT_STATE;
1398     ## Reconsume.
1399     $self->{ct} = {type => COMMENT_TOKEN,
1400     data => '-',
1401     line => $self->{line_prev},
1402     column => $self->{column_prev} - 2,
1403     };
1404     redo A;
1405     }
1406     } elsif ($self->{state} == MD_DOCTYPE_STATE) {
         ## Matches the rest of the keyword "DOCTYPE" after "<!".
         ## |$self->{s_kwd}| holds the part matched so far; it starts as
         ## the "D" or "d" stored by the markup declaration open state.
1407     ## ASCII case-insensitive.
         ## The two lists below are indexed by the number of characters
         ## matched so far and give the upper-case and lower-case code
         ## points that may come next.  Index 0 is never used because
         ## |s_kwd| is never empty here; an index past the end of a list
         ## yields |undef|.
1408     if ($self->{nc} == [
1409     undef,
1410     0x004F, # O
1411     0x0043, # C
1412     0x0054, # T
1413     0x0059, # Y
1414     0x0050, # P
1415     ]->[length $self->{s_kwd}] or
1416     $self->{nc} == [
1417     undef,
1418     0x006F, # o
1419     0x0063, # c
1420     0x0074, # t
1421     0x0079, # y
1422     0x0070, # p
1423     ]->[length $self->{s_kwd}]) {
1424     !!!cp (131);
1425     ## Stay in the state.
1426     $self->{s_kwd} .= chr $self->{nc};
1427     !!!next-input-character;
1428     redo A;
1429     } elsif ((length $self->{s_kwd}) == 6 and
1430     ($self->{nc} == 0x0045 or # E
1431     $self->{nc} == 0x0065)) { # e
1432     !!!cp (129);
         ## "DOCTYP" has been matched and this is the final "E": start a
         ## DOCTYPE token.  The |quirks| flag is set initially and is
         ## deleted again once a DOCTYPE name has been seen.
         ## NOTE(review): |column_prev - 7| presumably points back at the
         ## "<" that opened the declaration -- confirm.
1433     $self->{state} = DOCTYPE_STATE;
1434     $self->{ct} = {type => DOCTYPE_TOKEN,
1435     quirks => 1,
1436     line => $self->{line_prev},
1437     column => $self->{column_prev} - 7,
1438     };
1439     !!!next-input-character;
1440     redo A;
1441     } else {
1442     !!!cp (132);
         ## Not "DOCTYPE": treat the declaration as a bogus comment whose
         ## data starts with the characters matched so far.
1443     !!!parse-error (type => 'bogus comment',
1444     line => $self->{line_prev},
1445     column => $self->{column_prev} - 1 - length $self->{s_kwd});
1446     $self->{state} = BOGUS_COMMENT_STATE;
1447     ## Reconsume.
1448     $self->{ct} = {type => COMMENT_TOKEN,
1449     data => $self->{s_kwd},
1450     line => $self->{line_prev},
1451     column => $self->{column_prev} - 1 - length $self->{s_kwd},
1452     };
1453     redo A;
1454     }
1455     } elsif ($self->{state} == MD_CDATA_STATE) {
1456     if ($self->{nc} == {
1457     '[' => 0x0043, # C
1458     '[C' => 0x0044, # D
1459     '[CD' => 0x0041, # A
1460     '[CDA' => 0x0054, # T
1461     '[CDAT' => 0x0041, # A
1462     }->{$self->{s_kwd}}) {
1463     !!!cp (135.1);
1464     ## Stay in the state.
1465     $self->{s_kwd} .= chr $self->{nc};
1466     !!!next-input-character;
1467     redo A;
1468     } elsif ($self->{s_kwd} eq '[CDATA' and
1469     $self->{nc} == 0x005B) { # [
1470     !!!cp (135.2);
1471     $self->{ct} = {type => CHARACTER_TOKEN,
1472     data => '',
1473     line => $self->{line_prev},
1474     column => $self->{column_prev} - 7};
1475     $self->{state} = CDATA_SECTION_STATE;
1476     !!!next-input-character;
1477     redo A;
1478     } else {
1479     !!!cp (135.3);
1480     !!!parse-error (type => 'bogus comment',
1481     line => $self->{line_prev},
1482     column => $self->{column_prev} - 1 - length $self->{s_kwd});
1483     $self->{state} = BOGUS_COMMENT_STATE;
1484     ## Reconsume.
1485     $self->{ct} = {type => COMMENT_TOKEN,
1486     data => $self->{s_kwd},
1487     line => $self->{line_prev},
1488     column => $self->{column_prev} - 1 - length $self->{s_kwd},
1489     };
1490     redo A;
1491     }
1492     } elsif ($self->{state} == COMMENT_START_STATE) {
1493     if ($self->{nc} == 0x002D) { # -
1494     !!!cp (137);
1495     $self->{state} = COMMENT_START_DASH_STATE;
1496     !!!next-input-character;
1497     redo A;
1498     } elsif ($self->{nc} == 0x003E) { # >
1499     !!!cp (138);
1500     !!!parse-error (type => 'bogus comment');
1501     $self->{state} = DATA_STATE;
1502     !!!next-input-character;
1503    
1504     !!!emit ($self->{ct}); # comment
1505    
1506     redo A;
1507     } elsif ($self->{nc} == -1) {
1508     !!!cp (139);
1509     !!!parse-error (type => 'unclosed comment');
1510     $self->{state} = DATA_STATE;
1511     ## reconsume
1512    
1513     !!!emit ($self->{ct}); # comment
1514    
1515     redo A;
1516     } else {
1517     !!!cp (140);
1518     $self->{ct}->{data} # comment
1519     .= chr ($self->{nc});
1520     $self->{state} = COMMENT_STATE;
1521     !!!next-input-character;
1522     redo A;
1523     }
1524     } elsif ($self->{state} == COMMENT_START_DASH_STATE) {
1525     if ($self->{nc} == 0x002D) { # -
1526     !!!cp (141);
1527     $self->{state} = COMMENT_END_STATE;
1528     !!!next-input-character;
1529     redo A;
1530     } elsif ($self->{nc} == 0x003E) { # >
1531     !!!cp (142);
1532     !!!parse-error (type => 'bogus comment');
1533     $self->{state} = DATA_STATE;
1534     !!!next-input-character;
1535    
1536     !!!emit ($self->{ct}); # comment
1537    
1538     redo A;
1539     } elsif ($self->{nc} == -1) {
1540     !!!cp (143);
1541     !!!parse-error (type => 'unclosed comment');
1542     $self->{state} = DATA_STATE;
1543     ## reconsume
1544    
1545     !!!emit ($self->{ct}); # comment
1546    
1547     redo A;
1548     } else {
1549     !!!cp (144);
1550     $self->{ct}->{data} # comment
1551     .= '-' . chr ($self->{nc});
1552     $self->{state} = COMMENT_STATE;
1553     !!!next-input-character;
1554     redo A;
1555     }
1556     } elsif ($self->{state} == COMMENT_STATE) {
1557     if ($self->{nc} == 0x002D) { # -
1558     !!!cp (145);
1559     $self->{state} = COMMENT_END_DASH_STATE;
1560     !!!next-input-character;
1561     redo A;
1562     } elsif ($self->{nc} == -1) {
1563     !!!cp (146);
1564     !!!parse-error (type => 'unclosed comment');
1565     $self->{state} = DATA_STATE;
1566     ## reconsume
1567    
1568     !!!emit ($self->{ct}); # comment
1569    
1570     redo A;
1571     } else {
1572     !!!cp (147);
1573     $self->{ct}->{data} .= chr ($self->{nc}); # comment
1574     $self->{read_until}->($self->{ct}->{data},
1575     q[-],
1576     length $self->{ct}->{data});
1577    
1578     ## Stay in the state
1579     !!!next-input-character;
1580     redo A;
1581     }
1582     } elsif ($self->{state} == COMMENT_END_DASH_STATE) {
1583     if ($self->{nc} == 0x002D) { # -
1584     !!!cp (148);
1585     $self->{state} = COMMENT_END_STATE;
1586     !!!next-input-character;
1587     redo A;
1588     } elsif ($self->{nc} == -1) {
1589     !!!cp (149);
1590     !!!parse-error (type => 'unclosed comment');
1591     $self->{state} = DATA_STATE;
1592     ## reconsume
1593    
1594     !!!emit ($self->{ct}); # comment
1595    
1596     redo A;
1597     } else {
1598     !!!cp (150);
1599     $self->{ct}->{data} .= '-' . chr ($self->{nc}); # comment
1600     $self->{state} = COMMENT_STATE;
1601     !!!next-input-character;
1602     redo A;
1603     }
1604     } elsif ($self->{state} == COMMENT_END_STATE) {
1605     if ($self->{nc} == 0x003E) { # >
1606     !!!cp (151);
1607     $self->{state} = DATA_STATE;
1608     !!!next-input-character;
1609    
1610     !!!emit ($self->{ct}); # comment
1611    
1612     redo A;
1613     } elsif ($self->{nc} == 0x002D) { # -
1614     !!!cp (152);
1615     !!!parse-error (type => 'dash in comment',
1616     line => $self->{line_prev},
1617     column => $self->{column_prev});
1618     $self->{ct}->{data} .= '-'; # comment
1619     ## Stay in the state
1620     !!!next-input-character;
1621     redo A;
1622     } elsif ($self->{nc} == -1) {
1623     !!!cp (153);
1624     !!!parse-error (type => 'unclosed comment');
1625     $self->{state} = DATA_STATE;
1626     ## reconsume
1627    
1628     !!!emit ($self->{ct}); # comment
1629    
1630     redo A;
1631     } else {
1632     !!!cp (154);
1633     !!!parse-error (type => 'dash in comment',
1634     line => $self->{line_prev},
1635     column => $self->{column_prev});
1636     $self->{ct}->{data} .= '--' . chr ($self->{nc}); # comment
1637     $self->{state} = COMMENT_STATE;
1638     !!!next-input-character;
1639     redo A;
1640     }
1641     } elsif ($self->{state} == DOCTYPE_STATE) {
1642     if ($is_space->{$self->{nc}}) {
1643     !!!cp (155);
1644     $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
1645     !!!next-input-character;
1646     redo A;
1647     } else {
1648     !!!cp (156);
1649     !!!parse-error (type => 'no space before DOCTYPE name');
1650     $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
1651     ## reconsume
1652     redo A;
1653     }
1654     } elsif ($self->{state} == BEFORE_DOCTYPE_NAME_STATE) {
1655     if ($is_space->{$self->{nc}}) {
1656     !!!cp (157);
1657     ## Stay in the state
1658     !!!next-input-character;
1659     redo A;
1660     } elsif ($self->{nc} == 0x003E) { # >
1661     !!!cp (158);
1662     !!!parse-error (type => 'no DOCTYPE name');
1663     $self->{state} = DATA_STATE;
1664     !!!next-input-character;
1665    
1666     !!!emit ($self->{ct}); # DOCTYPE (quirks)
1667    
1668     redo A;
1669     } elsif ($self->{nc} == -1) {
1670     !!!cp (159);
1671     !!!parse-error (type => 'no DOCTYPE name');
1672     $self->{state} = DATA_STATE;
1673     ## reconsume
1674    
1675     !!!emit ($self->{ct}); # DOCTYPE (quirks)
1676    
1677     redo A;
1678     } else {
1679     !!!cp (160);
1680     $self->{ct}->{name} = chr $self->{nc};
1681     delete $self->{ct}->{quirks};
1682     $self->{state} = DOCTYPE_NAME_STATE;
1683     !!!next-input-character;
1684     redo A;
1685     }
1686     } elsif ($self->{state} == DOCTYPE_NAME_STATE) {
1687     ## ISSUE: Redundant "First," in the spec.
1688     if ($is_space->{$self->{nc}}) {
1689     !!!cp (161);
1690     $self->{state} = AFTER_DOCTYPE_NAME_STATE;
1691     !!!next-input-character;
1692     redo A;
1693     } elsif ($self->{nc} == 0x003E) { # >
1694     !!!cp (162);
1695     $self->{state} = DATA_STATE;
1696     !!!next-input-character;
1697    
1698     !!!emit ($self->{ct}); # DOCTYPE
1699    
1700     redo A;
1701     } elsif ($self->{nc} == -1) {
1702     !!!cp (163);
1703     !!!parse-error (type => 'unclosed DOCTYPE');
1704     $self->{state} = DATA_STATE;
1705     ## reconsume
1706    
1707     $self->{ct}->{quirks} = 1;
1708     !!!emit ($self->{ct}); # DOCTYPE
1709    
1710     redo A;
1711     } else {
1712     !!!cp (164);
1713     $self->{ct}->{name}
1714     .= chr ($self->{nc}); # DOCTYPE
1715     ## Stay in the state
1716     !!!next-input-character;
1717     redo A;
1718     }
1719     } elsif ($self->{state} == AFTER_DOCTYPE_NAME_STATE) {
1720     if ($is_space->{$self->{nc}}) {
1721     !!!cp (165);
1722     ## Stay in the state
1723     !!!next-input-character;
1724     redo A;
1725     } elsif ($self->{nc} == 0x003E) { # >
1726     !!!cp (166);
1727     $self->{state} = DATA_STATE;
1728     !!!next-input-character;
1729    
1730     !!!emit ($self->{ct}); # DOCTYPE
1731    
1732     redo A;
1733     } elsif ($self->{nc} == -1) {
1734     !!!cp (167);
1735     !!!parse-error (type => 'unclosed DOCTYPE');
1736     $self->{state} = DATA_STATE;
1737     ## reconsume
1738    
1739     $self->{ct}->{quirks} = 1;
1740     !!!emit ($self->{ct}); # DOCTYPE
1741    
1742     redo A;
1743     } elsif ($self->{nc} == 0x0050 or # P
1744     $self->{nc} == 0x0070) { # p
1745     $self->{state} = PUBLIC_STATE;
1746     $self->{s_kwd} = chr $self->{nc};
1747     !!!next-input-character;
1748     redo A;
1749     } elsif ($self->{nc} == 0x0053 or # S
1750     $self->{nc} == 0x0073) { # s
1751     $self->{state} = SYSTEM_STATE;
1752     $self->{s_kwd} = chr $self->{nc};
1753     !!!next-input-character;
1754     redo A;
1755     } else {
1756     !!!cp (180);
1757     !!!parse-error (type => 'string after DOCTYPE name');
1758     $self->{ct}->{quirks} = 1;
1759    
1760     $self->{state} = BOGUS_DOCTYPE_STATE;
1761     !!!next-input-character;
1762     redo A;
1763     }
1764     } elsif ($self->{state} == PUBLIC_STATE) {
1765     ## ASCII case-insensitive
1766     if ($self->{nc} == [
1767     undef,
1768     0x0055, # U
1769     0x0042, # B
1770     0x004C, # L
1771     0x0049, # I
1772     ]->[length $self->{s_kwd}] or
1773     $self->{nc} == [
1774     undef,
1775     0x0075, # u
1776     0x0062, # b
1777     0x006C, # l
1778     0x0069, # i
1779     ]->[length $self->{s_kwd}]) {
1780     !!!cp (175);
1781     ## Stay in the state.
1782     $self->{s_kwd} .= chr $self->{nc};
1783     !!!next-input-character;
1784     redo A;
1785     } elsif ((length $self->{s_kwd}) == 5 and
1786     ($self->{nc} == 0x0043 or # C
1787     $self->{nc} == 0x0063)) { # c
1788     !!!cp (168);
1789     $self->{state} = BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
1790     !!!next-input-character;
1791     redo A;
1792     } else {
1793     !!!cp (169);
1794     !!!parse-error (type => 'string after DOCTYPE name',
1795     line => $self->{line_prev},
1796     column => $self->{column_prev} + 1 - length $self->{s_kwd});
1797     $self->{ct}->{quirks} = 1;
1798    
1799     $self->{state} = BOGUS_DOCTYPE_STATE;
1800     ## Reconsume.
1801     redo A;
1802     }
1803     } elsif ($self->{state} == SYSTEM_STATE) {
1804     ## ASCII case-insensitive
1805     if ($self->{nc} == [
1806     undef,
1807     0x0059, # Y
1808     0x0053, # S
1809     0x0054, # T
1810     0x0045, # E
1811     ]->[length $self->{s_kwd}] or
1812     $self->{nc} == [
1813     undef,
1814     0x0079, # y
1815     0x0073, # s
1816     0x0074, # t
1817     0x0065, # e
1818     ]->[length $self->{s_kwd}]) {
1819     !!!cp (170);
1820     ## Stay in the state.
1821     $self->{s_kwd} .= chr $self->{nc};
1822     !!!next-input-character;
1823     redo A;
1824     } elsif ((length $self->{s_kwd}) == 5 and
1825     ($self->{nc} == 0x004D or # M
1826     $self->{nc} == 0x006D)) { # m
1827     !!!cp (171);
1828     $self->{state} = BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
1829     !!!next-input-character;
1830     redo A;
1831     } else {
1832     !!!cp (172);
1833     !!!parse-error (type => 'string after DOCTYPE name',
1834     line => $self->{line_prev},
1835     column => $self->{column_prev} + 1 - length $self->{s_kwd});
1836     $self->{ct}->{quirks} = 1;
1837    
1838     $self->{state} = BOGUS_DOCTYPE_STATE;
1839     ## Reconsume.
1840     redo A;
1841     }
1842     } elsif ($self->{state} == BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE) {
         ## Entered once the keyword "PUBLIC" has been matched.  White
         ## space is skipped; a quotation mark or an apostrophe opens the
         ## public identifier; anything else is an error.
         ## (Code points are numbers, so they are compared with |==| as in
         ## the other states; three tests here used the string operator
         ## |eq|.)
1843     if ($is_space->{$self->{nc}}) {
1844     !!!cp (181);
1845     ## Stay in the state
1846     !!!next-input-character;
1847     redo A;
1848     } elsif ($self->{nc} == 0x0022) { # "
1849     !!!cp (182);
1850     $self->{ct}->{pubid} = ''; # DOCTYPE
1851     $self->{state} = DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE;
1852     !!!next-input-character;
1853     redo A;
1854     } elsif ($self->{nc} == 0x0027) { # '
1855     !!!cp (183);
1856     $self->{ct}->{pubid} = ''; # DOCTYPE
1857     $self->{state} = DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE;
1858     !!!next-input-character;
1859     redo A;
1860     } elsif ($self->{nc} == 0x003E) { # >
1861     !!!cp (184);
         ## The DOCTYPE ends without a public identifier: emit it with the
         ## |quirks| flag set.
1862     !!!parse-error (type => 'no PUBLIC literal');
1863    
1864     $self->{state} = DATA_STATE;
1865     !!!next-input-character;
1866    
1867     $self->{ct}->{quirks} = 1;
1868     !!!emit ($self->{ct}); # DOCTYPE
1869    
1870     redo A;
1871     } elsif ($self->{nc} == -1) {
1872     !!!cp (185);
1873     !!!parse-error (type => 'unclosed DOCTYPE');
1874    
1875     $self->{state} = DATA_STATE;
1876     ## reconsume
1877    
1878     $self->{ct}->{quirks} = 1;
1879     !!!emit ($self->{ct}); # DOCTYPE
1880    
1881     redo A;
1882     } else {
1883     !!!cp (186);
         ## Unexpected character: set the |quirks| flag and hand the rest
         ## of the declaration to the bogus DOCTYPE state.
1884     !!!parse-error (type => 'string after PUBLIC');
1885     $self->{ct}->{quirks} = 1;
1886    
1887     $self->{state} = BOGUS_DOCTYPE_STATE;
1888     !!!next-input-character;
1889     redo A;
1890     }
1891     } elsif ($self->{state} == DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE) {
1892     if ($self->{nc} == 0x0022) { # "
1893     !!!cp (187);
1894     $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
1895     !!!next-input-character;
1896     redo A;
1897     } elsif ($self->{nc} == 0x003E) { # >
1898     !!!cp (188);
1899     !!!parse-error (type => 'unclosed PUBLIC literal');
1900    
1901     $self->{state} = DATA_STATE;
1902     !!!next-input-character;
1903    
1904     $self->{ct}->{quirks} = 1;
1905     !!!emit ($self->{ct}); # DOCTYPE
1906    
1907     redo A;
1908     } elsif ($self->{nc} == -1) {
1909     !!!cp (189);
1910     !!!parse-error (type => 'unclosed PUBLIC literal');
1911    
1912     $self->{state} = DATA_STATE;
1913     ## reconsume
1914    
1915     $self->{ct}->{quirks} = 1;
1916     !!!emit ($self->{ct}); # DOCTYPE
1917    
1918     redo A;
1919     } else {
1920     !!!cp (190);
1921     $self->{ct}->{pubid} # DOCTYPE
1922     .= chr $self->{nc};
1923     $self->{read_until}->($self->{ct}->{pubid}, q[">],
1924     length $self->{ct}->{pubid});
1925    
1926     ## Stay in the state
1927     !!!next-input-character;
1928     redo A;
1929     }
1930     } elsif ($self->{state} == DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE) {
1931     if ($self->{nc} == 0x0027) { # '
1932     !!!cp (191);
1933     $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
1934     !!!next-input-character;
1935     redo A;
1936     } elsif ($self->{nc} == 0x003E) { # >
1937     !!!cp (192);
1938     !!!parse-error (type => 'unclosed PUBLIC literal');
1939    
1940     $self->{state} = DATA_STATE;
1941     !!!next-input-character;
1942    
1943     $self->{ct}->{quirks} = 1;
1944     !!!emit ($self->{ct}); # DOCTYPE
1945    
1946     redo A;
1947     } elsif ($self->{nc} == -1) {
1948     !!!cp (193);
1949     !!!parse-error (type => 'unclosed PUBLIC literal');
1950    
1951     $self->{state} = DATA_STATE;
1952     ## reconsume
1953    
1954     $self->{ct}->{quirks} = 1;
1955     !!!emit ($self->{ct}); # DOCTYPE
1956    
1957     redo A;
1958     } else {
1959     !!!cp (194);
1960     $self->{ct}->{pubid} # DOCTYPE
1961     .= chr $self->{nc};
1962     $self->{read_until}->($self->{ct}->{pubid}, q['>],
1963     length $self->{ct}->{pubid});
1964    
1965     ## Stay in the state
1966     !!!next-input-character;
1967     redo A;
1968     }
1969     } elsif ($self->{state} == AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE) {
1970     if ($is_space->{$self->{nc}}) {
1971     !!!cp (195);
1972     ## Stay in the state
1973     !!!next-input-character;
1974     redo A;
1975     } elsif ($self->{nc} == 0x0022) { # "
1976     !!!cp (196);
1977     $self->{ct}->{sysid} = ''; # DOCTYPE
1978     $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
1979     !!!next-input-character;
1980     redo A;
1981     } elsif ($self->{nc} == 0x0027) { # '
1982     !!!cp (197);
1983     $self->{ct}->{sysid} = ''; # DOCTYPE
1984     $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
1985     !!!next-input-character;
1986     redo A;
1987     } elsif ($self->{nc} == 0x003E) { # >
1988     !!!cp (198);
1989     $self->{state} = DATA_STATE;
1990     !!!next-input-character;
1991    
1992     !!!emit ($self->{ct}); # DOCTYPE
1993    
1994     redo A;
1995     } elsif ($self->{nc} == -1) {
1996     !!!cp (199);
1997     !!!parse-error (type => 'unclosed DOCTYPE');
1998    
1999     $self->{state} = DATA_STATE;
2000     ## reconsume
2001    
2002     $self->{ct}->{quirks} = 1;
2003     !!!emit ($self->{ct}); # DOCTYPE
2004    
2005     redo A;
2006     } else {
2007     !!!cp (200);
2008     !!!parse-error (type => 'string after PUBLIC literal');
2009     $self->{ct}->{quirks} = 1;
2010    
2011     $self->{state} = BOGUS_DOCTYPE_STATE;
2012     !!!next-input-character;
2013     redo A;
2014     }
2015     } elsif ($self->{state} == BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
2016     if ($is_space->{$self->{nc}}) {
2017     !!!cp (201);
2018     ## Stay in the state
2019     !!!next-input-character;
2020     redo A;
2021     } elsif ($self->{nc} == 0x0022) { # "
2022     !!!cp (202);
2023     $self->{ct}->{sysid} = ''; # DOCTYPE
2024     $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
2025     !!!next-input-character;
2026     redo A;
2027     } elsif ($self->{nc} == 0x0027) { # '
2028     !!!cp (203);
2029     $self->{ct}->{sysid} = ''; # DOCTYPE
2030     $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
2031     !!!next-input-character;
2032     redo A;
2033     } elsif ($self->{nc} == 0x003E) { # >
2034     !!!cp (204);
2035     !!!parse-error (type => 'no SYSTEM literal');
2036     $self->{state} = DATA_STATE;
2037     !!!next-input-character;
2038    
2039     $self->{ct}->{quirks} = 1;
2040     !!!emit ($self->{ct}); # DOCTYPE
2041    
2042     redo A;
2043     } elsif ($self->{nc} == -1) {
2044     !!!cp (205);
2045     !!!parse-error (type => 'unclosed DOCTYPE');
2046    
2047     $self->{state} = DATA_STATE;
2048     ## reconsume
2049    
2050     $self->{ct}->{quirks} = 1;
2051     !!!emit ($self->{ct}); # DOCTYPE
2052    
2053     redo A;
2054     } else {
2055     !!!cp (206);
2056     !!!parse-error (type => 'string after SYSTEM');
2057     $self->{ct}->{quirks} = 1;
2058    
2059     $self->{state} = BOGUS_DOCTYPE_STATE;
2060     !!!next-input-character;
2061     redo A;
2062     }
2063     } elsif ($self->{state} == DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE) {
2064     if ($self->{nc} == 0x0022) { # "
2065     !!!cp (207);
2066     $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
2067     !!!next-input-character;
2068     redo A;
2069     } elsif ($self->{nc} == 0x003E) { # >
2070     !!!cp (208);
2071     !!!parse-error (type => 'unclosed SYSTEM literal');
2072    
2073     $self->{state} = DATA_STATE;
2074     !!!next-input-character;
2075    
2076     $self->{ct}->{quirks} = 1;
2077     !!!emit ($self->{ct}); # DOCTYPE
2078    
2079     redo A;
2080     } elsif ($self->{nc} == -1) {
2081     !!!cp (209);
2082     !!!parse-error (type => 'unclosed SYSTEM literal');
2083    
2084     $self->{state} = DATA_STATE;
2085     ## reconsume
2086    
2087     $self->{ct}->{quirks} = 1;
2088     !!!emit ($self->{ct}); # DOCTYPE
2089    
2090     redo A;
2091     } else {
2092     !!!cp (210);
2093     $self->{ct}->{sysid} # DOCTYPE
2094     .= chr $self->{nc};
2095     $self->{read_until}->($self->{ct}->{sysid}, q[">],
2096     length $self->{ct}->{sysid});
2097    
2098     ## Stay in the state
2099     !!!next-input-character;
2100     redo A;
2101     }
2102     } elsif ($self->{state} == DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE) {
2103     if ($self->{nc} == 0x0027) { # '
2104     !!!cp (211);
2105     $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
2106     !!!next-input-character;
2107     redo A;
2108     } elsif ($self->{nc} == 0x003E) { # >
2109     !!!cp (212);
2110     !!!parse-error (type => 'unclosed SYSTEM literal');
2111    
2112     $self->{state} = DATA_STATE;
2113     !!!next-input-character;
2114    
2115     $self->{ct}->{quirks} = 1;
2116     !!!emit ($self->{ct}); # DOCTYPE
2117    
2118     redo A;
2119     } elsif ($self->{nc} == -1) {
2120     !!!cp (213);
2121     !!!parse-error (type => 'unclosed SYSTEM literal');
2122    
2123     $self->{state} = DATA_STATE;
2124     ## reconsume
2125    
2126     $self->{ct}->{quirks} = 1;
2127     !!!emit ($self->{ct}); # DOCTYPE
2128    
2129     redo A;
2130     } else {
2131     !!!cp (214);
2132     $self->{ct}->{sysid} # DOCTYPE
2133     .= chr $self->{nc};
2134     $self->{read_until}->($self->{ct}->{sysid}, q['>],
2135     length $self->{ct}->{sysid});
2136    
2137     ## Stay in the state
2138     !!!next-input-character;
2139     redo A;
2140     }
2141     } elsif ($self->{state} == AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
2142     if ($is_space->{$self->{nc}}) {
2143     !!!cp (215);
2144     ## Stay in the state
2145     !!!next-input-character;
2146     redo A;
2147     } elsif ($self->{nc} == 0x003E) { # >
2148     !!!cp (216);
2149     $self->{state} = DATA_STATE;
2150     !!!next-input-character;
2151    
2152     !!!emit ($self->{ct}); # DOCTYPE
2153    
2154     redo A;
2155     } elsif ($self->{nc} == -1) {
2156     !!!cp (217);
2157     !!!parse-error (type => 'unclosed DOCTYPE');
2158     $self->{state} = DATA_STATE;
2159     ## reconsume
2160    
2161     $self->{ct}->{quirks} = 1;
2162     !!!emit ($self->{ct}); # DOCTYPE
2163    
2164     redo A;
2165     } else {
2166     !!!cp (218);
2167     !!!parse-error (type => 'string after SYSTEM literal');
2168     #$self->{ct}->{quirks} = 1;
2169    
2170     $self->{state} = BOGUS_DOCTYPE_STATE;
2171     !!!next-input-character;
2172     redo A;
2173     }
2174     } elsif ($self->{state} == BOGUS_DOCTYPE_STATE) {
2175     if ($self->{nc} == 0x003E) { # >
2176     !!!cp (219);
2177     $self->{state} = DATA_STATE;
2178     !!!next-input-character;
2179    
2180     !!!emit ($self->{ct}); # DOCTYPE
2181    
2182     redo A;
2183     } elsif ($self->{nc} == -1) {
2184     !!!cp (220);
2185     $self->{state} = DATA_STATE;
2186     ## reconsume
2187    
2188     !!!emit ($self->{ct}); # DOCTYPE
2189    
2190     redo A;
2191     } else {
2192     !!!cp (221);
2193     my $s = '';
2194     $self->{read_until}->($s, q[>], 0);
2195    
2196     ## Stay in the state
2197     !!!next-input-character;
2198     redo A;
2199     }
2200     } elsif ($self->{state} == CDATA_SECTION_STATE) {
2201     ## NOTE: The "CDATA section state" in the spec is jointly implemented
2202     ## by three states, |CDATA_SECTION_STATE|, |CDATA_SECTION_MSE1_STATE|,
2203     ## and |CDATA_SECTION_MSE2_STATE|.
2204    
2205     if ($self->{nc} == 0x005D) { # ]
2206     !!!cp (221.1);
2207     $self->{state} = CDATA_SECTION_MSE1_STATE;
2208     !!!next-input-character;
2209     redo A;
2210     } elsif ($self->{nc} == -1) {
2211     $self->{state} = DATA_STATE;
2212     !!!next-input-character;
2213     if (length $self->{ct}->{data}) { # character
2214     !!!cp (221.2);
2215     !!!emit ($self->{ct}); # character
2216     } else {
2217     !!!cp (221.3);
2218     ## No token to emit. $self->{ct} is discarded.
2219     }
2220     redo A;
2221     } else {
2222     !!!cp (221.4);
2223     $self->{ct}->{data} .= chr $self->{nc};
2224     $self->{read_until}->($self->{ct}->{data},
2225     q<]>,
2226     length $self->{ct}->{data});
2227    
2228     ## Stay in the state.
2229     !!!next-input-character;
2230     redo A;
2231     }
2232    
2233     ## ISSUE: "text tokens" in spec.
2234     } elsif ($self->{state} == CDATA_SECTION_MSE1_STATE) {
2235     if ($self->{nc} == 0x005D) { # ]
2236     !!!cp (221.5);
2237     $self->{state} = CDATA_SECTION_MSE2_STATE;
2238     !!!next-input-character;
2239     redo A;
2240     } else {
2241     !!!cp (221.6);
2242     $self->{ct}->{data} .= ']';
2243     $self->{state} = CDATA_SECTION_STATE;
2244     ## Reconsume.
2245     redo A;
2246     }
2247     } elsif ($self->{state} == CDATA_SECTION_MSE2_STATE) {
2248     if ($self->{nc} == 0x003E) { # >
2249     $self->{state} = DATA_STATE;
2250     !!!next-input-character;
2251     if (length $self->{ct}->{data}) { # character
2252     !!!cp (221.7);
2253     !!!emit ($self->{ct}); # character
2254     } else {
2255     !!!cp (221.8);
2256     ## No token to emit. $self->{ct} is discarded.
2257     }
2258     redo A;
2259     } elsif ($self->{nc} == 0x005D) { # ]
2260     !!!cp (221.9); # character
2261     $self->{ct}->{data} .= ']'; ## Add first "]" of "]]]".
2262     ## Stay in the state.
2263     !!!next-input-character;
2264     redo A;
2265     } else {
2266     !!!cp (221.11);
2267     $self->{ct}->{data} .= ']]'; # character
2268     $self->{state} = CDATA_SECTION_STATE;
2269     ## Reconsume.
2270     redo A;
2271     }
2272     } elsif ($self->{state} == ENTITY_STATE) {
2273     if ($is_space->{$self->{nc}} or
2274     {
2275     0x003C => 1, 0x0026 => 1, -1 => 1, # <, &
2276     $self->{entity_add} => 1,
2277     }->{$self->{nc}}) {
2278     !!!cp (1001);
2279     ## Don't consume
2280     ## No error
2281     ## Return nothing.
2282     #
2283     } elsif ($self->{nc} == 0x0023) { # #
2284     !!!cp (999);
2285     $self->{state} = ENTITY_HASH_STATE;
2286     $self->{s_kwd} = '#';
2287     !!!next-input-character;
2288     redo A;
2289     } elsif ((0x0041 <= $self->{nc} and
2290     $self->{nc} <= 0x005A) or # A..Z
2291     (0x0061 <= $self->{nc} and
2292     $self->{nc} <= 0x007A)) { # a..z
2293     !!!cp (998);
2294     require Whatpm::_NamedEntityList;
2295     $self->{state} = ENTITY_NAME_STATE;
2296     $self->{s_kwd} = chr $self->{nc};
2297     $self->{entity__value} = $self->{s_kwd};
2298     $self->{entity__match} = 0;
2299     !!!next-input-character;
2300     redo A;
2301     } else {
2302     !!!cp (1027);
2303     !!!parse-error (type => 'bare ero');
2304     ## Return nothing.
2305     #
2306     }
2307    
2308     ## NOTE: No character is consumed by the "consume a character
2309     ## reference" algorithm. In other words, there is an "&" character
2310     ## that does not introduce a character reference, which will be
2311     ## appended to the parent element or the attribute value in later
2312     ## processing by the tokenizer.
2313    
2314     if ($self->{prev_state} == DATA_STATE) {
2315     !!!cp (997);
2316     $self->{state} = $self->{prev_state};
2317     ## Reconsume.
2318     !!!emit ({type => CHARACTER_TOKEN, data => '&',
2319     line => $self->{line_prev},
2320     column => $self->{column_prev},
2321     });
2322     redo A;
2323     } else {
2324     !!!cp (996);
2325     $self->{ca}->{value} .= '&';
2326     $self->{state} = $self->{prev_state};
2327     ## Reconsume.
2328     redo A;
2329     }
2330     } elsif ($self->{state} == ENTITY_HASH_STATE) {
2331     if ($self->{nc} == 0x0078 or # x
2332     $self->{nc} == 0x0058) { # X
2333     !!!cp (995);
2334     $self->{state} = HEXREF_X_STATE;
2335     $self->{s_kwd} .= chr $self->{nc};
2336     !!!next-input-character;
2337     redo A;
2338     } elsif (0x0030 <= $self->{nc} and
2339     $self->{nc} <= 0x0039) { # 0..9
2340     !!!cp (994);
2341     $self->{state} = NCR_NUM_STATE;
2342     $self->{s_kwd} = $self->{nc} - 0x0030;
2343     !!!next-input-character;
2344     redo A;
2345     } else {
2346     !!!parse-error (type => 'bare nero',
2347     line => $self->{line_prev},
2348     column => $self->{column_prev} - 1);
2349    
2350     ## NOTE: According to the spec algorithm, nothing is returned,
2351     ## and then "&#" is appended to the parent element or the attribute
2352     ## value in the later processing.
2353    
2354     if ($self->{prev_state} == DATA_STATE) {
2355     !!!cp (1019);
2356     $self->{state} = $self->{prev_state};
2357     ## Reconsume.
2358     !!!emit ({type => CHARACTER_TOKEN,
2359     data => '&#',
2360     line => $self->{line_prev},
2361     column => $self->{column_prev} - 1,
2362     });
2363     redo A;
2364     } else {
2365     !!!cp (993);
2366     $self->{ca}->{value} .= '&#';
2367     $self->{state} = $self->{prev_state};
2368     ## Reconsume.
2369     redo A;
2370     }
2371     }
2372     } elsif ($self->{state} == NCR_NUM_STATE) {
2373     if (0x0030 <= $self->{nc} and
2374     $self->{nc} <= 0x0039) { # 0..9
2375     !!!cp (1012);
2376     $self->{s_kwd} *= 10;
2377     $self->{s_kwd} += $self->{nc} - 0x0030;
2378    
2379     ## Stay in the state.
2380     !!!next-input-character;
2381     redo A;
2382     } elsif ($self->{nc} == 0x003B) { # ;
2383     !!!cp (1013);
2384     !!!next-input-character;
2385     #
2386     } else {
2387     !!!cp (1014);
2388     !!!parse-error (type => 'no refc');
2389     ## Reconsume.
2390     #
2391     }
2392    
2393     my $code = $self->{s_kwd};
2394     my $l = $self->{line_prev};
2395     my $c = $self->{column_prev};
2396     if ($charref_map->{$code}) {
2397     !!!cp (1015);
2398     !!!parse-error (type => 'invalid character reference',
2399     text => (sprintf 'U+%04X', $code),
2400     line => $l, column => $c);
2401     $code = $charref_map->{$code};
2402     } elsif ($code > 0x10FFFF) {
2403     !!!cp (1016);
2404     !!!parse-error (type => 'invalid character reference',
2405     text => (sprintf 'U-%08X', $code),
2406     line => $l, column => $c);
2407     $code = 0xFFFD;
2408     }
2409    
2410     if ($self->{prev_state} == DATA_STATE) {
2411     !!!cp (992);
2412     $self->{state} = $self->{prev_state};
2413     ## Reconsume.
2414     !!!emit ({type => CHARACTER_TOKEN, data => chr $code,
2415     line => $l, column => $c,
2416     });
2417     redo A;
2418     } else {
2419     !!!cp (991);
2420     $self->{ca}->{value} .= chr $code;
2421     $self->{ca}->{has_reference} = 1;
2422     $self->{state} = $self->{prev_state};
2423     ## Reconsume.
2424     redo A;
2425     }
2426     } elsif ($self->{state} == HEXREF_X_STATE) {
2427     if ((0x0030 <= $self->{nc} and $self->{nc} <= 0x0039) or
2428     (0x0041 <= $self->{nc} and $self->{nc} <= 0x0046) or
2429     (0x0061 <= $self->{nc} and $self->{nc} <= 0x0066)) {
2430     # 0..9, A..F, a..f
2431     !!!cp (990);
2432     $self->{state} = HEXREF_HEX_STATE;
2433     $self->{s_kwd} = 0;
2434     ## Reconsume.
2435     redo A;
2436     } else {
2437     !!!parse-error (type => 'bare hcro',
2438     line => $self->{line_prev},
2439     column => $self->{column_prev} - 2);
2440    
2441     ## NOTE: According to the spec algorithm, nothing is returned,
2442     ## and then "&#" followed by "X" or "x" is appended to the parent
2443     ## element or the attribute value in the later processing.
2444    
2445     if ($self->{prev_state} == DATA_STATE) {
2446     !!!cp (1005);
2447     $self->{state} = $self->{prev_state};
2448     ## Reconsume.
2449     !!!emit ({type => CHARACTER_TOKEN,
2450     data => '&' . $self->{s_kwd},
2451     line => $self->{line_prev},
2452     column => $self->{column_prev} - length $self->{s_kwd},
2453     });
2454     redo A;
2455     } else {
2456     !!!cp (989);
2457     $self->{ca}->{value} .= '&' . $self->{s_kwd};
2458     $self->{state} = $self->{prev_state};
2459     ## Reconsume.
2460     redo A;
2461     }
2462     }
2463     } elsif ($self->{state} == HEXREF_HEX_STATE) {
2464     if (0x0030 <= $self->{nc} and $self->{nc} <= 0x0039) {
2465     # 0..9
2466     !!!cp (1002);
2467     $self->{s_kwd} *= 0x10;
2468     $self->{s_kwd} += $self->{nc} - 0x0030;
2469     ## Stay in the state.
2470     !!!next-input-character;
2471     redo A;
2472     } elsif (0x0061 <= $self->{nc} and
2473     $self->{nc} <= 0x0066) { # a..f
2474     !!!cp (1003);
2475     $self->{s_kwd} *= 0x10;
2476     $self->{s_kwd} += $self->{nc} - 0x0060 + 9;
2477     ## Stay in the state.
2478     !!!next-input-character;
2479     redo A;
2480     } elsif (0x0041 <= $self->{nc} and
2481     $self->{nc} <= 0x0046) { # A..F
2482     !!!cp (1004);
2483     $self->{s_kwd} *= 0x10;
2484     $self->{s_kwd} += $self->{nc} - 0x0040 + 9;
2485     ## Stay in the state.
2486     !!!next-input-character;
2487     redo A;
2488     } elsif ($self->{nc} == 0x003B) { # ;
2489     !!!cp (1006);
2490     !!!next-input-character;
2491     #
2492     } else {
2493     !!!cp (1007);
2494     !!!parse-error (type => 'no refc',
2495     line => $self->{line},
2496     column => $self->{column});
2497     ## Reconsume.
2498     #
2499     }
2500    
2501     my $code = $self->{s_kwd};
2502     my $l = $self->{line_prev};
2503     my $c = $self->{column_prev};
2504     if ($charref_map->{$code}) {
2505     !!!cp (1008);
2506     !!!parse-error (type => 'invalid character reference',
2507     text => (sprintf 'U+%04X', $code),
2508     line => $l, column => $c);
2509     $code = $charref_map->{$code};
2510     } elsif ($code > 0x10FFFF) {
2511     !!!cp (1009);
2512     !!!parse-error (type => 'invalid character reference',
2513     text => (sprintf 'U-%08X', $code),
2514     line => $l, column => $c);
2515     $code = 0xFFFD;
2516     }
2517    
2518     if ($self->{prev_state} == DATA_STATE) {
2519     !!!cp (988);
2520     $self->{state} = $self->{prev_state};
2521     ## Reconsume.
2522     !!!emit ({type => CHARACTER_TOKEN, data => chr $code,
2523     line => $l, column => $c,
2524     });
2525     redo A;
2526     } else {
2527     !!!cp (987);
2528     $self->{ca}->{value} .= chr $code;
2529     $self->{ca}->{has_reference} = 1;
2530     $self->{state} = $self->{prev_state};
2531     ## Reconsume.
2532     redo A;
2533     }
2534     } elsif ($self->{state} == ENTITY_NAME_STATE) {
2535     if (length $self->{s_kwd} < 30 and
2536     ## NOTE: Some number greater than the maximum length of entity name
2537     ((0x0041 <= $self->{nc} and # a
2538     $self->{nc} <= 0x005A) or # x
2539     (0x0061 <= $self->{nc} and # a
2540     $self->{nc} <= 0x007A) or # z
2541     (0x0030 <= $self->{nc} and # 0
2542     $self->{nc} <= 0x0039) or # 9
2543     $self->{nc} == 0x003B)) { # ;
2544     our $EntityChar;
2545     $self->{s_kwd} .= chr $self->{nc};
2546     if (defined $EntityChar->{$self->{s_kwd}}) {
2547     if ($self->{nc} == 0x003B) { # ;
2548     !!!cp (1020);
2549     $self->{entity__value} = $EntityChar->{$self->{s_kwd}};
2550     $self->{entity__match} = 1;
2551     !!!next-input-character;
2552     #
2553     } else {
2554     !!!cp (1021);
2555     $self->{entity__value} = $EntityChar->{$self->{s_kwd}};
2556     $self->{entity__match} = -1;
2557     ## Stay in the state.
2558     !!!next-input-character;
2559     redo A;
2560     }
2561     } else {
2562     !!!cp (1022);
2563     $self->{entity__value} .= chr $self->{nc};
2564     $self->{entity__match} *= 2;
2565     ## Stay in the state.
2566     !!!next-input-character;
2567     redo A;
2568     }
2569     }
2570    
2571     my $data;
2572     my $has_ref;
2573     if ($self->{entity__match} > 0) {
2574     !!!cp (1023);
2575     $data = $self->{entity__value};
2576     $has_ref = 1;
2577     #
2578     } elsif ($self->{entity__match} < 0) {
2579     !!!parse-error (type => 'no refc');
2580     if ($self->{prev_state} != DATA_STATE and # in attribute
2581     $self->{entity__match} < -1) {
2582     !!!cp (1024);
2583     $data = '&' . $self->{s_kwd};
2584     #
2585     } else {
2586     !!!cp (1025);
2587     $data = $self->{entity__value};
2588     $has_ref = 1;
2589     #
2590     }
2591     } else {
2592     !!!cp (1026);
2593     !!!parse-error (type => 'bare ero',
2594     line => $self->{line_prev},
2595     column => $self->{column_prev} - length $self->{s_kwd});
2596     $data = '&' . $self->{s_kwd};
2597     #
2598     }
2599    
2600     ## NOTE: In these cases, when a character reference is found,
2601     ## it is consumed and a character token is returned, or, otherwise,
2602     ## nothing is consumed and returned, according to the spec algorithm.
2603     ## In this implementation, anything that has been examined by the
2604     ## tokenizer is appended to the parent element or the attribute value
2605     ## as string, either literal string when no character reference or
2606     ## entity-replaced string otherwise, in this stage, since any characters
2607     ## that would not be consumed are appended in the data state or in an
2608     ## appropriate attribute value state anyway.
2609    
2610     if ($self->{prev_state} == DATA_STATE) {
2611     !!!cp (986);
2612     $self->{state} = $self->{prev_state};
2613     ## Reconsume.
2614     !!!emit ({type => CHARACTER_TOKEN,
2615     data => $data,
2616     line => $self->{line_prev},
2617     column => $self->{column_prev} + 1 - length $self->{s_kwd},
2618     });
2619     redo A;
2620     } else {
2621     !!!cp (985);
2622     $self->{ca}->{value} .= $data;
2623     $self->{ca}->{has_reference} = 1 if $has_ref;
2624     $self->{state} = $self->{prev_state};
2625     ## Reconsume.
2626     redo A;
2627     }
2628     } else {
2629     die "$0: $self->{state}: Unknown state";
2630     }
2631     } # A
2632    
2633     die "$0: _get_next_token: unexpected case";
2634     } # _get_next_token
2635    
2636     1;
2637     ## $Date:$

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24