/[suikacvs]/markup/html/whatpm/Whatpm/HTML/Tokenizer.pm.src
Suika

Contents of /markup/html/whatpm/Whatpm/HTML/Tokenizer.pm.src

Parent Directory | Revision Log


Revision 1.2 - (hide annotations) (download) (as text)
Tue Oct 14 04:32:49 2008 UTC (16 years ago) by wakaba
Branch: MAIN
Changes since 1.1: +44 -11 lines
File MIME type: application/x-wais-source
++ whatpm/Whatpm/HTML/ChangeLog	14 Oct 2008 04:28:43 -0000
	* Tokenizer.pm.src: Make *_TOKEN (token type constants)
	exportable.  New token types, PI_TOKEN for XML and ABORT_TOKEN for
	document.write() or incremental parsing, are added for future
	extensions.

2008-10-14  Wakaba  <wakaba@suika.fam.cx>

++ whatpm/Whatpm/XML/ChangeLog	14 Oct 2008 04:27:29 -0000
2008-10-14  Wakaba  <wakaba@suika.fam.cx>

	* Makefile, Parser.pm.src: New files.

1 wakaba 1.1 package Whatpm::HTML::Tokenizer;
2     use strict;
3 wakaba 1.2 our $VERSION=do{my @r=(q$Revision: 1.1 $=~/\d+/g);sprintf "%d."."%02d" x $#r,@r}; # built from the digits of the CVS |Revision| keyword; e.g. "1.1" yields "1.01"
4    
5     BEGIN { # runs at compile time, so the export lists are ready when |Whatpm::HTML| imports |:token| in its own BEGIN block
6     require Exporter;
7     push our @ISA, 'Exporter'; # inherit |import| from Exporter
8    
9     our @EXPORT_OK = qw(
10     DOCTYPE_TOKEN
11     COMMENT_TOKEN
12     START_TAG_TOKEN
13     END_TAG_TOKEN
14     END_OF_FILE_TOKEN
15     CHARACTER_TOKEN
16     PI_TOKEN
17     ABORT_TOKEN
18     ); # nothing is exported by default; each token type constant can be requested by name
19    
20     our %EXPORT_TAGS = ( # the |:token| tag requests all token type constants at once
21     token => [qw(
22     DOCTYPE_TOKEN
23     COMMENT_TOKEN
24     START_TAG_TOKEN
25     END_TAG_TOKEN
26     END_OF_FILE_TOKEN
27     CHARACTER_TOKEN
28     PI_TOKEN
29     ABORT_TOKEN
30     )],
31     );
32     }
33    
34     ## Token types: integer codes stored in the |type| field of a token.
35    
36     sub DOCTYPE_TOKEN () { 1 } # token has |name|, |pubid|, |sysid|, |quirks|
37     sub COMMENT_TOKEN () { 2 } # token has |data|
38     sub START_TAG_TOKEN () { 3 } # token has |tag_name|, |attributes|
39     sub END_TAG_TOKEN () { 4 } # token has |tag_name|, |attributes|
40     sub END_OF_FILE_TOKEN () { 5 } # emitted when the next input character is -1
41     sub CHARACTER_TOKEN () { 6 } # token has |data|
42     sub PI_TOKEN () { 7 } # XML5; added for XML as a future extension (per ChangeLog)
43     sub ABORT_TOKEN () { 8 } # Not a token actually; added for document.write() or incremental parsing (per ChangeLog)
44 wakaba 1.1
45     package Whatpm::HTML; # everything below is defined in |Whatpm::HTML|, not in the Tokenizer package
46    
47 wakaba 1.2 BEGIN { Whatpm::HTML::Tokenizer->import (':token') } # bring the *_TOKEN constants into this package at compile time
48    
49 wakaba 1.1 ## Content model flags: bit flags tested with |&| against |$self->{content_model}|
50    
51     sub CM_ENTITY () { 0b001 } # & markup in data
52     sub CM_LIMITED_MARKUP () { 0b010 } # < markup in data (limited)
53     sub CM_FULL_MARKUP () { 0b100 } # < markup in data (any)
54    
55     sub PLAINTEXT_CONTENT_MODEL () { 0 } # no flag set: neither & nor < is markup
56     sub CDATA_CONTENT_MODEL () { CM_LIMITED_MARKUP } # limited < markup only
57     sub RCDATA_CONTENT_MODEL () { CM_ENTITY | CM_LIMITED_MARKUP } # & markup and limited < markup
58     sub PCDATA_CONTENT_MODEL () { CM_ENTITY | CM_FULL_MARKUP } # & markup and any < markup
59    
60     ## Tokenizer states: values of |$self->{state}|, compared with |==| in |_get_next_token|.
61    
62     sub DATA_STATE () { 0 } # initial state (set by |_initialize_tokenizer|)
63     #sub ENTITY_DATA_STATE () { 1 } # not defined; see the NOTE before |ENTITY_STATE|
64     sub TAG_OPEN_STATE () { 2 }
65     sub CLOSE_TAG_OPEN_STATE () { 3 }
66     sub TAG_NAME_STATE () { 4 }
67     sub BEFORE_ATTRIBUTE_NAME_STATE () { 5 }
68     sub ATTRIBUTE_NAME_STATE () { 6 }
69     sub AFTER_ATTRIBUTE_NAME_STATE () { 7 }
70     sub BEFORE_ATTRIBUTE_VALUE_STATE () { 8 }
71     sub ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE () { 9 }
72     sub ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE () { 10 }
73     sub ATTRIBUTE_VALUE_UNQUOTED_STATE () { 11 }
74     #sub ENTITY_IN_ATTRIBUTE_VALUE_STATE () { 12 } # not defined; see the NOTE before |ENTITY_STATE|
75     sub MARKUP_DECLARATION_OPEN_STATE () { 13 }
76     sub COMMENT_START_STATE () { 14 }
77     sub COMMENT_START_DASH_STATE () { 15 }
78     sub COMMENT_STATE () { 16 }
79     sub COMMENT_END_STATE () { 17 }
80     sub COMMENT_END_DASH_STATE () { 18 }
81     sub BOGUS_COMMENT_STATE () { 19 }
82     sub DOCTYPE_STATE () { 20 }
83     sub BEFORE_DOCTYPE_NAME_STATE () { 21 }
84     sub DOCTYPE_NAME_STATE () { 22 }
85     sub AFTER_DOCTYPE_NAME_STATE () { 23 }
86     sub BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE () { 24 }
87     sub DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE () { 25 }
88     sub DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE () { 26 }
89     sub AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE () { 27 }
90     sub BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE () { 28 }
91     sub DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE () { 29 }
92     sub DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE () { 30 }
93     sub AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE () { 31 }
94     sub BOGUS_DOCTYPE_STATE () { 32 }
95     sub AFTER_ATTRIBUTE_VALUE_QUOTED_STATE () { 33 }
96     sub SELF_CLOSING_START_TAG_STATE () { 34 }
97     sub CDATA_SECTION_STATE () { 35 }
98     sub MD_HYPHEN_STATE () { 36 } # "markup declaration open state" in the spec
99     sub MD_DOCTYPE_STATE () { 37 } # "markup declaration open state" in the spec
100     sub MD_CDATA_STATE () { 38 } # "markup declaration open state" in the spec
101     sub CDATA_RCDATA_CLOSE_TAG_STATE () { 39 } # "close tag open state" in the spec
102     sub CDATA_SECTION_MSE1_STATE () { 40 } # "CDATA section state" in the spec
103     sub CDATA_SECTION_MSE2_STATE () { 41 } # "CDATA section state" in the spec
104     sub PUBLIC_STATE () { 42 } # "after DOCTYPE name state" in the spec
105     sub SYSTEM_STATE () { 43 } # "after DOCTYPE name state" in the spec
106     ## NOTE: "Entity data state", "entity in attribute value state", and
107     ## "consume a character reference" algorithm are jointly implemented
108     ## using the following six states:
109     sub ENTITY_STATE () { 44 }
110     sub ENTITY_HASH_STATE () { 45 }
111     sub NCR_NUM_STATE () { 46 }
112     sub HEXREF_X_STATE () { 47 }
113     sub HEXREF_HEX_STATE () { 48 }
114     sub ENTITY_NAME_STATE () { 49 }
115     sub PCDATA_STATE () { 50 } # "data state" in the spec; used only for the |PCDATA| content model
116    
117     ## Tree constructor state constants (see Whatpm::HTML for the full
118     ## list and descriptions)
119     ## NOTE(review): both constants below evaluate to the same integer (bit 11 set, 2048); presumably they belong to different flag sets -- confirm against Whatpm::HTML.
120     sub IN_FOREIGN_CONTENT_IM () { 0b100000000000 }
121     sub FOREIGN_EL () { 0b1_00000000000 }
122    
123     ## Character reference mappings: code point => replacement code point (presumably applied to numeric character references; the lookup itself is not in this part of the file)
124    
125     my $charref_map = {
126     0x0D => 0x000A, # CR is replaced by LF
127     0x80 => 0x20AC, # 0x80..0x9F: presumably the Windows-1252 reading of these values, U+FFFD where it has none -- confirm
128     0x81 => 0xFFFD,
129     0x82 => 0x201A,
130     0x83 => 0x0192,
131     0x84 => 0x201E,
132     0x85 => 0x2026,
133     0x86 => 0x2020,
134     0x87 => 0x2021,
135     0x88 => 0x02C6,
136     0x89 => 0x2030,
137     0x8A => 0x0160,
138     0x8B => 0x2039,
139     0x8C => 0x0152,
140     0x8D => 0xFFFD,
141     0x8E => 0x017D,
142     0x8F => 0xFFFD,
143     0x90 => 0xFFFD,
144     0x91 => 0x2018,
145     0x92 => 0x2019,
146     0x93 => 0x201C,
147     0x94 => 0x201D,
148     0x95 => 0x2022,
149     0x96 => 0x2013,
150     0x97 => 0x2014,
151     0x98 => 0x02DC,
152     0x99 => 0x2122,
153     0x9A => 0x0161,
154     0x9B => 0x203A,
155     0x9C => 0x0153,
156     0x9D => 0xFFFD,
157     0x9E => 0x017E,
158     0x9F => 0x0178,
159     }; # $charref_map
160     $charref_map->{$_} = 0xFFFD # every code point listed below is mapped to U+FFFD
161     for 0x0000..0x0008, 0x000B, 0x000E..0x001F, 0x007F,
162     0xD800..0xDFFF, 0xFDD0..0xFDDF, ## ISSUE: 0xFDEF (this range stops at U+FDDF, while the Unicode noncharacter range runs to U+FDEF)
163     0xFFFE, 0xFFFF, 0x1FFFE, 0x1FFFF, 0x2FFFE, 0x2FFFF, 0x3FFFE, 0x3FFFF,
164     0x4FFFE, 0x4FFFF, 0x5FFFE, 0x5FFFF, 0x6FFFE, 0x6FFFF, 0x7FFFE,
165     0x7FFFF, 0x8FFFE, 0x8FFFF, 0x9FFFE, 0x9FFFF, 0xAFFFE, 0xAFFFF,
166     0xBFFFE, 0xBFFFF, 0xCFFFE, 0xCFFFF, 0xDFFFE, 0xDFFFF, 0xEFFFE,
167     0xEFFFF, 0xFFFFE, 0xFFFFF, 0x10FFFE, 0x10FFFF;
168    
169     ## Implementations MUST act as if they used the state machine described in the spec.
170    
171     sub _initialize_tokenizer ($) {
172     my $self = shift;
173     ## Resets the tokenizer fields of |$self| to their start-of-parse values.
174     ## NOTE: Fields set by |new| constructor:
175     #$self->{level}
176     #$self->{set_nc}
177     #$self->{parse_error}
178    
179     $self->{state} = DATA_STATE; # MUST: the tokenizer starts in the data state
180     #$self->{s_kwd}; # state keyword - initialized when used
181     #$self->{entity__value}; # initialized when used
182     #$self->{entity__match}; # initialized when used
183     $self->{content_model} = PCDATA_CONTENT_MODEL; # be (original fragment, apparently continuing the "MUST" above): initial content model is PCDATA
184     undef $self->{ct}; # current token
185     undef $self->{ca}; # current attribute
186     undef $self->{last_stag_name}; # last emitted start tag name
187     #$self->{prev_state}; # initialized when used
188     delete $self->{self_closing}; # the "self-closing flag" starts unset
189     $self->{char_buffer} = ''; # NOTE(review): presumably buffered input text read by |!!!next-input-character| -- confirm
190     $self->{char_buffer_pos} = 0; # NOTE(review): presumably the read position within |char_buffer| -- confirm
191     $self->{nc} = -1; # next input character; -1 is the value the tokenizer treats as EOF, presumably replaced by the macro below -- confirm
192     #$self->{next_nc}
193     !!!next-input-character;
194     $self->{token} = []; # queue of pushed-back tokens; |_get_next_token| returns from it first
195     # $self->{escape}
196     } # _initialize_tokenizer
197    
198     ## A token has:
199     ## ->{type} == DOCTYPE_TOKEN, START_TAG_TOKEN, END_TAG_TOKEN, COMMENT_TOKEN,
200     ## CHARACTER_TOKEN, or END_OF_FILE_TOKEN
201     ## ->{name} (DOCTYPE_TOKEN)
202     ## ->{tag_name} (START_TAG_TOKEN, END_TAG_TOKEN)
203     ## ->{pubid} (DOCTYPE_TOKEN)
204     ## ->{sysid} (DOCTYPE_TOKEN)
205     ## ->{quirks} == 1 or 0 (DOCTYPE_TOKEN): "force-quirks" flag
206     ## ->{attributes} isa HASH (START_TAG_TOKEN, END_TAG_TOKEN)
207     ## ->{name}
208     ## ->{value}
209     ## ->{has_reference} == 1 or 0
210     ## ->{data} (COMMENT_TOKEN, CHARACTER_TOKEN)
211     ## NOTE: The "self-closing flag" is held as |$self->{self_closing}|.
212     ## |->{self_closing}| is used to save the value of |$self->{self_closing}|
213     ## while the token is pushed back to the stack.
214    
215     ## Emitted token MUST immediately be handled by the tree construction state.
216    
217     ## Before each step, UA MAY check to see if either one of the scripts in
218     ## "list of scripts that will execute as soon as possible" or the first
219     ## script in the "list of scripts that will execute asynchronously",
220     ## has completed loading. If one has, then it MUST be executed
221     ## and removed from the list.
222    
223     ## TODO: Polytheistic slash SHOULD NOT be used. (Applied only to atheists.)
224     ## (This requirement was dropped from HTML5 spec, unfortunately.)
225    
226     my $is_space = { # code point => 1 for the characters the tokenizer treats as white space
227     0x0009 => 1, # CHARACTER TABULATION (HT)
228     0x000A => 1, # LINE FEED (LF)
229     #0x000B => 0, # LINE TABULATION (VT) -- not in the table, so not white space
230     0x000C => 1, # FORM FEED (FF)
231     #0x000D => 1, # CARRIAGE RETURN (CR) -- disabled; presumably CR is normalized away before tokenizing (confirm)
232     0x0020 => 1, # SPACE (SP)
233     };
234    
235     sub _get_next_token ($) {
236     my $self = shift;
237    
238     if ($self->{self_closing}) {
239     !!!parse-error (type => 'nestc', token => $self->{ct});
240     ## NOTE: The |self_closing| flag is only set by start tag token.
241     ## In addition, when a start tag token is emitted, it is always set to
242     ## |ct|.
243     delete $self->{self_closing};
244     }
245    
246     if (@{$self->{token}}) {
247     $self->{self_closing} = $self->{token}->[0]->{self_closing};
248     return shift @{$self->{token}};
249     }
250    
251     A: {
252     if ($self->{state} == PCDATA_STATE) {
253     ## NOTE: Same as |DATA_STATE|, but only for |PCDATA| content model.
254    
255     if ($self->{nc} == 0x0026) { # &
256     !!!cp (0.1);
257     ## NOTE: In the spec, the tokenizer is switched to the
258     ## "entity data state". In this implementation, the tokenizer
259     ## is switched to the |ENTITY_STATE|, which is an implementation
260     ## of the "consume a character reference" algorithm.
261     $self->{entity_add} = -1;
262     $self->{prev_state} = DATA_STATE;
263     $self->{state} = ENTITY_STATE;
264     !!!next-input-character;
265     redo A;
266     } elsif ($self->{nc} == 0x003C) { # <
267     !!!cp (0.2);
268     $self->{state} = TAG_OPEN_STATE;
269     !!!next-input-character;
270     redo A;
271     } elsif ($self->{nc} == -1) {
272     !!!cp (0.3);
273     !!!emit ({type => END_OF_FILE_TOKEN,
274     line => $self->{line}, column => $self->{column}});
275     last A; ## TODO: ok?
276     } else {
277     !!!cp (0.4);
278     #
279     }
280    
281     # Anything else
282     my $token = {type => CHARACTER_TOKEN,
283     data => chr $self->{nc},
284     line => $self->{line}, column => $self->{column},
285     };
286     $self->{read_until}->($token->{data}, q[<&], length $token->{data});
287    
288     ## Stay in the state.
289     !!!next-input-character;
290     !!!emit ($token);
291     redo A;
292     } elsif ($self->{state} == DATA_STATE) {
293     $self->{s_kwd} = '' unless defined $self->{s_kwd};
294     if ($self->{nc} == 0x0026) { # &
295     $self->{s_kwd} = '';
296     if ($self->{content_model} & CM_ENTITY and # PCDATA | RCDATA
297     not $self->{escape}) {
298     !!!cp (1);
299     ## NOTE: In the spec, the tokenizer is switched to the
300     ## "entity data state". In this implementation, the tokenizer
301     ## is switched to the |ENTITY_STATE|, which is an implementation
302     ## of the "consume a character reference" algorithm.
303     $self->{entity_add} = -1;
304     $self->{prev_state} = DATA_STATE;
305     $self->{state} = ENTITY_STATE;
306     !!!next-input-character;
307     redo A;
308     } else {
309     !!!cp (2);
310     #
311     }
312     } elsif ($self->{nc} == 0x002D) { # -
313     if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
314     $self->{s_kwd} .= '-';
315    
316     if ($self->{s_kwd} eq '<!--') {
317     !!!cp (3);
318     $self->{escape} = 1; # unless $self->{escape};
319     $self->{s_kwd} = '--';
320     #
321     } elsif ($self->{s_kwd} eq '---') {
322     !!!cp (4);
323     $self->{s_kwd} = '--';
324     #
325     } else {
326     !!!cp (5);
327     #
328     }
329     }
330    
331     #
332     } elsif ($self->{nc} == 0x0021) { # !
333     if (length $self->{s_kwd}) {
334     !!!cp (5.1);
335     $self->{s_kwd} .= '!';
336     #
337     } else {
338     !!!cp (5.2);
339     #$self->{s_kwd} = '';
340     #
341     }
342     #
343     } elsif ($self->{nc} == 0x003C) { # <
344     if ($self->{content_model} & CM_FULL_MARKUP or # PCDATA
345     (($self->{content_model} & CM_LIMITED_MARKUP) and # CDATA | RCDATA
346     not $self->{escape})) {
347     !!!cp (6);
348     $self->{state} = TAG_OPEN_STATE;
349     !!!next-input-character;
350     redo A;
351     } else {
352     !!!cp (7);
353     $self->{s_kwd} = '';
354     #
355     }
356     } elsif ($self->{nc} == 0x003E) { # >
357     if ($self->{escape} and
358     ($self->{content_model} & CM_LIMITED_MARKUP)) { # RCDATA | CDATA
359     if ($self->{s_kwd} eq '--') {
360     !!!cp (8);
361     delete $self->{escape};
362     } else {
363     !!!cp (9);
364     }
365     } else {
366     !!!cp (10);
367     }
368    
369     $self->{s_kwd} = '';
370     #
371     } elsif ($self->{nc} == -1) {
372     !!!cp (11);
373     $self->{s_kwd} = '';
374     !!!emit ({type => END_OF_FILE_TOKEN,
375     line => $self->{line}, column => $self->{column}});
376     last A; ## TODO: ok?
377     } else {
378     !!!cp (12);
379     $self->{s_kwd} = '';
380     #
381     }
382    
383     # Anything else
384     my $token = {type => CHARACTER_TOKEN,
385     data => chr $self->{nc},
386     line => $self->{line}, column => $self->{column},
387     };
388     if ($self->{read_until}->($token->{data}, q[-!<>&],
389     length $token->{data})) {
390     $self->{s_kwd} = '';
391     }
392    
393     ## Stay in the data state.
394     if ($self->{content_model} == PCDATA_CONTENT_MODEL) {
395     !!!cp (13);
396     $self->{state} = PCDATA_STATE;
397     } else {
398     !!!cp (14);
399     ## Stay in the state.
400     }
401     !!!next-input-character;
402     !!!emit ($token);
403     redo A;
404     } elsif ($self->{state} == TAG_OPEN_STATE) {
405     if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
406     if ($self->{nc} == 0x002F) { # /
407     !!!cp (15);
408     !!!next-input-character;
409     $self->{state} = CLOSE_TAG_OPEN_STATE;
410     redo A;
411     } elsif ($self->{nc} == 0x0021) { # !
412     !!!cp (15.1);
413     $self->{s_kwd} = '<' unless $self->{escape};
414     #
415     } else {
416     !!!cp (16);
417     #
418     }
419    
420     ## reconsume
421     $self->{state} = DATA_STATE;
422     !!!emit ({type => CHARACTER_TOKEN, data => '<',
423     line => $self->{line_prev},
424     column => $self->{column_prev},
425     });
426     redo A;
427     } elsif ($self->{content_model} & CM_FULL_MARKUP) { # PCDATA
428     if ($self->{nc} == 0x0021) { # !
429     !!!cp (17);
430     $self->{state} = MARKUP_DECLARATION_OPEN_STATE;
431     !!!next-input-character;
432     redo A;
433     } elsif ($self->{nc} == 0x002F) { # /
434     !!!cp (18);
435     $self->{state} = CLOSE_TAG_OPEN_STATE;
436     !!!next-input-character;
437     redo A;
438     } elsif (0x0041 <= $self->{nc} and
439     $self->{nc} <= 0x005A) { # A..Z
440     !!!cp (19);
441     $self->{ct}
442     = {type => START_TAG_TOKEN,
443     tag_name => chr ($self->{nc} + 0x0020),
444     line => $self->{line_prev},
445     column => $self->{column_prev}};
446     $self->{state} = TAG_NAME_STATE;
447     !!!next-input-character;
448     redo A;
449     } elsif (0x0061 <= $self->{nc} and
450     $self->{nc} <= 0x007A) { # a..z
451     !!!cp (20);
452     $self->{ct} = {type => START_TAG_TOKEN,
453     tag_name => chr ($self->{nc}),
454     line => $self->{line_prev},
455     column => $self->{column_prev}};
456     $self->{state} = TAG_NAME_STATE;
457     !!!next-input-character;
458     redo A;
459     } elsif ($self->{nc} == 0x003E) { # >
460     !!!cp (21);
461     !!!parse-error (type => 'empty start tag',
462     line => $self->{line_prev},
463     column => $self->{column_prev});
464     $self->{state} = DATA_STATE;
465     !!!next-input-character;
466    
467     !!!emit ({type => CHARACTER_TOKEN, data => '<>',
468     line => $self->{line_prev},
469     column => $self->{column_prev},
470     });
471    
472     redo A;
473     } elsif ($self->{nc} == 0x003F) { # ?
474     !!!cp (22);
475     !!!parse-error (type => 'pio',
476     line => $self->{line_prev},
477     column => $self->{column_prev});
478     $self->{state} = BOGUS_COMMENT_STATE;
479     $self->{ct} = {type => COMMENT_TOKEN, data => '',
480     line => $self->{line_prev},
481     column => $self->{column_prev},
482     };
483     ## $self->{nc} is intentionally left as is
484     redo A;
485     } else {
486     !!!cp (23);
487     !!!parse-error (type => 'bare stago',
488     line => $self->{line_prev},
489     column => $self->{column_prev});
490     $self->{state} = DATA_STATE;
491     ## reconsume
492    
493     !!!emit ({type => CHARACTER_TOKEN, data => '<',
494     line => $self->{line_prev},
495     column => $self->{column_prev},
496     });
497    
498     redo A;
499     }
500     } else {
501     die "$0: $self->{content_model} in tag open";
502     }
503     } elsif ($self->{state} == CLOSE_TAG_OPEN_STATE) {
504     ## NOTE: The "close tag open state" in the spec is implemented as
505     ## |CLOSE_TAG_OPEN_STATE| and |CDATA_RCDATA_CLOSE_TAG_STATE|.
506    
507     my ($l, $c) = ($self->{line_prev}, $self->{column_prev} - 1); # "<"of"</"
508     if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
509     if (defined $self->{last_stag_name}) {
510     $self->{state} = CDATA_RCDATA_CLOSE_TAG_STATE;
511     $self->{s_kwd} = '';
512     ## Reconsume.
513     redo A;
514     } else {
515     ## No start tag token has ever been emitted
516     ## NOTE: See <http://krijnhoetmer.nl/irc-logs/whatwg/20070626#l-564>.
517     !!!cp (28);
518     $self->{state} = DATA_STATE;
519     ## Reconsume.
520     !!!emit ({type => CHARACTER_TOKEN, data => '</',
521     line => $l, column => $c,
522     });
523     redo A;
524     }
525     }
526    
527     if (0x0041 <= $self->{nc} and
528     $self->{nc} <= 0x005A) { # A..Z
529     !!!cp (29);
530     $self->{ct}
531     = {type => END_TAG_TOKEN,
532     tag_name => chr ($self->{nc} + 0x0020),
533     line => $l, column => $c};
534     $self->{state} = TAG_NAME_STATE;
535     !!!next-input-character;
536     redo A;
537     } elsif (0x0061 <= $self->{nc} and
538     $self->{nc} <= 0x007A) { # a..z
539     !!!cp (30);
540     $self->{ct} = {type => END_TAG_TOKEN,
541     tag_name => chr ($self->{nc}),
542     line => $l, column => $c};
543     $self->{state} = TAG_NAME_STATE;
544     !!!next-input-character;
545     redo A;
546     } elsif ($self->{nc} == 0x003E) { # >
547     !!!cp (31);
548     !!!parse-error (type => 'empty end tag',
549     line => $self->{line_prev}, ## "<" in "</>"
550     column => $self->{column_prev} - 1);
551     $self->{state} = DATA_STATE;
552     !!!next-input-character;
553     redo A;
554     } elsif ($self->{nc} == -1) {
555     !!!cp (32);
556     !!!parse-error (type => 'bare etago');
557     $self->{state} = DATA_STATE;
558     # reconsume
559    
560     !!!emit ({type => CHARACTER_TOKEN, data => '</',
561     line => $l, column => $c,
562     });
563    
564     redo A;
565     } else {
566     !!!cp (33);
567     !!!parse-error (type => 'bogus end tag');
568     $self->{state} = BOGUS_COMMENT_STATE;
569     $self->{ct} = {type => COMMENT_TOKEN, data => '',
570     line => $self->{line_prev}, # "<" of "</"
571     column => $self->{column_prev} - 1,
572     };
573     ## NOTE: $self->{nc} is intentionally left as is.
574     ## Although the "anything else" case of the spec not explicitly
575     ## states that the next input character is to be reconsumed,
576     ## it will be included to the |data| of the comment token
577     ## generated from the bogus end tag, as defined in the
578     ## "bogus comment state" entry.
579     redo A;
580     }
581     } elsif ($self->{state} == CDATA_RCDATA_CLOSE_TAG_STATE) {
582     my $ch = substr $self->{last_stag_name}, length $self->{s_kwd}, 1;
583     if (length $ch) {
584     my $CH = $ch;
585     $ch =~ tr/a-z/A-Z/;
586     my $nch = chr $self->{nc};
587     if ($nch eq $ch or $nch eq $CH) {
588     !!!cp (24);
589     ## Stay in the state.
590     $self->{s_kwd} .= $nch;
591     !!!next-input-character;
592     redo A;
593     } else {
594     !!!cp (25);
595     $self->{state} = DATA_STATE;
596     ## Reconsume.
597     !!!emit ({type => CHARACTER_TOKEN,
598     data => '</' . $self->{s_kwd},
599     line => $self->{line_prev},
600     column => $self->{column_prev} - 1 - length $self->{s_kwd},
601     });
602     redo A;
603     }
604     } else { # after "<{tag-name}"
605     unless ($is_space->{$self->{nc}} or
606     {
607     0x003E => 1, # >
608     0x002F => 1, # /
609     -1 => 1, # EOF
610     }->{$self->{nc}}) {
611     !!!cp (26);
612     ## Reconsume.
613     $self->{state} = DATA_STATE;
614     !!!emit ({type => CHARACTER_TOKEN,
615     data => '</' . $self->{s_kwd},
616     line => $self->{line_prev},
617     column => $self->{column_prev} - 1 - length $self->{s_kwd},
618     });
619     redo A;
620     } else {
621     !!!cp (27);
622     $self->{ct}
623     = {type => END_TAG_TOKEN,
624     tag_name => $self->{last_stag_name},
625     line => $self->{line_prev},
626     column => $self->{column_prev} - 1 - length $self->{s_kwd}};
627     $self->{state} = TAG_NAME_STATE;
628     ## Reconsume.
629     redo A;
630     }
631     }
632     } elsif ($self->{state} == TAG_NAME_STATE) {
633     if ($is_space->{$self->{nc}}) {
634     !!!cp (34);
635     $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
636     !!!next-input-character;
637     redo A;
638     } elsif ($self->{nc} == 0x003E) { # >
639     if ($self->{ct}->{type} == START_TAG_TOKEN) {
640     !!!cp (35);
641     $self->{last_stag_name} = $self->{ct}->{tag_name};
642     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
643     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
644     #if ($self->{ct}->{attributes}) {
645     # ## NOTE: This should never be reached.
646     # !!! cp (36);
647     # !!! parse-error (type => 'end tag attribute');
648     #} else {
649     !!!cp (37);
650     #}
651     } else {
652     die "$0: $self->{ct}->{type}: Unknown token type";
653     }
654     $self->{state} = DATA_STATE;
655     !!!next-input-character;
656    
657     !!!emit ($self->{ct}); # start tag or end tag
658    
659     redo A;
660     } elsif (0x0041 <= $self->{nc} and
661     $self->{nc} <= 0x005A) { # A..Z
662     !!!cp (38);
663     $self->{ct}->{tag_name} .= chr ($self->{nc} + 0x0020);
664     # start tag or end tag
665     ## Stay in this state
666     !!!next-input-character;
667     redo A;
668     } elsif ($self->{nc} == -1) {
669     !!!parse-error (type => 'unclosed tag');
670     if ($self->{ct}->{type} == START_TAG_TOKEN) {
671     !!!cp (39);
672     $self->{last_stag_name} = $self->{ct}->{tag_name};
673     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
674     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
675     #if ($self->{ct}->{attributes}) {
676     # ## NOTE: This state should never be reached.
677     # !!! cp (40);
678     # !!! parse-error (type => 'end tag attribute');
679     #} else {
680     !!!cp (41);
681     #}
682     } else {
683     die "$0: $self->{ct}->{type}: Unknown token type";
684     }
685     $self->{state} = DATA_STATE;
686     # reconsume
687    
688     !!!emit ($self->{ct}); # start tag or end tag
689    
690     redo A;
691     } elsif ($self->{nc} == 0x002F) { # /
692     !!!cp (42);
693     $self->{state} = SELF_CLOSING_START_TAG_STATE;
694     !!!next-input-character;
695     redo A;
696     } else {
697     !!!cp (44);
698     $self->{ct}->{tag_name} .= chr $self->{nc};
699     # start tag or end tag
700     ## Stay in the state
701     !!!next-input-character;
702     redo A;
703     }
704     } elsif ($self->{state} == BEFORE_ATTRIBUTE_NAME_STATE) {
705     if ($is_space->{$self->{nc}}) {
706     !!!cp (45);
707     ## Stay in the state
708     !!!next-input-character;
709     redo A;
710     } elsif ($self->{nc} == 0x003E) { # >
711     if ($self->{ct}->{type} == START_TAG_TOKEN) {
712     !!!cp (46);
713     $self->{last_stag_name} = $self->{ct}->{tag_name};
714     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
715     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
716     if ($self->{ct}->{attributes}) {
717     !!!cp (47);
718     !!!parse-error (type => 'end tag attribute');
719     } else {
720     !!!cp (48);
721     }
722     } else {
723     die "$0: $self->{ct}->{type}: Unknown token type";
724     }
725     $self->{state} = DATA_STATE;
726     !!!next-input-character;
727    
728     !!!emit ($self->{ct}); # start tag or end tag
729    
730     redo A;
731     } elsif (0x0041 <= $self->{nc} and
732     $self->{nc} <= 0x005A) { # A..Z
733     !!!cp (49);
734     $self->{ca}
735     = {name => chr ($self->{nc} + 0x0020),
736     value => '',
737     line => $self->{line}, column => $self->{column}};
738     $self->{state} = ATTRIBUTE_NAME_STATE;
739     !!!next-input-character;
740     redo A;
741     } elsif ($self->{nc} == 0x002F) { # /
742     !!!cp (50);
743     $self->{state} = SELF_CLOSING_START_TAG_STATE;
744     !!!next-input-character;
745     redo A;
746     } elsif ($self->{nc} == -1) {
747     !!!parse-error (type => 'unclosed tag');
748     if ($self->{ct}->{type} == START_TAG_TOKEN) {
749     !!!cp (52);
750     $self->{last_stag_name} = $self->{ct}->{tag_name};
751     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
752     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
753     if ($self->{ct}->{attributes}) {
754     !!!cp (53);
755     !!!parse-error (type => 'end tag attribute');
756     } else {
757     !!!cp (54);
758     }
759     } else {
760     die "$0: $self->{ct}->{type}: Unknown token type";
761     }
762     $self->{state} = DATA_STATE;
763     # reconsume
764    
765     !!!emit ($self->{ct}); # start tag or end tag
766    
767     redo A;
768     } else {
769     if ({
770     0x0022 => 1, # "
771     0x0027 => 1, # '
772     0x003D => 1, # =
773     }->{$self->{nc}}) {
774     !!!cp (55);
775     !!!parse-error (type => 'bad attribute name');
776     } else {
777     !!!cp (56);
778     }
779     $self->{ca}
780     = {name => chr ($self->{nc}),
781     value => '',
782     line => $self->{line}, column => $self->{column}};
783     $self->{state} = ATTRIBUTE_NAME_STATE;
784     !!!next-input-character;
785     redo A;
786     }
787     } elsif ($self->{state} == ATTRIBUTE_NAME_STATE) {
788     my $before_leave = sub {
789     if (exists $self->{ct}->{attributes} # start tag or end tag
790     ->{$self->{ca}->{name}}) { # MUST
791     !!!cp (57);
792     !!!parse-error (type => 'duplicate attribute', text => $self->{ca}->{name}, line => $self->{ca}->{line}, column => $self->{ca}->{column});
793     ## Discard $self->{ca} # MUST
794     } else {
795     !!!cp (58);
796     $self->{ct}->{attributes}->{$self->{ca}->{name}}
797     = $self->{ca};
798     }
799     }; # $before_leave
800    
801     if ($is_space->{$self->{nc}}) {
802     !!!cp (59);
803     $before_leave->();
804     $self->{state} = AFTER_ATTRIBUTE_NAME_STATE;
805     !!!next-input-character;
806     redo A;
807     } elsif ($self->{nc} == 0x003D) { # =
808     !!!cp (60);
809     $before_leave->();
810     $self->{state} = BEFORE_ATTRIBUTE_VALUE_STATE;
811     !!!next-input-character;
812     redo A;
813     } elsif ($self->{nc} == 0x003E) { # >
814     $before_leave->();
815     if ($self->{ct}->{type} == START_TAG_TOKEN) {
816     !!!cp (61);
817     $self->{last_stag_name} = $self->{ct}->{tag_name};
818     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
819     !!!cp (62);
820     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
821     if ($self->{ct}->{attributes}) {
822     !!!parse-error (type => 'end tag attribute');
823     }
824     } else {
825     die "$0: $self->{ct}->{type}: Unknown token type";
826     }
827     $self->{state} = DATA_STATE;
828     !!!next-input-character;
829    
830     !!!emit ($self->{ct}); # start tag or end tag
831    
832     redo A;
833     } elsif (0x0041 <= $self->{nc} and
834     $self->{nc} <= 0x005A) { # A..Z
835     !!!cp (63);
836     $self->{ca}->{name} .= chr ($self->{nc} + 0x0020);
837     ## Stay in the state
838     !!!next-input-character;
839     redo A;
840     } elsif ($self->{nc} == 0x002F) { # /
841     !!!cp (64);
842     $before_leave->();
843     $self->{state} = SELF_CLOSING_START_TAG_STATE;
844     !!!next-input-character;
845     redo A;
846     } elsif ($self->{nc} == -1) {
847     !!!parse-error (type => 'unclosed tag');
848     $before_leave->();
849     if ($self->{ct}->{type} == START_TAG_TOKEN) {
850     !!!cp (66);
851     $self->{last_stag_name} = $self->{ct}->{tag_name};
852     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
853     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
854     if ($self->{ct}->{attributes}) {
855     !!!cp (67);
856     !!!parse-error (type => 'end tag attribute');
857     } else {
858     ## NOTE: This state should never be reached.
859     !!!cp (68);
860     }
861     } else {
862     die "$0: $self->{ct}->{type}: Unknown token type";
863     }
864     $self->{state} = DATA_STATE;
865     # reconsume
866    
867     !!!emit ($self->{ct}); # start tag or end tag
868    
869     redo A;
870     } else {
871     if ($self->{nc} == 0x0022 or # "
872     $self->{nc} == 0x0027) { # '
873     !!!cp (69);
874     !!!parse-error (type => 'bad attribute name');
875     } else {
876     !!!cp (70);
877     }
878     $self->{ca}->{name} .= chr ($self->{nc});
879     ## Stay in the state
880     !!!next-input-character;
881     redo A;
882     }
883     } elsif ($self->{state} == AFTER_ATTRIBUTE_NAME_STATE) {
884     if ($is_space->{$self->{nc}}) {
885     !!!cp (71);
886     ## Stay in the state
887     !!!next-input-character;
888     redo A;
889     } elsif ($self->{nc} == 0x003D) { # =
890     !!!cp (72);
891     $self->{state} = BEFORE_ATTRIBUTE_VALUE_STATE;
892     !!!next-input-character;
893     redo A;
894     } elsif ($self->{nc} == 0x003E) { # >
895     if ($self->{ct}->{type} == START_TAG_TOKEN) {
896     !!!cp (73);
897     $self->{last_stag_name} = $self->{ct}->{tag_name};
898     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
899     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
900     if ($self->{ct}->{attributes}) {
901     !!!cp (74);
902     !!!parse-error (type => 'end tag attribute');
903     } else {
904     ## NOTE: This state should never be reached.
905     !!!cp (75);
906     }
907     } else {
908     die "$0: $self->{ct}->{type}: Unknown token type";
909     }
910     $self->{state} = DATA_STATE;
911     !!!next-input-character;
912    
913     !!!emit ($self->{ct}); # start tag or end tag
914    
915     redo A;
916     } elsif (0x0041 <= $self->{nc} and
917     $self->{nc} <= 0x005A) { # A..Z
918     !!!cp (76);
919     $self->{ca}
920     = {name => chr ($self->{nc} + 0x0020),
921     value => '',
922     line => $self->{line}, column => $self->{column}};
923     $self->{state} = ATTRIBUTE_NAME_STATE;
924     !!!next-input-character;
925     redo A;
926     } elsif ($self->{nc} == 0x002F) { # /
927     !!!cp (77);
928     $self->{state} = SELF_CLOSING_START_TAG_STATE;
929     !!!next-input-character;
930     redo A;
931     } elsif ($self->{nc} == -1) {
932     !!!parse-error (type => 'unclosed tag');
933     if ($self->{ct}->{type} == START_TAG_TOKEN) {
934     !!!cp (79);
935     $self->{last_stag_name} = $self->{ct}->{tag_name};
936     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
937     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
938     if ($self->{ct}->{attributes}) {
939     !!!cp (80);
940     !!!parse-error (type => 'end tag attribute');
941     } else {
942     ## NOTE: This state should never be reached.
943     !!!cp (81);
944     }
945     } else {
946     die "$0: $self->{ct}->{type}: Unknown token type";
947     }
948     $self->{state} = DATA_STATE;
949     # reconsume
950    
951     !!!emit ($self->{ct}); # start tag or end tag
952    
953     redo A;
954     } else {
955     if ($self->{nc} == 0x0022 or # "
956     $self->{nc} == 0x0027) { # '
957     !!!cp (78);
958     !!!parse-error (type => 'bad attribute name');
959     } else {
960     !!!cp (82);
961     }
962     $self->{ca}
963     = {name => chr ($self->{nc}),
964     value => '',
965     line => $self->{line}, column => $self->{column}};
966     $self->{state} = ATTRIBUTE_NAME_STATE;
967     !!!next-input-character;
968     redo A;
969     }
970     } elsif ($self->{state} == BEFORE_ATTRIBUTE_VALUE_STATE) {
971     if ($is_space->{$self->{nc}}) {
972     !!!cp (83);
973     ## Stay in the state
974     !!!next-input-character;
975     redo A;
976     } elsif ($self->{nc} == 0x0022) { # "
977     !!!cp (84);
978     $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
979     !!!next-input-character;
980     redo A;
981     } elsif ($self->{nc} == 0x0026) { # &
982     !!!cp (85);
983     $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
984     ## reconsume
985     redo A;
986     } elsif ($self->{nc} == 0x0027) { # '
987     !!!cp (86);
988     $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
989     !!!next-input-character;
990     redo A;
991     } elsif ($self->{nc} == 0x003E) { # >
992     !!!parse-error (type => 'empty unquoted attribute value');
993     if ($self->{ct}->{type} == START_TAG_TOKEN) {
994     !!!cp (87);
995     $self->{last_stag_name} = $self->{ct}->{tag_name};
996     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
997     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
998     if ($self->{ct}->{attributes}) {
999     !!!cp (88);
1000     !!!parse-error (type => 'end tag attribute');
1001     } else {
1002     ## NOTE: This state should never be reached.
1003     !!!cp (89);
1004     }
1005     } else {
1006     die "$0: $self->{ct}->{type}: Unknown token type";
1007     }
1008     $self->{state} = DATA_STATE;
1009     !!!next-input-character;
1010    
1011     !!!emit ($self->{ct}); # start tag or end tag
1012    
1013     redo A;
1014     } elsif ($self->{nc} == -1) {
1015     !!!parse-error (type => 'unclosed tag');
1016     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1017     !!!cp (90);
1018     $self->{last_stag_name} = $self->{ct}->{tag_name};
1019     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1020     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1021     if ($self->{ct}->{attributes}) {
1022     !!!cp (91);
1023     !!!parse-error (type => 'end tag attribute');
1024     } else {
1025     ## NOTE: This state should never be reached.
1026     !!!cp (92);
1027     }
1028     } else {
1029     die "$0: $self->{ct}->{type}: Unknown token type";
1030     }
1031     $self->{state} = DATA_STATE;
1032     ## reconsume
1033    
1034     !!!emit ($self->{ct}); # start tag or end tag
1035    
1036     redo A;
1037     } else {
1038     if ($self->{nc} == 0x003D) { # =
1039     !!!cp (93);
1040     !!!parse-error (type => 'bad attribute value');
1041     } else {
1042     !!!cp (94);
1043     }
1044     $self->{ca}->{value} .= chr ($self->{nc});
1045     $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
1046     !!!next-input-character;
1047     redo A;
1048     }
1049     } elsif ($self->{state} == ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE) {
1050     if ($self->{nc} == 0x0022) { # "
1051     !!!cp (95);
1052     $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1053     !!!next-input-character;
1054     redo A;
1055     } elsif ($self->{nc} == 0x0026) { # &
1056     !!!cp (96);
1057     ## NOTE: In the spec, the tokenizer is switched to the
1058     ## "entity in attribute value state". In this implementation, the
1059     ## tokenizer is switched to the |ENTITY_STATE|, which is an
1060     ## implementation of the "consume a character reference" algorithm.
1061     $self->{prev_state} = $self->{state};
1062     $self->{entity_add} = 0x0022; # "
1063     $self->{state} = ENTITY_STATE;
1064     !!!next-input-character;
1065     redo A;
1066     } elsif ($self->{nc} == -1) {
1067     !!!parse-error (type => 'unclosed attribute value');
1068     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1069     !!!cp (97);
1070     $self->{last_stag_name} = $self->{ct}->{tag_name};
1071     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1072     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1073     if ($self->{ct}->{attributes}) {
1074     !!!cp (98);
1075     !!!parse-error (type => 'end tag attribute');
1076     } else {
1077     ## NOTE: This state should never be reached.
1078     !!!cp (99);
1079     }
1080     } else {
1081     die "$0: $self->{ct}->{type}: Unknown token type";
1082     }
1083     $self->{state} = DATA_STATE;
1084     ## reconsume
1085    
1086     !!!emit ($self->{ct}); # start tag or end tag
1087    
1088     redo A;
1089     } else {
1090     !!!cp (100);
1091     $self->{ca}->{value} .= chr ($self->{nc});
1092     $self->{read_until}->($self->{ca}->{value},
1093     q["&],
1094     length $self->{ca}->{value});
1095    
1096     ## Stay in the state
1097     !!!next-input-character;
1098     redo A;
1099     }
1100     } elsif ($self->{state} == ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE) {
1101     if ($self->{nc} == 0x0027) { # '
1102     !!!cp (101);
1103     $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1104     !!!next-input-character;
1105     redo A;
1106     } elsif ($self->{nc} == 0x0026) { # &
1107     !!!cp (102);
1108     ## NOTE: In the spec, the tokenizer is switched to the
1109     ## "entity in attribute value state". In this implementation, the
1110     ## tokenizer is switched to the |ENTITY_STATE|, which is an
1111     ## implementation of the "consume a character reference" algorithm.
1112     $self->{entity_add} = 0x0027; # '
1113     $self->{prev_state} = $self->{state};
1114     $self->{state} = ENTITY_STATE;
1115     !!!next-input-character;
1116     redo A;
1117     } elsif ($self->{nc} == -1) {
1118     !!!parse-error (type => 'unclosed attribute value');
1119     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1120     !!!cp (103);
1121     $self->{last_stag_name} = $self->{ct}->{tag_name};
1122     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1123     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1124     if ($self->{ct}->{attributes}) {
1125     !!!cp (104);
1126     !!!parse-error (type => 'end tag attribute');
1127     } else {
1128     ## NOTE: This state should never be reached.
1129     !!!cp (105);
1130     }
1131     } else {
1132     die "$0: $self->{ct}->{type}: Unknown token type";
1133     }
1134     $self->{state} = DATA_STATE;
1135     ## reconsume
1136    
1137     !!!emit ($self->{ct}); # start tag or end tag
1138    
1139     redo A;
1140     } else {
1141     !!!cp (106);
1142     $self->{ca}->{value} .= chr ($self->{nc});
1143     $self->{read_until}->($self->{ca}->{value},
1144     q['&],
1145     length $self->{ca}->{value});
1146    
1147     ## Stay in the state
1148     !!!next-input-character;
1149     redo A;
1150     }
1151     } elsif ($self->{state} == ATTRIBUTE_VALUE_UNQUOTED_STATE) {
1152     if ($is_space->{$self->{nc}}) {
1153     !!!cp (107);
1154     $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1155     !!!next-input-character;
1156     redo A;
1157     } elsif ($self->{nc} == 0x0026) { # &
1158     !!!cp (108);
1159     ## NOTE: In the spec, the tokenizer is switched to the
1160     ## "entity in attribute value state". In this implementation, the
1161     ## tokenizer is switched to the |ENTITY_STATE|, which is an
1162     ## implementation of the "consume a character reference" algorithm.
1163     $self->{entity_add} = -1;
1164     $self->{prev_state} = $self->{state};
1165     $self->{state} = ENTITY_STATE;
1166     !!!next-input-character;
1167     redo A;
1168     } elsif ($self->{nc} == 0x003E) { # >
1169     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1170     !!!cp (109);
1171     $self->{last_stag_name} = $self->{ct}->{tag_name};
1172     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1173     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1174     if ($self->{ct}->{attributes}) {
1175     !!!cp (110);
1176     !!!parse-error (type => 'end tag attribute');
1177     } else {
1178     ## NOTE: This state should never be reached.
1179     !!!cp (111);
1180     }
1181     } else {
1182     die "$0: $self->{ct}->{type}: Unknown token type";
1183     }
1184     $self->{state} = DATA_STATE;
1185     !!!next-input-character;
1186    
1187     !!!emit ($self->{ct}); # start tag or end tag
1188    
1189     redo A;
1190     } elsif ($self->{nc} == -1) {
1191     !!!parse-error (type => 'unclosed tag');
1192     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1193     !!!cp (112);
1194     $self->{last_stag_name} = $self->{ct}->{tag_name};
1195     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1196     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1197     if ($self->{ct}->{attributes}) {
1198     !!!cp (113);
1199     !!!parse-error (type => 'end tag attribute');
1200     } else {
1201     ## NOTE: This state should never be reached.
1202     !!!cp (114);
1203     }
1204     } else {
1205     die "$0: $self->{ct}->{type}: Unknown token type";
1206     }
1207     $self->{state} = DATA_STATE;
1208     ## reconsume
1209    
1210     !!!emit ($self->{ct}); # start tag or end tag
1211    
1212     redo A;
1213     } else {
1214     if ({
1215     0x0022 => 1, # "
1216     0x0027 => 1, # '
1217     0x003D => 1, # =
1218     }->{$self->{nc}}) {
1219     !!!cp (115);
1220     !!!parse-error (type => 'bad attribute value');
1221     } else {
1222     !!!cp (116);
1223     }
1224     $self->{ca}->{value} .= chr ($self->{nc});
1225     $self->{read_until}->($self->{ca}->{value},
1226     q["'=& >],
1227     length $self->{ca}->{value});
1228    
1229     ## Stay in the state
1230     !!!next-input-character;
1231     redo A;
1232     }
1233     } elsif ($self->{state} == AFTER_ATTRIBUTE_VALUE_QUOTED_STATE) {
1234     if ($is_space->{$self->{nc}}) {
1235     !!!cp (118);
1236     $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1237     !!!next-input-character;
1238     redo A;
1239     } elsif ($self->{nc} == 0x003E) { # >
1240     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1241     !!!cp (119);
1242     $self->{last_stag_name} = $self->{ct}->{tag_name};
1243     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1244     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1245     if ($self->{ct}->{attributes}) {
1246     !!!cp (120);
1247     !!!parse-error (type => 'end tag attribute');
1248     } else {
1249     ## NOTE: This state should never be reached.
1250     !!!cp (121);
1251     }
1252     } else {
1253     die "$0: $self->{ct}->{type}: Unknown token type";
1254     }
1255     $self->{state} = DATA_STATE;
1256     !!!next-input-character;
1257    
1258     !!!emit ($self->{ct}); # start tag or end tag
1259    
1260     redo A;
1261     } elsif ($self->{nc} == 0x002F) { # /
1262     !!!cp (122);
1263     $self->{state} = SELF_CLOSING_START_TAG_STATE;
1264     !!!next-input-character;
1265     redo A;
1266     } elsif ($self->{nc} == -1) {
1267     !!!parse-error (type => 'unclosed tag');
1268     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1269     !!!cp (122.3);
1270     $self->{last_stag_name} = $self->{ct}->{tag_name};
1271     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1272     if ($self->{ct}->{attributes}) {
1273     !!!cp (122.1);
1274     !!!parse-error (type => 'end tag attribute');
1275     } else {
1276     ## NOTE: This state should never be reached.
1277     !!!cp (122.2);
1278     }
1279     } else {
1280     die "$0: $self->{ct}->{type}: Unknown token type";
1281     }
1282     $self->{state} = DATA_STATE;
1283     ## Reconsume.
1284     !!!emit ($self->{ct}); # start tag or end tag
1285     redo A;
1286     } else {
1287     !!!cp ('124.1');
1288     !!!parse-error (type => 'no space between attributes');
1289     $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1290     ## reconsume
1291     redo A;
1292     }
1293     } elsif ($self->{state} == SELF_CLOSING_START_TAG_STATE) {
1294     if ($self->{nc} == 0x003E) { # >
1295     if ($self->{ct}->{type} == END_TAG_TOKEN) {
1296     !!!cp ('124.2');
1297     !!!parse-error (type => 'nestc', token => $self->{ct});
1298     ## TODO: Different type than slash in start tag
1299     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1300     if ($self->{ct}->{attributes}) {
1301     !!!cp ('124.4');
1302     !!!parse-error (type => 'end tag attribute');
1303     } else {
1304     !!!cp ('124.5');
1305     }
1306     ## TODO: Test |<title></title/>|
1307     } else {
1308     !!!cp ('124.3');
1309     $self->{self_closing} = 1;
1310     }
1311    
1312     $self->{state} = DATA_STATE;
1313     !!!next-input-character;
1314    
1315     !!!emit ($self->{ct}); # start tag or end tag
1316    
1317     redo A;
1318     } elsif ($self->{nc} == -1) {
1319     !!!parse-error (type => 'unclosed tag');
1320     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1321     !!!cp (124.7);
1322     $self->{last_stag_name} = $self->{ct}->{tag_name};
1323     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1324     if ($self->{ct}->{attributes}) {
1325     !!!cp (124.5);
1326     !!!parse-error (type => 'end tag attribute');
1327     } else {
1328     ## NOTE: This state should never be reached.
1329     !!!cp (124.6);
1330     }
1331     } else {
1332     die "$0: $self->{ct}->{type}: Unknown token type";
1333     }
1334     $self->{state} = DATA_STATE;
1335     ## Reconsume.
1336     !!!emit ($self->{ct}); # start tag or end tag
1337     redo A;
1338     } else {
1339     !!!cp ('124.4');
1340     !!!parse-error (type => 'nestc');
1341     ## TODO: This error type is wrong.
1342     $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1343     ## Reconsume.
1344     redo A;
1345     }
1346     } elsif ($self->{state} == BOGUS_COMMENT_STATE) {
1347     ## (only happens in the PCDATA state)
1348    
1349     ## NOTE: Unlike the spec's "bogus comment state", this implementation
1350     ## consumes characters on a one-by-one basis.
1351    
1352     if ($self->{nc} == 0x003E) { # >
1353     !!!cp (124);
1354     $self->{state} = DATA_STATE;
1355     !!!next-input-character;
1356    
1357     !!!emit ($self->{ct}); # comment
1358     redo A;
1359     } elsif ($self->{nc} == -1) {
1360     !!!cp (125);
1361     $self->{state} = DATA_STATE;
1362     ## reconsume
1363    
1364     !!!emit ($self->{ct}); # comment
1365     redo A;
1366     } else {
1367     !!!cp (126);
1368     $self->{ct}->{data} .= chr ($self->{nc}); # comment
1369     $self->{read_until}->($self->{ct}->{data},
1370     q[>],
1371     length $self->{ct}->{data});
1372    
1373     ## Stay in the state.
1374     !!!next-input-character;
1375     redo A;
1376     }
1377     } elsif ($self->{state} == MARKUP_DECLARATION_OPEN_STATE) {
1378     ## (only happens in the PCDATA state)
1379    
1380     if ($self->{nc} == 0x002D) { # -
1381     !!!cp (133);
1382     $self->{state} = MD_HYPHEN_STATE;
1383     !!!next-input-character;
1384     redo A;
1385     } elsif ($self->{nc} == 0x0044 or # D
1386     $self->{nc} == 0x0064) { # d
1387     ## ASCII case-insensitive.
1388     !!!cp (130);
1389     $self->{state} = MD_DOCTYPE_STATE;
1390     $self->{s_kwd} = chr $self->{nc};
1391     !!!next-input-character;
1392     redo A;
1393     } elsif ($self->{insertion_mode} & IN_FOREIGN_CONTENT_IM and
1394     $self->{open_elements}->[-1]->[1] & FOREIGN_EL and
1395     $self->{nc} == 0x005B) { # [
1396     !!!cp (135.4);
1397     $self->{state} = MD_CDATA_STATE;
1398     $self->{s_kwd} = '[';
1399     !!!next-input-character;
1400     redo A;
1401     } else {
1402     !!!cp (136);
1403     }
1404    
1405     !!!parse-error (type => 'bogus comment',
1406     line => $self->{line_prev},
1407     column => $self->{column_prev} - 1);
1408     ## Reconsume.
1409     $self->{state} = BOGUS_COMMENT_STATE;
1410     $self->{ct} = {type => COMMENT_TOKEN, data => '',
1411     line => $self->{line_prev},
1412     column => $self->{column_prev} - 1,
1413     };
1414     redo A;
1415     } elsif ($self->{state} == MD_HYPHEN_STATE) {
1416     if ($self->{nc} == 0x002D) { # -
1417     !!!cp (127);
1418     $self->{ct} = {type => COMMENT_TOKEN, data => '',
1419     line => $self->{line_prev},
1420     column => $self->{column_prev} - 2,
1421     };
1422     $self->{state} = COMMENT_START_STATE;
1423     !!!next-input-character;
1424     redo A;
1425     } else {
1426     !!!cp (128);
1427     !!!parse-error (type => 'bogus comment',
1428     line => $self->{line_prev},
1429     column => $self->{column_prev} - 2);
1430     $self->{state} = BOGUS_COMMENT_STATE;
1431     ## Reconsume.
1432     $self->{ct} = {type => COMMENT_TOKEN,
1433     data => '-',
1434     line => $self->{line_prev},
1435     column => $self->{column_prev} - 2,
1436     };
1437     redo A;
1438     }
1439     } elsif ($self->{state} == MD_DOCTYPE_STATE) {
1440     ## ASCII case-insensitive.
1441     if ($self->{nc} == [
1442     undef,
1443     0x004F, # O
1444     0x0043, # C
1445     0x0054, # T
1446     0x0059, # Y
1447     0x0050, # P
1448     ]->[length $self->{s_kwd}] or
1449     $self->{nc} == [
1450     undef,
1451     0x006F, # o
1452     0x0063, # c
1453     0x0074, # t
1454     0x0079, # y
1455     0x0070, # p
1456     ]->[length $self->{s_kwd}]) {
1457     !!!cp (131);
1458     ## Stay in the state.
1459     $self->{s_kwd} .= chr $self->{nc};
1460     !!!next-input-character;
1461     redo A;
1462     } elsif ((length $self->{s_kwd}) == 6 and
1463     ($self->{nc} == 0x0045 or # E
1464     $self->{nc} == 0x0065)) { # e
1465     !!!cp (129);
1466     $self->{state} = DOCTYPE_STATE;
1467     $self->{ct} = {type => DOCTYPE_TOKEN,
1468     quirks => 1,
1469     line => $self->{line_prev},
1470     column => $self->{column_prev} - 7,
1471     };
1472     !!!next-input-character;
1473     redo A;
1474     } else {
1475     !!!cp (132);
1476     !!!parse-error (type => 'bogus comment',
1477     line => $self->{line_prev},
1478     column => $self->{column_prev} - 1 - length $self->{s_kwd});
1479     $self->{state} = BOGUS_COMMENT_STATE;
1480     ## Reconsume.
1481     $self->{ct} = {type => COMMENT_TOKEN,
1482     data => $self->{s_kwd},
1483     line => $self->{line_prev},
1484     column => $self->{column_prev} - 1 - length $self->{s_kwd},
1485     };
1486     redo A;
1487     }
1488     } elsif ($self->{state} == MD_CDATA_STATE) {
1489     if ($self->{nc} == {
1490     '[' => 0x0043, # C
1491     '[C' => 0x0044, # D
1492     '[CD' => 0x0041, # A
1493     '[CDA' => 0x0054, # T
1494     '[CDAT' => 0x0041, # A
1495     }->{$self->{s_kwd}}) {
1496     !!!cp (135.1);
1497     ## Stay in the state.
1498     $self->{s_kwd} .= chr $self->{nc};
1499     !!!next-input-character;
1500     redo A;
1501     } elsif ($self->{s_kwd} eq '[CDATA' and
1502     $self->{nc} == 0x005B) { # [
1503     !!!cp (135.2);
1504     $self->{ct} = {type => CHARACTER_TOKEN,
1505     data => '',
1506     line => $self->{line_prev},
1507     column => $self->{column_prev} - 7};
1508     $self->{state} = CDATA_SECTION_STATE;
1509     !!!next-input-character;
1510     redo A;
1511     } else {
1512     !!!cp (135.3);
1513     !!!parse-error (type => 'bogus comment',
1514     line => $self->{line_prev},
1515     column => $self->{column_prev} - 1 - length $self->{s_kwd});
1516     $self->{state} = BOGUS_COMMENT_STATE;
1517     ## Reconsume.
1518     $self->{ct} = {type => COMMENT_TOKEN,
1519     data => $self->{s_kwd},
1520     line => $self->{line_prev},
1521     column => $self->{column_prev} - 1 - length $self->{s_kwd},
1522     };
1523     redo A;
1524     }
1525     } elsif ($self->{state} == COMMENT_START_STATE) {
1526     if ($self->{nc} == 0x002D) { # -
1527     !!!cp (137);
1528     $self->{state} = COMMENT_START_DASH_STATE;
1529     !!!next-input-character;
1530     redo A;
1531     } elsif ($self->{nc} == 0x003E) { # >
1532     !!!cp (138);
1533     !!!parse-error (type => 'bogus comment');
1534     $self->{state} = DATA_STATE;
1535     !!!next-input-character;
1536    
1537     !!!emit ($self->{ct}); # comment
1538    
1539     redo A;
1540     } elsif ($self->{nc} == -1) {
1541     !!!cp (139);
1542     !!!parse-error (type => 'unclosed comment');
1543     $self->{state} = DATA_STATE;
1544     ## reconsume
1545    
1546     !!!emit ($self->{ct}); # comment
1547    
1548     redo A;
1549     } else {
1550     !!!cp (140);
1551     $self->{ct}->{data} # comment
1552     .= chr ($self->{nc});
1553     $self->{state} = COMMENT_STATE;
1554     !!!next-input-character;
1555     redo A;
1556     }
1557     } elsif ($self->{state} == COMMENT_START_DASH_STATE) {
1558     if ($self->{nc} == 0x002D) { # -
1559     !!!cp (141);
1560     $self->{state} = COMMENT_END_STATE;
1561     !!!next-input-character;
1562     redo A;
1563     } elsif ($self->{nc} == 0x003E) { # >
1564     !!!cp (142);
1565     !!!parse-error (type => 'bogus comment');
1566     $self->{state} = DATA_STATE;
1567     !!!next-input-character;
1568    
1569     !!!emit ($self->{ct}); # comment
1570    
1571     redo A;
1572     } elsif ($self->{nc} == -1) {
1573     !!!cp (143);
1574     !!!parse-error (type => 'unclosed comment');
1575     $self->{state} = DATA_STATE;
1576     ## reconsume
1577    
1578     !!!emit ($self->{ct}); # comment
1579    
1580     redo A;
1581     } else {
1582     !!!cp (144);
1583     $self->{ct}->{data} # comment
1584     .= '-' . chr ($self->{nc});
1585     $self->{state} = COMMENT_STATE;
1586     !!!next-input-character;
1587     redo A;
1588     }
1589     } elsif ($self->{state} == COMMENT_STATE) {
1590     if ($self->{nc} == 0x002D) { # -
1591     !!!cp (145);
1592     $self->{state} = COMMENT_END_DASH_STATE;
1593     !!!next-input-character;
1594     redo A;
1595     } elsif ($self->{nc} == -1) {
1596     !!!cp (146);
1597     !!!parse-error (type => 'unclosed comment');
1598     $self->{state} = DATA_STATE;
1599     ## reconsume
1600    
1601     !!!emit ($self->{ct}); # comment
1602    
1603     redo A;
1604     } else {
1605     !!!cp (147);
1606     $self->{ct}->{data} .= chr ($self->{nc}); # comment
1607     $self->{read_until}->($self->{ct}->{data},
1608     q[-],
1609     length $self->{ct}->{data});
1610    
1611     ## Stay in the state
1612     !!!next-input-character;
1613     redo A;
1614     }
1615     } elsif ($self->{state} == COMMENT_END_DASH_STATE) {
1616     if ($self->{nc} == 0x002D) { # -
1617     !!!cp (148);
1618     $self->{state} = COMMENT_END_STATE;
1619     !!!next-input-character;
1620     redo A;
1621     } elsif ($self->{nc} == -1) {
1622     !!!cp (149);
1623     !!!parse-error (type => 'unclosed comment');
1624     $self->{state} = DATA_STATE;
1625     ## reconsume
1626    
1627     !!!emit ($self->{ct}); # comment
1628    
1629     redo A;
1630     } else {
1631     !!!cp (150);
1632     $self->{ct}->{data} .= '-' . chr ($self->{nc}); # comment
1633     $self->{state} = COMMENT_STATE;
1634     !!!next-input-character;
1635     redo A;
1636     }
1637     } elsif ($self->{state} == COMMENT_END_STATE) {
1638     if ($self->{nc} == 0x003E) { # >
1639     !!!cp (151);
1640     $self->{state} = DATA_STATE;
1641     !!!next-input-character;
1642    
1643     !!!emit ($self->{ct}); # comment
1644    
1645     redo A;
1646     } elsif ($self->{nc} == 0x002D) { # -
1647     !!!cp (152);
1648     !!!parse-error (type => 'dash in comment',
1649     line => $self->{line_prev},
1650     column => $self->{column_prev});
1651     $self->{ct}->{data} .= '-'; # comment
1652     ## Stay in the state
1653     !!!next-input-character;
1654     redo A;
1655     } elsif ($self->{nc} == -1) {
1656     !!!cp (153);
1657     !!!parse-error (type => 'unclosed comment');
1658     $self->{state} = DATA_STATE;
1659     ## reconsume
1660    
1661     !!!emit ($self->{ct}); # comment
1662    
1663     redo A;
1664     } else {
1665     !!!cp (154);
1666     !!!parse-error (type => 'dash in comment',
1667     line => $self->{line_prev},
1668     column => $self->{column_prev});
1669     $self->{ct}->{data} .= '--' . chr ($self->{nc}); # comment
1670     $self->{state} = COMMENT_STATE;
1671     !!!next-input-character;
1672     redo A;
1673     }
1674     } elsif ($self->{state} == DOCTYPE_STATE) {
1675     if ($is_space->{$self->{nc}}) {
1676     !!!cp (155);
1677     $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
1678     !!!next-input-character;
1679     redo A;
1680     } else {
1681     !!!cp (156);
1682     !!!parse-error (type => 'no space before DOCTYPE name');
1683     $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
1684     ## reconsume
1685     redo A;
1686     }
1687     } elsif ($self->{state} == BEFORE_DOCTYPE_NAME_STATE) {
1688     if ($is_space->{$self->{nc}}) {
1689     !!!cp (157);
1690     ## Stay in the state
1691     !!!next-input-character;
1692     redo A;
1693     } elsif ($self->{nc} == 0x003E) { # >
1694     !!!cp (158);
1695     !!!parse-error (type => 'no DOCTYPE name');
1696     $self->{state} = DATA_STATE;
1697     !!!next-input-character;
1698    
1699     !!!emit ($self->{ct}); # DOCTYPE (quirks)
1700    
1701     redo A;
1702     } elsif ($self->{nc} == -1) {
1703     !!!cp (159);
1704     !!!parse-error (type => 'no DOCTYPE name');
1705     $self->{state} = DATA_STATE;
1706     ## reconsume
1707    
1708     !!!emit ($self->{ct}); # DOCTYPE (quirks)
1709    
1710     redo A;
1711     } else {
1712     !!!cp (160);
1713     $self->{ct}->{name} = chr $self->{nc};
1714     delete $self->{ct}->{quirks};
1715     $self->{state} = DOCTYPE_NAME_STATE;
1716     !!!next-input-character;
1717     redo A;
1718     }
1719     } elsif ($self->{state} == DOCTYPE_NAME_STATE) {
1720     ## ISSUE: Redundant "First," in the spec.
1721     if ($is_space->{$self->{nc}}) {
1722     !!!cp (161);
1723     $self->{state} = AFTER_DOCTYPE_NAME_STATE;
1724     !!!next-input-character;
1725     redo A;
1726     } elsif ($self->{nc} == 0x003E) { # >
1727     !!!cp (162);
1728     $self->{state} = DATA_STATE;
1729     !!!next-input-character;
1730    
1731     !!!emit ($self->{ct}); # DOCTYPE
1732    
1733     redo A;
1734     } elsif ($self->{nc} == -1) {
1735     !!!cp (163);
1736     !!!parse-error (type => 'unclosed DOCTYPE');
1737     $self->{state} = DATA_STATE;
1738     ## reconsume
1739    
1740     $self->{ct}->{quirks} = 1;
1741     !!!emit ($self->{ct}); # DOCTYPE
1742    
1743     redo A;
1744     } else {
1745     !!!cp (164);
1746     $self->{ct}->{name}
1747     .= chr ($self->{nc}); # DOCTYPE
1748     ## Stay in the state
1749     !!!next-input-character;
1750     redo A;
1751     }
1752     } elsif ($self->{state} == AFTER_DOCTYPE_NAME_STATE) {
1753     if ($is_space->{$self->{nc}}) {
1754     !!!cp (165);
1755     ## Stay in the state
1756     !!!next-input-character;
1757     redo A;
1758     } elsif ($self->{nc} == 0x003E) { # >
1759     !!!cp (166);
1760     $self->{state} = DATA_STATE;
1761     !!!next-input-character;
1762    
1763     !!!emit ($self->{ct}); # DOCTYPE
1764    
1765     redo A;
1766     } elsif ($self->{nc} == -1) {
1767     !!!cp (167);
1768     !!!parse-error (type => 'unclosed DOCTYPE');
1769     $self->{state} = DATA_STATE;
1770     ## reconsume
1771    
1772     $self->{ct}->{quirks} = 1;
1773     !!!emit ($self->{ct}); # DOCTYPE
1774    
1775     redo A;
1776     } elsif ($self->{nc} == 0x0050 or # P
1777     $self->{nc} == 0x0070) { # p
1778     $self->{state} = PUBLIC_STATE;
1779     $self->{s_kwd} = chr $self->{nc};
1780     !!!next-input-character;
1781     redo A;
1782     } elsif ($self->{nc} == 0x0053 or # S
1783     $self->{nc} == 0x0073) { # s
1784     $self->{state} = SYSTEM_STATE;
1785     $self->{s_kwd} = chr $self->{nc};
1786     !!!next-input-character;
1787     redo A;
1788     } else {
1789     !!!cp (180);
1790     !!!parse-error (type => 'string after DOCTYPE name');
1791     $self->{ct}->{quirks} = 1;
1792    
1793     $self->{state} = BOGUS_DOCTYPE_STATE;
1794     !!!next-input-character;
1795     redo A;
1796     }
1797     } elsif ($self->{state} == PUBLIC_STATE) {
1798     ## ASCII case-insensitive
1799     if ($self->{nc} == [
1800     undef,
1801     0x0055, # U
1802     0x0042, # B
1803     0x004C, # L
1804     0x0049, # I
1805     ]->[length $self->{s_kwd}] or
1806     $self->{nc} == [
1807     undef,
1808     0x0075, # u
1809     0x0062, # b
1810     0x006C, # l
1811     0x0069, # i
1812     ]->[length $self->{s_kwd}]) {
1813     !!!cp (175);
1814     ## Stay in the state.
1815     $self->{s_kwd} .= chr $self->{nc};
1816     !!!next-input-character;
1817     redo A;
1818     } elsif ((length $self->{s_kwd}) == 5 and
1819     ($self->{nc} == 0x0043 or # C
1820     $self->{nc} == 0x0063)) { # c
1821     !!!cp (168);
1822     $self->{state} = BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
1823     !!!next-input-character;
1824     redo A;
1825     } else {
1826     !!!cp (169);
1827     !!!parse-error (type => 'string after DOCTYPE name',
1828     line => $self->{line_prev},
1829     column => $self->{column_prev} + 1 - length $self->{s_kwd});
1830     $self->{ct}->{quirks} = 1;
1831    
1832     $self->{state} = BOGUS_DOCTYPE_STATE;
1833     ## Reconsume.
1834     redo A;
1835     }
1836     } elsif ($self->{state} == SYSTEM_STATE) {
1837     ## ASCII case-insensitive
1838     if ($self->{nc} == [
1839     undef,
1840     0x0059, # Y
1841     0x0053, # S
1842     0x0054, # T
1843     0x0045, # E
1844     ]->[length $self->{s_kwd}] or
1845     $self->{nc} == [
1846     undef,
1847     0x0079, # y
1848     0x0073, # s
1849     0x0074, # t
1850     0x0065, # e
1851     ]->[length $self->{s_kwd}]) {
1852     !!!cp (170);
1853     ## Stay in the state.
1854     $self->{s_kwd} .= chr $self->{nc};
1855     !!!next-input-character;
1856     redo A;
1857     } elsif ((length $self->{s_kwd}) == 5 and
1858     ($self->{nc} == 0x004D or # M
1859     $self->{nc} == 0x006D)) { # m
1860     !!!cp (171);
1861     $self->{state} = BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
1862     !!!next-input-character;
1863     redo A;
1864     } else {
1865     !!!cp (172);
1866     !!!parse-error (type => 'string after DOCTYPE name',
1867     line => $self->{line_prev},
1868     column => $self->{column_prev} + 1 - length $self->{s_kwd});
1869     $self->{ct}->{quirks} = 1;
1870    
1871     $self->{state} = BOGUS_DOCTYPE_STATE;
1872     ## Reconsume.
1873     redo A;
1874     }
1875     } elsif ($self->{state} == BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE) {
1876     if ($is_space->{$self->{nc}}) {
1877     !!!cp (181);
1878     ## Stay in the state
1879     !!!next-input-character;
1880     redo A;
1881     } elsif ($self->{nc} eq 0x0022) { # "
1882     !!!cp (182);
1883     $self->{ct}->{pubid} = ''; # DOCTYPE
1884     $self->{state} = DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE;
1885     !!!next-input-character;
1886     redo A;
1887     } elsif ($self->{nc} eq 0x0027) { # '
1888     !!!cp (183);
1889     $self->{ct}->{pubid} = ''; # DOCTYPE
1890     $self->{state} = DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE;
1891     !!!next-input-character;
1892     redo A;
1893     } elsif ($self->{nc} eq 0x003E) { # >
1894     !!!cp (184);
1895     !!!parse-error (type => 'no PUBLIC literal');
1896    
1897     $self->{state} = DATA_STATE;
1898     !!!next-input-character;
1899    
1900     $self->{ct}->{quirks} = 1;
1901     !!!emit ($self->{ct}); # DOCTYPE
1902    
1903     redo A;
1904     } elsif ($self->{nc} == -1) {
1905     !!!cp (185);
1906     !!!parse-error (type => 'unclosed DOCTYPE');
1907    
1908     $self->{state} = DATA_STATE;
1909     ## reconsume
1910    
1911     $self->{ct}->{quirks} = 1;
1912     !!!emit ($self->{ct}); # DOCTYPE
1913    
1914     redo A;
1915     } else {
1916     !!!cp (186);
1917     !!!parse-error (type => 'string after PUBLIC');
1918     $self->{ct}->{quirks} = 1;
1919    
1920     $self->{state} = BOGUS_DOCTYPE_STATE;
1921     !!!next-input-character;
1922     redo A;
1923     }
1924     } elsif ($self->{state} == DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE) {
1925     if ($self->{nc} == 0x0022) { # "
1926     !!!cp (187);
1927     $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
1928     !!!next-input-character;
1929     redo A;
1930     } elsif ($self->{nc} == 0x003E) { # >
1931     !!!cp (188);
1932     !!!parse-error (type => 'unclosed PUBLIC literal');
1933    
1934     $self->{state} = DATA_STATE;
1935     !!!next-input-character;
1936    
1937     $self->{ct}->{quirks} = 1;
1938     !!!emit ($self->{ct}); # DOCTYPE
1939    
1940     redo A;
1941     } elsif ($self->{nc} == -1) {
1942     !!!cp (189);
1943     !!!parse-error (type => 'unclosed PUBLIC literal');
1944    
1945     $self->{state} = DATA_STATE;
1946     ## reconsume
1947    
1948     $self->{ct}->{quirks} = 1;
1949     !!!emit ($self->{ct}); # DOCTYPE
1950    
1951     redo A;
1952     } else {
1953     !!!cp (190);
1954     $self->{ct}->{pubid} # DOCTYPE
1955     .= chr $self->{nc};
1956     $self->{read_until}->($self->{ct}->{pubid}, q[">],
1957     length $self->{ct}->{pubid});
1958    
1959     ## Stay in the state
1960     !!!next-input-character;
1961     redo A;
1962     }
1963     } elsif ($self->{state} == DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE) {
1964     if ($self->{nc} == 0x0027) { # '
1965     !!!cp (191);
1966     $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
1967     !!!next-input-character;
1968     redo A;
1969     } elsif ($self->{nc} == 0x003E) { # >
1970     !!!cp (192);
1971     !!!parse-error (type => 'unclosed PUBLIC literal');
1972    
1973     $self->{state} = DATA_STATE;
1974     !!!next-input-character;
1975    
1976     $self->{ct}->{quirks} = 1;
1977     !!!emit ($self->{ct}); # DOCTYPE
1978    
1979     redo A;
1980     } elsif ($self->{nc} == -1) {
1981     !!!cp (193);
1982     !!!parse-error (type => 'unclosed PUBLIC literal');
1983    
1984     $self->{state} = DATA_STATE;
1985     ## reconsume
1986    
1987     $self->{ct}->{quirks} = 1;
1988     !!!emit ($self->{ct}); # DOCTYPE
1989    
1990     redo A;
1991     } else {
1992     !!!cp (194);
1993     $self->{ct}->{pubid} # DOCTYPE
1994     .= chr $self->{nc};
1995     $self->{read_until}->($self->{ct}->{pubid}, q['>],
1996     length $self->{ct}->{pubid});
1997    
1998     ## Stay in the state
1999     !!!next-input-character;
2000     redo A;
2001     }
2002     } elsif ($self->{state} == AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE) {
2003     if ($is_space->{$self->{nc}}) {
2004     !!!cp (195);
2005     ## Stay in the state
2006     !!!next-input-character;
2007     redo A;
2008     } elsif ($self->{nc} == 0x0022) { # "
2009     !!!cp (196);
2010     $self->{ct}->{sysid} = ''; # DOCTYPE
2011     $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
2012     !!!next-input-character;
2013     redo A;
2014     } elsif ($self->{nc} == 0x0027) { # '
2015     !!!cp (197);
2016     $self->{ct}->{sysid} = ''; # DOCTYPE
2017     $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
2018     !!!next-input-character;
2019     redo A;
2020     } elsif ($self->{nc} == 0x003E) { # >
2021     !!!cp (198);
2022     $self->{state} = DATA_STATE;
2023     !!!next-input-character;
2024    
2025     !!!emit ($self->{ct}); # DOCTYPE
2026    
2027     redo A;
2028     } elsif ($self->{nc} == -1) {
2029     !!!cp (199);
2030     !!!parse-error (type => 'unclosed DOCTYPE');
2031    
2032     $self->{state} = DATA_STATE;
2033     ## reconsume
2034    
2035     $self->{ct}->{quirks} = 1;
2036     !!!emit ($self->{ct}); # DOCTYPE
2037    
2038     redo A;
2039     } else {
2040     !!!cp (200);
2041     !!!parse-error (type => 'string after PUBLIC literal');
2042     $self->{ct}->{quirks} = 1;
2043    
2044     $self->{state} = BOGUS_DOCTYPE_STATE;
2045     !!!next-input-character;
2046     redo A;
2047     }
2048     } elsif ($self->{state} == BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
2049     if ($is_space->{$self->{nc}}) {
2050     !!!cp (201);
2051     ## Stay in the state
2052     !!!next-input-character;
2053     redo A;
2054     } elsif ($self->{nc} == 0x0022) { # "
2055     !!!cp (202);
2056     $self->{ct}->{sysid} = ''; # DOCTYPE
2057     $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
2058     !!!next-input-character;
2059     redo A;
2060     } elsif ($self->{nc} == 0x0027) { # '
2061     !!!cp (203);
2062     $self->{ct}->{sysid} = ''; # DOCTYPE
2063     $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
2064     !!!next-input-character;
2065     redo A;
2066     } elsif ($self->{nc} == 0x003E) { # >
2067     !!!cp (204);
2068     !!!parse-error (type => 'no SYSTEM literal');
2069     $self->{state} = DATA_STATE;
2070     !!!next-input-character;
2071    
2072     $self->{ct}->{quirks} = 1;
2073     !!!emit ($self->{ct}); # DOCTYPE
2074    
2075     redo A;
2076     } elsif ($self->{nc} == -1) {
2077     !!!cp (205);
2078     !!!parse-error (type => 'unclosed DOCTYPE');
2079    
2080     $self->{state} = DATA_STATE;
2081     ## reconsume
2082    
2083     $self->{ct}->{quirks} = 1;
2084     !!!emit ($self->{ct}); # DOCTYPE
2085    
2086     redo A;
2087     } else {
2088     !!!cp (206);
2089     !!!parse-error (type => 'string after SYSTEM');
2090     $self->{ct}->{quirks} = 1;
2091    
2092     $self->{state} = BOGUS_DOCTYPE_STATE;
2093     !!!next-input-character;
2094     redo A;
2095     }
2096     } elsif ($self->{state} == DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE) {
2097     if ($self->{nc} == 0x0022) { # "
2098     !!!cp (207);
2099     $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
2100     !!!next-input-character;
2101     redo A;
2102     } elsif ($self->{nc} == 0x003E) { # >
2103     !!!cp (208);
2104     !!!parse-error (type => 'unclosed SYSTEM literal');
2105    
2106     $self->{state} = DATA_STATE;
2107     !!!next-input-character;
2108    
2109     $self->{ct}->{quirks} = 1;
2110     !!!emit ($self->{ct}); # DOCTYPE
2111    
2112     redo A;
2113     } elsif ($self->{nc} == -1) {
2114     !!!cp (209);
2115     !!!parse-error (type => 'unclosed SYSTEM literal');
2116    
2117     $self->{state} = DATA_STATE;
2118     ## reconsume
2119    
2120     $self->{ct}->{quirks} = 1;
2121     !!!emit ($self->{ct}); # DOCTYPE
2122    
2123     redo A;
2124     } else {
2125     !!!cp (210);
2126     $self->{ct}->{sysid} # DOCTYPE
2127     .= chr $self->{nc};
2128     $self->{read_until}->($self->{ct}->{sysid}, q[">],
2129     length $self->{ct}->{sysid});
2130    
2131     ## Stay in the state
2132     !!!next-input-character;
2133     redo A;
2134     }
2135     } elsif ($self->{state} == DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE) {
2136     if ($self->{nc} == 0x0027) { # '
2137     !!!cp (211);
2138     $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
2139     !!!next-input-character;
2140     redo A;
2141     } elsif ($self->{nc} == 0x003E) { # >
2142     !!!cp (212);
2143     !!!parse-error (type => 'unclosed SYSTEM literal');
2144    
2145     $self->{state} = DATA_STATE;
2146     !!!next-input-character;
2147    
2148     $self->{ct}->{quirks} = 1;
2149     !!!emit ($self->{ct}); # DOCTYPE
2150    
2151     redo A;
2152     } elsif ($self->{nc} == -1) {
2153     !!!cp (213);
2154     !!!parse-error (type => 'unclosed SYSTEM literal');
2155    
2156     $self->{state} = DATA_STATE;
2157     ## reconsume
2158    
2159     $self->{ct}->{quirks} = 1;
2160     !!!emit ($self->{ct}); # DOCTYPE
2161    
2162     redo A;
2163     } else {
2164     !!!cp (214);
2165     $self->{ct}->{sysid} # DOCTYPE
2166     .= chr $self->{nc};
2167     $self->{read_until}->($self->{ct}->{sysid}, q['>],
2168     length $self->{ct}->{sysid});
2169    
2170     ## Stay in the state
2171     !!!next-input-character;
2172     redo A;
2173     }
2174     } elsif ($self->{state} == AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
2175     if ($is_space->{$self->{nc}}) {
2176     !!!cp (215);
2177     ## Stay in the state
2178     !!!next-input-character;
2179     redo A;
2180     } elsif ($self->{nc} == 0x003E) { # >
2181     !!!cp (216);
2182     $self->{state} = DATA_STATE;
2183     !!!next-input-character;
2184    
2185     !!!emit ($self->{ct}); # DOCTYPE
2186    
2187     redo A;
2188     } elsif ($self->{nc} == -1) {
2189     !!!cp (217);
2190     !!!parse-error (type => 'unclosed DOCTYPE');
2191     $self->{state} = DATA_STATE;
2192     ## reconsume
2193    
2194     $self->{ct}->{quirks} = 1;
2195     !!!emit ($self->{ct}); # DOCTYPE
2196    
2197     redo A;
2198     } else {
2199     !!!cp (218);
2200     !!!parse-error (type => 'string after SYSTEM literal');
2201     #$self->{ct}->{quirks} = 1;
2202    
2203     $self->{state} = BOGUS_DOCTYPE_STATE;
2204     !!!next-input-character;
2205     redo A;
2206     }
2207     } elsif ($self->{state} == BOGUS_DOCTYPE_STATE) {
2208     if ($self->{nc} == 0x003E) { # >
2209     !!!cp (219);
2210     $self->{state} = DATA_STATE;
2211     !!!next-input-character;
2212    
2213     !!!emit ($self->{ct}); # DOCTYPE
2214    
2215     redo A;
2216     } elsif ($self->{nc} == -1) {
2217     !!!cp (220);
2218     $self->{state} = DATA_STATE;
2219     ## reconsume
2220    
2221     !!!emit ($self->{ct}); # DOCTYPE
2222    
2223     redo A;
2224     } else {
2225     !!!cp (221);
2226     my $s = '';
2227     $self->{read_until}->($s, q[>], 0);
2228    
2229     ## Stay in the state
2230     !!!next-input-character;
2231     redo A;
2232     }
2233     } elsif ($self->{state} == CDATA_SECTION_STATE) {
2234     ## NOTE: "CDATA section state" in the spec is jointly implemented
2235     ## by three states, |CDATA_SECTION_STATE|, |CDATA_SECTION_MSE1_STATE|,
2236     ## and |CDATA_SECTION_MSE2_STATE|.
2237    
2238     if ($self->{nc} == 0x005D) { # ]
2239     !!!cp (221.1);
2240     $self->{state} = CDATA_SECTION_MSE1_STATE;
2241     !!!next-input-character;
2242     redo A;
2243     } elsif ($self->{nc} == -1) {
2244     $self->{state} = DATA_STATE;
2245     !!!next-input-character;
2246     if (length $self->{ct}->{data}) { # character
2247     !!!cp (221.2);
2248     !!!emit ($self->{ct}); # character
2249     } else {
2250     !!!cp (221.3);
2251     ## No token to emit. $self->{ct} is discarded.
2252     }
2253     redo A;
2254     } else {
2255     !!!cp (221.4);
2256     $self->{ct}->{data} .= chr $self->{nc};
2257     $self->{read_until}->($self->{ct}->{data},
2258     q<]>,
2259     length $self->{ct}->{data});
2260    
2261     ## Stay in the state.
2262     !!!next-input-character;
2263     redo A;
2264     }
2265    
2266     ## ISSUE: "text tokens" in spec.
2267     } elsif ($self->{state} == CDATA_SECTION_MSE1_STATE) {
2268     if ($self->{nc} == 0x005D) { # ]
2269     !!!cp (221.5);
2270     $self->{state} = CDATA_SECTION_MSE2_STATE;
2271     !!!next-input-character;
2272     redo A;
2273     } else {
2274     !!!cp (221.6);
2275     $self->{ct}->{data} .= ']';
2276     $self->{state} = CDATA_SECTION_STATE;
2277     ## Reconsume.
2278     redo A;
2279     }
2280     } elsif ($self->{state} == CDATA_SECTION_MSE2_STATE) {
2281     if ($self->{nc} == 0x003E) { # >
2282     $self->{state} = DATA_STATE;
2283     !!!next-input-character;
2284     if (length $self->{ct}->{data}) { # character
2285     !!!cp (221.7);
2286     !!!emit ($self->{ct}); # character
2287     } else {
2288     !!!cp (221.8);
2289     ## No token to emit. $self->{ct} is discarded.
2290     }
2291     redo A;
2292     } elsif ($self->{nc} == 0x005D) { # ]
2293     !!!cp (221.9); # character
2294     $self->{ct}->{data} .= ']'; ## Add first "]" of "]]]".
2295     ## Stay in the state.
2296     !!!next-input-character;
2297     redo A;
2298     } else {
2299     !!!cp (221.11);
2300     $self->{ct}->{data} .= ']]'; # character
2301     $self->{state} = CDATA_SECTION_STATE;
2302     ## Reconsume.
2303     redo A;
2304     }
2305     } elsif ($self->{state} == ENTITY_STATE) {
2306     if ($is_space->{$self->{nc}} or
2307     {
2308     0x003C => 1, 0x0026 => 1, -1 => 1, # <, &
2309     $self->{entity_add} => 1,
2310     }->{$self->{nc}}) {
2311     !!!cp (1001);
2312     ## Don't consume
2313     ## No error
2314     ## Return nothing.
2315     #
2316     } elsif ($self->{nc} == 0x0023) { # #
2317     !!!cp (999);
2318     $self->{state} = ENTITY_HASH_STATE;
2319     $self->{s_kwd} = '#';
2320     !!!next-input-character;
2321     redo A;
2322     } elsif ((0x0041 <= $self->{nc} and
2323     $self->{nc} <= 0x005A) or # A..Z
2324     (0x0061 <= $self->{nc} and
2325     $self->{nc} <= 0x007A)) { # a..z
2326     !!!cp (998);
2327     require Whatpm::_NamedEntityList;
2328     $self->{state} = ENTITY_NAME_STATE;
2329     $self->{s_kwd} = chr $self->{nc};
2330     $self->{entity__value} = $self->{s_kwd};
2331     $self->{entity__match} = 0;
2332     !!!next-input-character;
2333     redo A;
2334     } else {
2335     !!!cp (1027);
2336     !!!parse-error (type => 'bare ero');
2337     ## Return nothing.
2338     #
2339     }
2340    
2341     ## NOTE: No character is consumed by the "consume a character
2342     ## reference" algorithm. In other words, there is an "&" character
2343     ## that does not introduce a character reference, which would be
2344     ## appended to the parent element or the attribute value in later
2345     ## processing by the tokenizer.
2346    
2347     if ($self->{prev_state} == DATA_STATE) {
2348     !!!cp (997);
2349     $self->{state} = $self->{prev_state};
2350     ## Reconsume.
2351     !!!emit ({type => CHARACTER_TOKEN, data => '&',
2352     line => $self->{line_prev},
2353     column => $self->{column_prev},
2354     });
2355     redo A;
2356     } else {
2357     !!!cp (996);
2358     $self->{ca}->{value} .= '&';
2359     $self->{state} = $self->{prev_state};
2360     ## Reconsume.
2361     redo A;
2362     }
2363     } elsif ($self->{state} == ENTITY_HASH_STATE) {
2364     if ($self->{nc} == 0x0078 or # x
2365     $self->{nc} == 0x0058) { # X
2366     !!!cp (995);
2367     $self->{state} = HEXREF_X_STATE;
2368     $self->{s_kwd} .= chr $self->{nc};
2369     !!!next-input-character;
2370     redo A;
2371     } elsif (0x0030 <= $self->{nc} and
2372     $self->{nc} <= 0x0039) { # 0..9
2373     !!!cp (994);
2374     $self->{state} = NCR_NUM_STATE;
2375     $self->{s_kwd} = $self->{nc} - 0x0030;
2376     !!!next-input-character;
2377     redo A;
2378     } else {
2379     !!!parse-error (type => 'bare nero',
2380     line => $self->{line_prev},
2381     column => $self->{column_prev} - 1);
2382    
2383     ## NOTE: According to the spec algorithm, nothing is returned,
2384     ## and then "&#" is appended to the parent element or the attribute
2385     ## value in the later processing.
2386    
2387     if ($self->{prev_state} == DATA_STATE) {
2388     !!!cp (1019);
2389     $self->{state} = $self->{prev_state};
2390     ## Reconsume.
2391     !!!emit ({type => CHARACTER_TOKEN,
2392     data => '&#',
2393     line => $self->{line_prev},
2394     column => $self->{column_prev} - 1,
2395     });
2396     redo A;
2397     } else {
2398     !!!cp (993);
2399     $self->{ca}->{value} .= '&#';
2400     $self->{state} = $self->{prev_state};
2401     ## Reconsume.
2402     redo A;
2403     }
2404     }
2405     } elsif ($self->{state} == NCR_NUM_STATE) {
2406     if (0x0030 <= $self->{nc} and
2407     $self->{nc} <= 0x0039) { # 0..9
2408     !!!cp (1012);
2409     $self->{s_kwd} *= 10;
2410     $self->{s_kwd} += $self->{nc} - 0x0030;
2411    
2412     ## Stay in the state.
2413     !!!next-input-character;
2414     redo A;
2415     } elsif ($self->{nc} == 0x003B) { # ;
2416     !!!cp (1013);
2417     !!!next-input-character;
2418     #
2419     } else {
2420     !!!cp (1014);
2421     !!!parse-error (type => 'no refc');
2422     ## Reconsume.
2423     #
2424     }
2425    
2426     my $code = $self->{s_kwd};
2427     my $l = $self->{line_prev};
2428     my $c = $self->{column_prev};
2429     if ($charref_map->{$code}) {
2430     !!!cp (1015);
2431     !!!parse-error (type => 'invalid character reference',
2432     text => (sprintf 'U+%04X', $code),
2433     line => $l, column => $c);
2434     $code = $charref_map->{$code};
2435     } elsif ($code > 0x10FFFF) {
2436     !!!cp (1016);
2437     !!!parse-error (type => 'invalid character reference',
2438     text => (sprintf 'U-%08X', $code),
2439     line => $l, column => $c);
2440     $code = 0xFFFD;
2441     }
2442    
2443     if ($self->{prev_state} == DATA_STATE) {
2444     !!!cp (992);
2445     $self->{state} = $self->{prev_state};
2446     ## Reconsume.
2447     !!!emit ({type => CHARACTER_TOKEN, data => chr $code,
2448     line => $l, column => $c,
2449     });
2450     redo A;
2451     } else {
2452     !!!cp (991);
2453     $self->{ca}->{value} .= chr $code;
2454     $self->{ca}->{has_reference} = 1;
2455     $self->{state} = $self->{prev_state};
2456     ## Reconsume.
2457     redo A;
2458     }
2459     } elsif ($self->{state} == HEXREF_X_STATE) {
2460     if ((0x0030 <= $self->{nc} and $self->{nc} <= 0x0039) or
2461     (0x0041 <= $self->{nc} and $self->{nc} <= 0x0046) or
2462     (0x0061 <= $self->{nc} and $self->{nc} <= 0x0066)) {
2463     # 0..9, A..F, a..f
2464     !!!cp (990);
2465     $self->{state} = HEXREF_HEX_STATE;
2466     $self->{s_kwd} = 0;
2467     ## Reconsume.
2468     redo A;
2469     } else {
2470     !!!parse-error (type => 'bare hcro',
2471     line => $self->{line_prev},
2472     column => $self->{column_prev} - 2);
2473    
2474     ## NOTE: According to the spec algorithm, nothing is returned,
2475     ## and then "&#" followed by "X" or "x" is appended to the parent
2476     ## element or the attribute value in the later processing.
2477    
2478     if ($self->{prev_state} == DATA_STATE) {
2479     !!!cp (1005);
2480     $self->{state} = $self->{prev_state};
2481     ## Reconsume.
2482     !!!emit ({type => CHARACTER_TOKEN,
2483     data => '&' . $self->{s_kwd},
2484     line => $self->{line_prev},
2485     column => $self->{column_prev} - length $self->{s_kwd},
2486     });
2487     redo A;
2488     } else {
2489     !!!cp (989);
2490     $self->{ca}->{value} .= '&' . $self->{s_kwd};
2491     $self->{state} = $self->{prev_state};
2492     ## Reconsume.
2493     redo A;
2494     }
2495     }
2496     } elsif ($self->{state} == HEXREF_HEX_STATE) {
2497     if (0x0030 <= $self->{nc} and $self->{nc} <= 0x0039) {
2498     # 0..9
2499     !!!cp (1002);
2500     $self->{s_kwd} *= 0x10;
2501     $self->{s_kwd} += $self->{nc} - 0x0030;
2502     ## Stay in the state.
2503     !!!next-input-character;
2504     redo A;
2505     } elsif (0x0061 <= $self->{nc} and
2506     $self->{nc} <= 0x0066) { # a..f
2507     !!!cp (1003);
2508     $self->{s_kwd} *= 0x10;
2509     $self->{s_kwd} += $self->{nc} - 0x0060 + 9;
2510     ## Stay in the state.
2511     !!!next-input-character;
2512     redo A;
2513     } elsif (0x0041 <= $self->{nc} and
2514     $self->{nc} <= 0x0046) { # A..F
2515     !!!cp (1004);
2516     $self->{s_kwd} *= 0x10;
2517     $self->{s_kwd} += $self->{nc} - 0x0040 + 9;
2518     ## Stay in the state.
2519     !!!next-input-character;
2520     redo A;
2521     } elsif ($self->{nc} == 0x003B) { # ;
2522     !!!cp (1006);
2523     !!!next-input-character;
2524     #
2525     } else {
2526     !!!cp (1007);
2527     !!!parse-error (type => 'no refc',
2528     line => $self->{line},
2529     column => $self->{column});
2530     ## Reconsume.
2531     #
2532     }
2533    
2534     my $code = $self->{s_kwd};
2535     my $l = $self->{line_prev};
2536     my $c = $self->{column_prev};
2537     if ($charref_map->{$code}) {
2538     !!!cp (1008);
2539     !!!parse-error (type => 'invalid character reference',
2540     text => (sprintf 'U+%04X', $code),
2541     line => $l, column => $c);
2542     $code = $charref_map->{$code};
2543     } elsif ($code > 0x10FFFF) {
2544     !!!cp (1009);
2545     !!!parse-error (type => 'invalid character reference',
2546     text => (sprintf 'U-%08X', $code),
2547     line => $l, column => $c);
2548     $code = 0xFFFD;
2549     }
2550    
2551     if ($self->{prev_state} == DATA_STATE) {
2552     !!!cp (988);
2553     $self->{state} = $self->{prev_state};
2554     ## Reconsume.
2555     !!!emit ({type => CHARACTER_TOKEN, data => chr $code,
2556     line => $l, column => $c,
2557     });
2558     redo A;
2559     } else {
2560     !!!cp (987);
2561     $self->{ca}->{value} .= chr $code;
2562     $self->{ca}->{has_reference} = 1;
2563     $self->{state} = $self->{prev_state};
2564     ## Reconsume.
2565     redo A;
2566     }
2567     } elsif ($self->{state} == ENTITY_NAME_STATE) {
2568     if (length $self->{s_kwd} < 30 and
2569     ## NOTE: Some number greater than the maximum length of entity name
2570     ((0x0041 <= $self->{nc} and # a
2571     $self->{nc} <= 0x005A) or # x
2572     (0x0061 <= $self->{nc} and # a
2573     $self->{nc} <= 0x007A) or # z
2574     (0x0030 <= $self->{nc} and # 0
2575     $self->{nc} <= 0x0039) or # 9
2576     $self->{nc} == 0x003B)) { # ;
2577     our $EntityChar;
2578     $self->{s_kwd} .= chr $self->{nc};
2579     if (defined $EntityChar->{$self->{s_kwd}}) {
2580     if ($self->{nc} == 0x003B) { # ;
2581     !!!cp (1020);
2582     $self->{entity__value} = $EntityChar->{$self->{s_kwd}};
2583     $self->{entity__match} = 1;
2584     !!!next-input-character;
2585     #
2586     } else {
2587     !!!cp (1021);
2588     $self->{entity__value} = $EntityChar->{$self->{s_kwd}};
2589     $self->{entity__match} = -1;
2590     ## Stay in the state.
2591     !!!next-input-character;
2592     redo A;
2593     }
2594     } else {
2595     !!!cp (1022);
2596     $self->{entity__value} .= chr $self->{nc};
2597     $self->{entity__match} *= 2;
2598     ## Stay in the state.
2599     !!!next-input-character;
2600     redo A;
2601     }
2602     }
2603    
2604     my $data;
2605     my $has_ref;
2606     if ($self->{entity__match} > 0) {
2607     !!!cp (1023);
2608     $data = $self->{entity__value};
2609     $has_ref = 1;
2610     #
2611     } elsif ($self->{entity__match} < 0) {
2612     !!!parse-error (type => 'no refc');
2613     if ($self->{prev_state} != DATA_STATE and # in attribute
2614     $self->{entity__match} < -1) {
2615     !!!cp (1024);
2616     $data = '&' . $self->{s_kwd};
2617     #
2618     } else {
2619     !!!cp (1025);
2620     $data = $self->{entity__value};
2621     $has_ref = 1;
2622     #
2623     }
2624     } else {
2625     !!!cp (1026);
2626     !!!parse-error (type => 'bare ero',
2627     line => $self->{line_prev},
2628     column => $self->{column_prev} - length $self->{s_kwd});
2629     $data = '&' . $self->{s_kwd};
2630     #
2631     }
2632    
2633     ## NOTE: In these cases, when a character reference is found,
2634     ## it is consumed and a character token is returned, or, otherwise,
2635     ## nothing is consumed and returned, according to the spec algorithm.
2636     ## In this implementation, anything that has been examined by the
2637     ## tokenizer is appended to the parent element or the attribute value
2638     ## as string, either literal string when no character reference or
2639     ## entity-replaced string otherwise, in this stage, since any characters
2640     ## that would not be consumed are appended in the data state or in an
2641     ## appropriate attribute value state anyway.
2642    
2643     if ($self->{prev_state} == DATA_STATE) {
2644     !!!cp (986);
2645     $self->{state} = $self->{prev_state};
2646     ## Reconsume.
2647     !!!emit ({type => CHARACTER_TOKEN,
2648     data => $data,
2649     line => $self->{line_prev},
2650     column => $self->{column_prev} + 1 - length $self->{s_kwd},
2651     });
2652     redo A;
2653     } else {
2654     !!!cp (985);
2655     $self->{ca}->{value} .= $data;
2656     $self->{ca}->{has_reference} = 1 if $has_ref;
2657     $self->{state} = $self->{prev_state};
2658     ## Reconsume.
2659     redo A;
2660     }
2661     } else {
2662     die "$0: $self->{state}: Unknown state";
2663     }
2664     } # A
2665    
2666     die "$0: _get_next_token: unexpected case";
2667     } # _get_next_token
2668    
2669     1;
2670 wakaba 1.2 ## $Date: 2008/10/14 02:27:58 $

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24