/[suikacvs]/markup/html/whatpm/Whatpm/HTML/Tokenizer.pm.src
Suika

Contents of /markup/html/whatpm/Whatpm/HTML/Tokenizer.pm.src

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.3 - (hide annotations) (download) (as text)
Tue Oct 14 05:34:05 2008 UTC (16 years ago) by wakaba
Branch: MAIN
Changes since 1.2: +6 -4 lines
File MIME type: application/x-wais-source
++ whatpm/Whatpm/HTML/ChangeLog	14 Oct 2008 05:33:48 -0000
	* Tokenizer.pm.src: Introduced "in_xml" flag for CDATA section
	support in XML.

2008-10-14  Wakaba  <wakaba@suika.fam.cx>

++ whatpm/Whatpm/XML/ChangeLog	14 Oct 2008 05:34:00 -0000
	* Parser.pm.src: Set |in_xml| flag for tokenizer.

2008-10-14  Wakaba  <wakaba@suika.fam.cx>

	* Parser.pm.src: A bug on end tag handling fixed.

2008-10-14  Wakaba  <wakaba@suika.fam.cx>

1 wakaba 1.1 package Whatpm::HTML::Tokenizer;
2     use strict;
3 wakaba 1.3 our $VERSION=do{my @r=(q$Revision: 1.2 $=~/\d+/g);sprintf "%d."."%02d" x $#r,@r};
4 wakaba 1.2
## Compile-time Exporter setup for Whatpm::HTML::Tokenizer.
## Makes this package an Exporter subclass and declares the token type
## constants as exportable, both individually (@EXPORT_OK) and as a
## group through the |:token| tag (%EXPORT_TAGS).  Nothing is exported
## by default; no @EXPORT list is declared in this block.
5     BEGIN {
6     require Exporter;
7     push our @ISA, 'Exporter';
8    
## Names that may be requested one by one.
9     our @EXPORT_OK = qw(
10     DOCTYPE_TOKEN
11     COMMENT_TOKEN
12     START_TAG_TOKEN
13     END_TAG_TOKEN
14     END_OF_FILE_TOKEN
15     CHARACTER_TOKEN
16     PI_TOKEN
17     ABORT_TOKEN
18     );
19    
## The |:token| tag lists the same eight names as @EXPORT_OK; the two
## lists are maintained by hand and must be kept in sync.
20     our %EXPORT_TAGS = (
21     token => [qw(
22     DOCTYPE_TOKEN
23     COMMENT_TOKEN
24     START_TAG_TOKEN
25     END_TAG_TOKEN
26     END_OF_FILE_TOKEN
27     CHARACTER_TOKEN
28     PI_TOKEN
29     ABORT_TOKEN
30     )],
31     );
32     }
33    
34     ## Token types
35    
## Integer codes stored in a token's |->{type}| field.  Each is a sub
## with an empty prototype returning a literal, i.e. a constant
## function.  These are the names exported through the |:token| tag.
36     sub DOCTYPE_TOKEN () { 1 }
37     sub COMMENT_TOKEN () { 2 }
38     sub START_TAG_TOKEN () { 3 }
39     sub END_TAG_TOKEN () { 4 }
40     sub END_OF_FILE_TOKEN () { 5 }
41     sub CHARACTER_TOKEN () { 6 }
42     sub PI_TOKEN () { 7 } # XML5
43     sub ABORT_TOKEN () { 8 } # Not a token actually
44 wakaba 1.1
45     package Whatpm::HTML;
46    
## Import the token type constants (the |:token| export tag) into
## package Whatpm::HTML at compile time, so that barewords such as
## |CHARACTER_TOKEN| used further down in this package resolve to the
## constant subs defined in Whatpm::HTML::Tokenizer.
47 wakaba 1.2 BEGIN { Whatpm::HTML::Tokenizer->import (':token') }
48    
49 wakaba 1.1 ## Content model flags
50    
## Content model bit flags (CM_*) and the four content models built
## from them.  The tokenizer tests |$self->{content_model}| against the
## individual flags with bitwise AND, so each model is simply the set
## of markup kinds recognized in character data.
51     sub CM_ENTITY () { 0b001 } # & markup in data
52     sub CM_LIMITED_MARKUP () { 0b010 } # < markup in data (limited)
53     sub CM_FULL_MARKUP () { 0b100 } # < markup in data (any)
54    
## Resulting values: PLAINTEXT = 0b000, CDATA = 0b010, RCDATA = 0b011,
## PCDATA = 0b101.
55     sub PLAINTEXT_CONTENT_MODEL () { 0 }
56     sub CDATA_CONTENT_MODEL () { CM_LIMITED_MARKUP }
57     sub RCDATA_CONTENT_MODEL () { CM_ENTITY | CM_LIMITED_MARKUP }
58     sub PCDATA_CONTENT_MODEL () { CM_ENTITY | CM_FULL_MARKUP }
59    
60     ## Tokenizer states
61    
## Numeric identifiers for the tokenizer state machine; the current
## one is kept in |$self->{state}| and compared with |==|.
## Values 1 and 12 are unassigned: their constants are commented out
## below, and the NOTE further down says the corresponding spec states
## are implemented by the ENTITY_* / NCR / HEXREF states (44..49).
## Trailing comments of the form |"... state" in the spec| name the
## spec state that an implementation-specific state is a part of.
62     sub DATA_STATE () { 0 }
63     #sub ENTITY_DATA_STATE () { 1 }
64     sub TAG_OPEN_STATE () { 2 }
65     sub CLOSE_TAG_OPEN_STATE () { 3 }
66     sub TAG_NAME_STATE () { 4 }
67     sub BEFORE_ATTRIBUTE_NAME_STATE () { 5 }
68     sub ATTRIBUTE_NAME_STATE () { 6 }
69     sub AFTER_ATTRIBUTE_NAME_STATE () { 7 }
70     sub BEFORE_ATTRIBUTE_VALUE_STATE () { 8 }
71     sub ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE () { 9 }
72     sub ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE () { 10 }
73     sub ATTRIBUTE_VALUE_UNQUOTED_STATE () { 11 }
74     #sub ENTITY_IN_ATTRIBUTE_VALUE_STATE () { 12 }
75     sub MARKUP_DECLARATION_OPEN_STATE () { 13 }
76     sub COMMENT_START_STATE () { 14 }
77     sub COMMENT_START_DASH_STATE () { 15 }
78     sub COMMENT_STATE () { 16 }
79     sub COMMENT_END_STATE () { 17 }
80     sub COMMENT_END_DASH_STATE () { 18 }
81     sub BOGUS_COMMENT_STATE () { 19 }
82     sub DOCTYPE_STATE () { 20 }
83     sub BEFORE_DOCTYPE_NAME_STATE () { 21 }
84     sub DOCTYPE_NAME_STATE () { 22 }
85     sub AFTER_DOCTYPE_NAME_STATE () { 23 }
86     sub BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE () { 24 }
87     sub DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE () { 25 }
88     sub DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE () { 26 }
89     sub AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE () { 27 }
90     sub BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE () { 28 }
91     sub DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE () { 29 }
92     sub DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE () { 30 }
93     sub AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE () { 31 }
94     sub BOGUS_DOCTYPE_STATE () { 32 }
95     sub AFTER_ATTRIBUTE_VALUE_QUOTED_STATE () { 33 }
96     sub SELF_CLOSING_START_TAG_STATE () { 34 }
97     sub CDATA_SECTION_STATE () { 35 }
98     sub MD_HYPHEN_STATE () { 36 } # "markup declaration open state" in the spec
99     sub MD_DOCTYPE_STATE () { 37 } # "markup declaration open state" in the spec
100     sub MD_CDATA_STATE () { 38 } # "markup declaration open state" in the spec
101     sub CDATA_RCDATA_CLOSE_TAG_STATE () { 39 } # "close tag open state" in the spec
102     sub CDATA_SECTION_MSE1_STATE () { 40 } # "CDATA section state" in the spec
103     sub CDATA_SECTION_MSE2_STATE () { 41 } # "CDATA section state" in the spec
104     sub PUBLIC_STATE () { 42 } # "after DOCTYPE name state" in the spec
105     sub SYSTEM_STATE () { 43 } # "after DOCTYPE name state" in the spec
106     ## NOTE: "Entity data state", "entity in attribute value state", and
107     ## "consume a character reference" algorithm are jointly implemented
108     ## using the following six states:
109     sub ENTITY_STATE () { 44 }
110     sub ENTITY_HASH_STATE () { 45 }
111     sub NCR_NUM_STATE () { 46 }
112     sub HEXREF_X_STATE () { 47 }
113     sub HEXREF_HEX_STATE () { 48 }
114     sub ENTITY_NAME_STATE () { 49 }
115     sub PCDATA_STATE () { 50 } # "data state" in the spec
116    
117     ## Tree constructor state constants (see Whatpm::HTML for the full
118     ## list and descriptions)
119    
## Two bit-mask constants copied from the tree constructor.
## NOTE(review): both literals evaluate to the same number, 2048
## (bit 11); the underscore in the second one is only a digit
## separator.  Whether the insertion-mode flag and the element flag
## are meant to share a value cannot be told from this file - confirm
## against the definitions in Whatpm::HTML.
120     sub IN_FOREIGN_CONTENT_IM () { 0b100000000000 }
121     sub FOREIGN_EL () { 0b1_00000000000 }
122    
123     ## Character reference mappings
124    
## |$charref_map|: code point written in a numeric character reference
## => code point to use instead.  Code points without an entry are
## not remapped.  (Presumably consulted by the numeric character
## reference states of the tokenizer; the lookup itself is not in this
## part of the file.)
##
## The literal entries map CR to LF and the range 0x80..0x9F to other
## characters, with 0x81, 0x8D, 0x8F, 0x90 and 0x9D going to U+FFFD.
## NOTE(review): the 0x80..0x9F targets look like the Windows-1252
## assignments for those byte values - confirm against the spec.
125     my $charref_map = {
126     0x0D => 0x000A,
127     0x80 => 0x20AC,
128     0x81 => 0xFFFD,
129     0x82 => 0x201A,
130     0x83 => 0x0192,
131     0x84 => 0x201E,
132     0x85 => 0x2026,
133     0x86 => 0x2020,
134     0x87 => 0x2021,
135     0x88 => 0x02C6,
136     0x89 => 0x2030,
137     0x8A => 0x0160,
138     0x8B => 0x2039,
139     0x8C => 0x0152,
140     0x8D => 0xFFFD,
141     0x8E => 0x017D,
142     0x8F => 0xFFFD,
143     0x90 => 0xFFFD,
144     0x91 => 0x2018,
145     0x92 => 0x2019,
146     0x93 => 0x201C,
147     0x94 => 0x201D,
148     0x95 => 0x2022,
149     0x96 => 0x2013,
150     0x97 => 0x2014,
151     0x98 => 0x02DC,
152     0x99 => 0x2122,
153     0x9A => 0x0161,
154     0x9B => 0x203A,
155     0x9C => 0x0153,
156     0x9D => 0xFFFD,
157     0x9E => 0x017E,
158     0x9F => 0x0178,
159     }; # $charref_map
## Everything in the list below is replaced by U+FFFD: the control
## codes 0x00..0x1F other than 0x09, 0x0A, 0x0C and 0x0D, plus 0x7F,
## the range 0xD800..0xDFFF, the range 0xFDD0..0xFDDF, and the last two
## code points (xFFFE, xFFFF) of every plane up to 0x10FFFF.
## The existing ISSUE marker flags that the 0xFDD0 range stops at
## 0xFDDF here although 0xFDEF is the candidate upper bound.
160     $charref_map->{$_} = 0xFFFD
161     for 0x0000..0x0008, 0x000B, 0x000E..0x001F, 0x007F,
162     0xD800..0xDFFF, 0xFDD0..0xFDDF, ## ISSUE: 0xFDEF
163     0xFFFE, 0xFFFF, 0x1FFFE, 0x1FFFF, 0x2FFFE, 0x2FFFF, 0x3FFFE, 0x3FFFF,
164     0x4FFFE, 0x4FFFF, 0x5FFFE, 0x5FFFF, 0x6FFFE, 0x6FFFF, 0x7FFFE,
165     0x7FFFF, 0x8FFFE, 0x8FFFF, 0x9FFFE, 0x9FFFF, 0xAFFFE, 0xAFFFF,
166     0xBFFFE, 0xBFFFF, 0xCFFFE, 0xCFFFF, 0xDFFFE, 0xDFFFF, 0xEFFFE,
167     0xEFFFF, 0xFFFFE, 0xFFFFF, 0x10FFFE, 0x10FFFF;
168    
169     ## Implementations MUST act as if state machine in the spec
170    
## _initialize_tokenizer ($self)
##
## Resets the per-parse tokenizer fields on |$self| and reads the
## first input character.  Takes no other arguments; the return value
## is whatever the last statement yields and is not meant to be used.
## Fields listed as commented-out |#$self->{...}| lines are documented
## here but deliberately not assigned by this method.
171     sub _initialize_tokenizer ($) {
172     my $self = shift;
173    
174     ## NOTE: Fields set by |new| constructor:
175     #$self->{level}
176     #$self->{set_nc}
177     #$self->{parse_error}
178 wakaba 1.3 #$self->{is_xml} (if XML)
179 wakaba 1.1
## Start in the data state with the PCDATA content model.
180     $self->{state} = DATA_STATE; # MUST
181     #$self->{s_kwd}; # state keyword - initialized when used
182     #$self->{entity__value}; # initialized when used
183     #$self->{entity__match}; # initialized when used
184     $self->{content_model} = PCDATA_CONTENT_MODEL; # be
185     undef $self->{ct}; # current token
186     undef $self->{ca}; # current attribute
187     undef $self->{last_stag_name}; # last emitted start tag name
188     #$self->{prev_state}; # initialized when used
189     delete $self->{self_closing};
## Empty input buffer; |nc| is preset to -1 (the value the tokenizer
## treats as end of file) before the first character is fetched.
190     $self->{char_buffer} = '';
191     $self->{char_buffer_pos} = 0;
192     $self->{nc} = -1; # next input character
193     #$self->{next_nc}
## |!!!next-input-character| is a source macro, not Perl; presumably
## expanded when this .pm.src file is turned into the .pm module.
194     !!!next-input-character;
## Queue of already-produced tokens; |_get_next_token| returns from
## the front of this list before running the state machine.
195     $self->{token} = [];
196     # $self->{escape}
197     } # _initialize_tokenizer
198    
199     ## A token has:
200     ## ->{type} == DOCTYPE_TOKEN, START_TAG_TOKEN, END_TAG_TOKEN, COMMENT_TOKEN,
201     ## CHARACTER_TOKEN, or END_OF_FILE_TOKEN
202     ## ->{name} (DOCTYPE_TOKEN)
203     ## ->{tag_name} (START_TAG_TOKEN, END_TAG_TOKEN)
204     ## ->{pubid} (DOCTYPE_TOKEN)
205     ## ->{sysid} (DOCTYPE_TOKEN)
206     ## ->{quirks} == 1 or 0 (DOCTYPE_TOKEN): "force-quirks" flag
207     ## ->{attributes} isa HASH (START_TAG_TOKEN, END_TAG_TOKEN)
208     ## ->{name}
209     ## ->{value}
210     ## ->{has_reference} == 1 or 0
211     ## ->{data} (COMMENT_TOKEN, CHARACTER_TOKEN)
212     ## NOTE: The "self-closing flag" is held as |$self->{self_closing}|.
213     ## |->{self_closing}| is used to save the value of |$self->{self_closing}|
214     ## while the token is pushed back to the stack.
215    
216     ## Emitted token MUST immediately be handled by the tree construction state.
217    
218     ## Before each step, UA MAY check to see if either one of the scripts in
219     ## "list of scripts that will execute as soon as possible" or the first
220     ## script in the "list of scripts that will execute asynchronously",
221     ## has completed loading. If one has, then it MUST be executed
222     ## and removed from the list.
223    
224     ## TODO: Polytheistic slash SHOULD NOT be used. (Applied only to atheists.)
225     ## (This requirement was dropped from HTML5 spec, unfortunately.)
226    
## |$is_space|: lookup table keyed by code point; a true value means
## the tokenizer treats that character as white space when it tests
## |$is_space->{$self->{nc}}|.  Only HT, LF, FF and SP are present.
## The VT and CR entries are commented out, so both are looked up as
## undef (false).  NOTE(review): CR is presumably normalized before
## it reaches the tokenizer - confirm in the input-reading code.
227     my $is_space = {
228     0x0009 => 1, # CHARACTER TABULATION (HT)
229     0x000A => 1, # LINE FEED (LF)
230     #0x000B => 0, # LINE TABULATION (VT)
231     0x000C => 1, # FORM FEED (FF)
232     #0x000D => 1, # CARRIAGE RETURN (CR)
233     0x0020 => 1, # SPACE (SP)
234     };
235    
236     sub _get_next_token ($) {
237     my $self = shift;
238    
239     if ($self->{self_closing}) {
240     !!!parse-error (type => 'nestc', token => $self->{ct});
241     ## NOTE: The |self_closing| flag is only set by start tag token.
242     ## In addition, when a start tag token is emitted, it is always set to
243     ## |ct|.
244     delete $self->{self_closing};
245     }
246    
247     if (@{$self->{token}}) {
248     $self->{self_closing} = $self->{token}->[0]->{self_closing};
249     return shift @{$self->{token}};
250     }
251    
252     A: {
253     if ($self->{state} == PCDATA_STATE) {
254     ## NOTE: Same as |DATA_STATE|, but only for |PCDATA| content model.
255    
256     if ($self->{nc} == 0x0026) { # &
257     !!!cp (0.1);
258     ## NOTE: In the spec, the tokenizer is switched to the
259     ## "entity data state". In this implementation, the tokenizer
260     ## is switched to the |ENTITY_STATE|, which is an implementation
261     ## of the "consume a character reference" algorithm.
262     $self->{entity_add} = -1;
263     $self->{prev_state} = DATA_STATE;
264     $self->{state} = ENTITY_STATE;
265     !!!next-input-character;
266     redo A;
267     } elsif ($self->{nc} == 0x003C) { # <
268     !!!cp (0.2);
269     $self->{state} = TAG_OPEN_STATE;
270     !!!next-input-character;
271     redo A;
272     } elsif ($self->{nc} == -1) {
273     !!!cp (0.3);
274     !!!emit ({type => END_OF_FILE_TOKEN,
275     line => $self->{line}, column => $self->{column}});
276     last A; ## TODO: ok?
277     } else {
278     !!!cp (0.4);
279     #
280     }
281    
282     # Anything else
283     my $token = {type => CHARACTER_TOKEN,
284     data => chr $self->{nc},
285     line => $self->{line}, column => $self->{column},
286     };
287     $self->{read_until}->($token->{data}, q[<&], length $token->{data});
288    
289     ## Stay in the state.
290     !!!next-input-character;
291     !!!emit ($token);
292     redo A;
293     } elsif ($self->{state} == DATA_STATE) {
294     $self->{s_kwd} = '' unless defined $self->{s_kwd};
295     if ($self->{nc} == 0x0026) { # &
296     $self->{s_kwd} = '';
297     if ($self->{content_model} & CM_ENTITY and # PCDATA | RCDATA
298     not $self->{escape}) {
299     !!!cp (1);
300     ## NOTE: In the spec, the tokenizer is switched to the
301     ## "entity data state". In this implementation, the tokenizer
302     ## is switched to the |ENTITY_STATE|, which is an implementation
303     ## of the "consume a character reference" algorithm.
304     $self->{entity_add} = -1;
305     $self->{prev_state} = DATA_STATE;
306     $self->{state} = ENTITY_STATE;
307     !!!next-input-character;
308     redo A;
309     } else {
310     !!!cp (2);
311     #
312     }
313     } elsif ($self->{nc} == 0x002D) { # -
314     if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
315     $self->{s_kwd} .= '-';
316    
317     if ($self->{s_kwd} eq '<!--') {
318     !!!cp (3);
319     $self->{escape} = 1; # unless $self->{escape};
320     $self->{s_kwd} = '--';
321     #
322     } elsif ($self->{s_kwd} eq '---') {
323     !!!cp (4);
324     $self->{s_kwd} = '--';
325     #
326     } else {
327     !!!cp (5);
328     #
329     }
330     }
331    
332     #
333     } elsif ($self->{nc} == 0x0021) { # !
334     if (length $self->{s_kwd}) {
335     !!!cp (5.1);
336     $self->{s_kwd} .= '!';
337     #
338     } else {
339     !!!cp (5.2);
340     #$self->{s_kwd} = '';
341     #
342     }
343     #
344     } elsif ($self->{nc} == 0x003C) { # <
345     if ($self->{content_model} & CM_FULL_MARKUP or # PCDATA
346     (($self->{content_model} & CM_LIMITED_MARKUP) and # CDATA | RCDATA
347     not $self->{escape})) {
348     !!!cp (6);
349     $self->{state} = TAG_OPEN_STATE;
350     !!!next-input-character;
351     redo A;
352     } else {
353     !!!cp (7);
354     $self->{s_kwd} = '';
355     #
356     }
357     } elsif ($self->{nc} == 0x003E) { # >
358     if ($self->{escape} and
359     ($self->{content_model} & CM_LIMITED_MARKUP)) { # RCDATA | CDATA
360     if ($self->{s_kwd} eq '--') {
361     !!!cp (8);
362     delete $self->{escape};
363     } else {
364     !!!cp (9);
365     }
366     } else {
367     !!!cp (10);
368     }
369    
370     $self->{s_kwd} = '';
371     #
372     } elsif ($self->{nc} == -1) {
373     !!!cp (11);
374     $self->{s_kwd} = '';
375     !!!emit ({type => END_OF_FILE_TOKEN,
376     line => $self->{line}, column => $self->{column}});
377     last A; ## TODO: ok?
378     } else {
379     !!!cp (12);
380     $self->{s_kwd} = '';
381     #
382     }
383    
384     # Anything else
385     my $token = {type => CHARACTER_TOKEN,
386     data => chr $self->{nc},
387     line => $self->{line}, column => $self->{column},
388     };
389     if ($self->{read_until}->($token->{data}, q[-!<>&],
390     length $token->{data})) {
391     $self->{s_kwd} = '';
392     }
393    
394     ## Stay in the data state.
395     if ($self->{content_model} == PCDATA_CONTENT_MODEL) {
396     !!!cp (13);
397     $self->{state} = PCDATA_STATE;
398     } else {
399     !!!cp (14);
400     ## Stay in the state.
401     }
402     !!!next-input-character;
403     !!!emit ($token);
404     redo A;
405     } elsif ($self->{state} == TAG_OPEN_STATE) {
406     if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
407     if ($self->{nc} == 0x002F) { # /
408     !!!cp (15);
409     !!!next-input-character;
410     $self->{state} = CLOSE_TAG_OPEN_STATE;
411     redo A;
412     } elsif ($self->{nc} == 0x0021) { # !
413     !!!cp (15.1);
414     $self->{s_kwd} = '<' unless $self->{escape};
415     #
416     } else {
417     !!!cp (16);
418     #
419     }
420    
421     ## reconsume
422     $self->{state} = DATA_STATE;
423     !!!emit ({type => CHARACTER_TOKEN, data => '<',
424     line => $self->{line_prev},
425     column => $self->{column_prev},
426     });
427     redo A;
428     } elsif ($self->{content_model} & CM_FULL_MARKUP) { # PCDATA
429     if ($self->{nc} == 0x0021) { # !
430     !!!cp (17);
431     $self->{state} = MARKUP_DECLARATION_OPEN_STATE;
432     !!!next-input-character;
433     redo A;
434     } elsif ($self->{nc} == 0x002F) { # /
435     !!!cp (18);
436     $self->{state} = CLOSE_TAG_OPEN_STATE;
437     !!!next-input-character;
438     redo A;
439     } elsif (0x0041 <= $self->{nc} and
440     $self->{nc} <= 0x005A) { # A..Z
441     !!!cp (19);
442     $self->{ct}
443     = {type => START_TAG_TOKEN,
444     tag_name => chr ($self->{nc} + 0x0020),
445     line => $self->{line_prev},
446     column => $self->{column_prev}};
447     $self->{state} = TAG_NAME_STATE;
448     !!!next-input-character;
449     redo A;
450     } elsif (0x0061 <= $self->{nc} and
451     $self->{nc} <= 0x007A) { # a..z
452     !!!cp (20);
453     $self->{ct} = {type => START_TAG_TOKEN,
454     tag_name => chr ($self->{nc}),
455     line => $self->{line_prev},
456     column => $self->{column_prev}};
457     $self->{state} = TAG_NAME_STATE;
458     !!!next-input-character;
459     redo A;
460     } elsif ($self->{nc} == 0x003E) { # >
461     !!!cp (21);
462     !!!parse-error (type => 'empty start tag',
463     line => $self->{line_prev},
464     column => $self->{column_prev});
465     $self->{state} = DATA_STATE;
466     !!!next-input-character;
467    
468     !!!emit ({type => CHARACTER_TOKEN, data => '<>',
469     line => $self->{line_prev},
470     column => $self->{column_prev},
471     });
472    
473     redo A;
474     } elsif ($self->{nc} == 0x003F) { # ?
475     !!!cp (22);
476     !!!parse-error (type => 'pio',
477     line => $self->{line_prev},
478     column => $self->{column_prev});
479     $self->{state} = BOGUS_COMMENT_STATE;
480     $self->{ct} = {type => COMMENT_TOKEN, data => '',
481     line => $self->{line_prev},
482     column => $self->{column_prev},
483     };
484     ## $self->{nc} is intentionally left as is
485     redo A;
486     } else {
487     !!!cp (23);
488     !!!parse-error (type => 'bare stago',
489     line => $self->{line_prev},
490     column => $self->{column_prev});
491     $self->{state} = DATA_STATE;
492     ## reconsume
493    
494     !!!emit ({type => CHARACTER_TOKEN, data => '<',
495     line => $self->{line_prev},
496     column => $self->{column_prev},
497     });
498    
499     redo A;
500     }
501     } else {
502     die "$0: $self->{content_model} in tag open";
503     }
504     } elsif ($self->{state} == CLOSE_TAG_OPEN_STATE) {
505     ## NOTE: The "close tag open state" in the spec is implemented as
506     ## |CLOSE_TAG_OPEN_STATE| and |CDATA_RCDATA_CLOSE_TAG_STATE|.
507    
508     my ($l, $c) = ($self->{line_prev}, $self->{column_prev} - 1); # "<"of"</"
509     if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
510     if (defined $self->{last_stag_name}) {
511     $self->{state} = CDATA_RCDATA_CLOSE_TAG_STATE;
512     $self->{s_kwd} = '';
513     ## Reconsume.
514     redo A;
515     } else {
516     ## No start tag token has ever been emitted
517     ## NOTE: See <http://krijnhoetmer.nl/irc-logs/whatwg/20070626#l-564>.
518     !!!cp (28);
519     $self->{state} = DATA_STATE;
520     ## Reconsume.
521     !!!emit ({type => CHARACTER_TOKEN, data => '</',
522     line => $l, column => $c,
523     });
524     redo A;
525     }
526     }
527    
528     if (0x0041 <= $self->{nc} and
529     $self->{nc} <= 0x005A) { # A..Z
530     !!!cp (29);
531     $self->{ct}
532     = {type => END_TAG_TOKEN,
533     tag_name => chr ($self->{nc} + 0x0020),
534     line => $l, column => $c};
535     $self->{state} = TAG_NAME_STATE;
536     !!!next-input-character;
537     redo A;
538     } elsif (0x0061 <= $self->{nc} and
539     $self->{nc} <= 0x007A) { # a..z
540     !!!cp (30);
541     $self->{ct} = {type => END_TAG_TOKEN,
542     tag_name => chr ($self->{nc}),
543     line => $l, column => $c};
544     $self->{state} = TAG_NAME_STATE;
545     !!!next-input-character;
546     redo A;
547     } elsif ($self->{nc} == 0x003E) { # >
548     !!!cp (31);
549     !!!parse-error (type => 'empty end tag',
550     line => $self->{line_prev}, ## "<" in "</>"
551     column => $self->{column_prev} - 1);
552     $self->{state} = DATA_STATE;
553     !!!next-input-character;
554     redo A;
555     } elsif ($self->{nc} == -1) {
556     !!!cp (32);
557     !!!parse-error (type => 'bare etago');
558     $self->{state} = DATA_STATE;
559     # reconsume
560    
561     !!!emit ({type => CHARACTER_TOKEN, data => '</',
562     line => $l, column => $c,
563     });
564    
565     redo A;
566     } else {
567     !!!cp (33);
568     !!!parse-error (type => 'bogus end tag');
569     $self->{state} = BOGUS_COMMENT_STATE;
570     $self->{ct} = {type => COMMENT_TOKEN, data => '',
571     line => $self->{line_prev}, # "<" of "</"
572     column => $self->{column_prev} - 1,
573     };
574     ## NOTE: $self->{nc} is intentionally left as is.
575     ## Although the "anything else" case of the spec not explicitly
576     ## states that the next input character is to be reconsumed,
577     ## it will be included to the |data| of the comment token
578     ## generated from the bogus end tag, as defined in the
579     ## "bogus comment state" entry.
580     redo A;
581     }
582     } elsif ($self->{state} == CDATA_RCDATA_CLOSE_TAG_STATE) {
583     my $ch = substr $self->{last_stag_name}, length $self->{s_kwd}, 1;
584     if (length $ch) {
585     my $CH = $ch;
586     $ch =~ tr/a-z/A-Z/;
587     my $nch = chr $self->{nc};
588     if ($nch eq $ch or $nch eq $CH) {
589     !!!cp (24);
590     ## Stay in the state.
591     $self->{s_kwd} .= $nch;
592     !!!next-input-character;
593     redo A;
594     } else {
595     !!!cp (25);
596     $self->{state} = DATA_STATE;
597     ## Reconsume.
598     !!!emit ({type => CHARACTER_TOKEN,
599     data => '</' . $self->{s_kwd},
600     line => $self->{line_prev},
601     column => $self->{column_prev} - 1 - length $self->{s_kwd},
602     });
603     redo A;
604     }
605     } else { # after "<{tag-name}"
606     unless ($is_space->{$self->{nc}} or
607     {
608     0x003E => 1, # >
609     0x002F => 1, # /
610     -1 => 1, # EOF
611     }->{$self->{nc}}) {
612     !!!cp (26);
613     ## Reconsume.
614     $self->{state} = DATA_STATE;
615     !!!emit ({type => CHARACTER_TOKEN,
616     data => '</' . $self->{s_kwd},
617     line => $self->{line_prev},
618     column => $self->{column_prev} - 1 - length $self->{s_kwd},
619     });
620     redo A;
621     } else {
622     !!!cp (27);
623     $self->{ct}
624     = {type => END_TAG_TOKEN,
625     tag_name => $self->{last_stag_name},
626     line => $self->{line_prev},
627     column => $self->{column_prev} - 1 - length $self->{s_kwd}};
628     $self->{state} = TAG_NAME_STATE;
629     ## Reconsume.
630     redo A;
631     }
632     }
633     } elsif ($self->{state} == TAG_NAME_STATE) {
634     if ($is_space->{$self->{nc}}) {
635     !!!cp (34);
636     $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
637     !!!next-input-character;
638     redo A;
639     } elsif ($self->{nc} == 0x003E) { # >
640     if ($self->{ct}->{type} == START_TAG_TOKEN) {
641     !!!cp (35);
642     $self->{last_stag_name} = $self->{ct}->{tag_name};
643     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
644     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
645     #if ($self->{ct}->{attributes}) {
646     # ## NOTE: This should never be reached.
647     # !!! cp (36);
648     # !!! parse-error (type => 'end tag attribute');
649     #} else {
650     !!!cp (37);
651     #}
652     } else {
653     die "$0: $self->{ct}->{type}: Unknown token type";
654     }
655     $self->{state} = DATA_STATE;
656     !!!next-input-character;
657    
658     !!!emit ($self->{ct}); # start tag or end tag
659    
660     redo A;
661     } elsif (0x0041 <= $self->{nc} and
662     $self->{nc} <= 0x005A) { # A..Z
663     !!!cp (38);
664     $self->{ct}->{tag_name} .= chr ($self->{nc} + 0x0020);
665     # start tag or end tag
666     ## Stay in this state
667     !!!next-input-character;
668     redo A;
669     } elsif ($self->{nc} == -1) {
670     !!!parse-error (type => 'unclosed tag');
671     if ($self->{ct}->{type} == START_TAG_TOKEN) {
672     !!!cp (39);
673     $self->{last_stag_name} = $self->{ct}->{tag_name};
674     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
675     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
676     #if ($self->{ct}->{attributes}) {
677     # ## NOTE: This state should never be reached.
678     # !!! cp (40);
679     # !!! parse-error (type => 'end tag attribute');
680     #} else {
681     !!!cp (41);
682     #}
683     } else {
684     die "$0: $self->{ct}->{type}: Unknown token type";
685     }
686     $self->{state} = DATA_STATE;
687     # reconsume
688    
689     !!!emit ($self->{ct}); # start tag or end tag
690    
691     redo A;
692     } elsif ($self->{nc} == 0x002F) { # /
693     !!!cp (42);
694     $self->{state} = SELF_CLOSING_START_TAG_STATE;
695     !!!next-input-character;
696     redo A;
697     } else {
698     !!!cp (44);
699     $self->{ct}->{tag_name} .= chr $self->{nc};
700     # start tag or end tag
701     ## Stay in the state
702     !!!next-input-character;
703     redo A;
704     }
705     } elsif ($self->{state} == BEFORE_ATTRIBUTE_NAME_STATE) {
706     if ($is_space->{$self->{nc}}) {
707     !!!cp (45);
708     ## Stay in the state
709     !!!next-input-character;
710     redo A;
711     } elsif ($self->{nc} == 0x003E) { # >
712     if ($self->{ct}->{type} == START_TAG_TOKEN) {
713     !!!cp (46);
714     $self->{last_stag_name} = $self->{ct}->{tag_name};
715     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
716     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
717     if ($self->{ct}->{attributes}) {
718     !!!cp (47);
719     !!!parse-error (type => 'end tag attribute');
720     } else {
721     !!!cp (48);
722     }
723     } else {
724     die "$0: $self->{ct}->{type}: Unknown token type";
725     }
726     $self->{state} = DATA_STATE;
727     !!!next-input-character;
728    
729     !!!emit ($self->{ct}); # start tag or end tag
730    
731     redo A;
732     } elsif (0x0041 <= $self->{nc} and
733     $self->{nc} <= 0x005A) { # A..Z
734     !!!cp (49);
735     $self->{ca}
736     = {name => chr ($self->{nc} + 0x0020),
737     value => '',
738     line => $self->{line}, column => $self->{column}};
739     $self->{state} = ATTRIBUTE_NAME_STATE;
740     !!!next-input-character;
741     redo A;
742     } elsif ($self->{nc} == 0x002F) { # /
743     !!!cp (50);
744     $self->{state} = SELF_CLOSING_START_TAG_STATE;
745     !!!next-input-character;
746     redo A;
747     } elsif ($self->{nc} == -1) {
748     !!!parse-error (type => 'unclosed tag');
749     if ($self->{ct}->{type} == START_TAG_TOKEN) {
750     !!!cp (52);
751     $self->{last_stag_name} = $self->{ct}->{tag_name};
752     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
753     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
754     if ($self->{ct}->{attributes}) {
755     !!!cp (53);
756     !!!parse-error (type => 'end tag attribute');
757     } else {
758     !!!cp (54);
759     }
760     } else {
761     die "$0: $self->{ct}->{type}: Unknown token type";
762     }
763     $self->{state} = DATA_STATE;
764     # reconsume
765    
766     !!!emit ($self->{ct}); # start tag or end tag
767    
768     redo A;
769     } else {
770     if ({
771     0x0022 => 1, # "
772     0x0027 => 1, # '
773     0x003D => 1, # =
774     }->{$self->{nc}}) {
775     !!!cp (55);
776     !!!parse-error (type => 'bad attribute name');
777     } else {
778     !!!cp (56);
779     }
780     $self->{ca}
781     = {name => chr ($self->{nc}),
782     value => '',
783     line => $self->{line}, column => $self->{column}};
784     $self->{state} = ATTRIBUTE_NAME_STATE;
785     !!!next-input-character;
786     redo A;
787     }
788     } elsif ($self->{state} == ATTRIBUTE_NAME_STATE) {
789     my $before_leave = sub {
790     if (exists $self->{ct}->{attributes} # start tag or end tag
791     ->{$self->{ca}->{name}}) { # MUST
792     !!!cp (57);
793     !!!parse-error (type => 'duplicate attribute', text => $self->{ca}->{name}, line => $self->{ca}->{line}, column => $self->{ca}->{column});
794     ## Discard $self->{ca} # MUST
795     } else {
796     !!!cp (58);
797     $self->{ct}->{attributes}->{$self->{ca}->{name}}
798     = $self->{ca};
799     }
800     }; # $before_leave
801    
802     if ($is_space->{$self->{nc}}) {
803     !!!cp (59);
804     $before_leave->();
805     $self->{state} = AFTER_ATTRIBUTE_NAME_STATE;
806     !!!next-input-character;
807     redo A;
808     } elsif ($self->{nc} == 0x003D) { # =
809     !!!cp (60);
810     $before_leave->();
811     $self->{state} = BEFORE_ATTRIBUTE_VALUE_STATE;
812     !!!next-input-character;
813     redo A;
814     } elsif ($self->{nc} == 0x003E) { # >
815     $before_leave->();
816     if ($self->{ct}->{type} == START_TAG_TOKEN) {
817     !!!cp (61);
818     $self->{last_stag_name} = $self->{ct}->{tag_name};
819     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
820     !!!cp (62);
821     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
822     if ($self->{ct}->{attributes}) {
823     !!!parse-error (type => 'end tag attribute');
824     }
825     } else {
826     die "$0: $self->{ct}->{type}: Unknown token type";
827     }
828     $self->{state} = DATA_STATE;
829     !!!next-input-character;
830    
831     !!!emit ($self->{ct}); # start tag or end tag
832    
833     redo A;
834     } elsif (0x0041 <= $self->{nc} and
835     $self->{nc} <= 0x005A) { # A..Z
836     !!!cp (63);
837     $self->{ca}->{name} .= chr ($self->{nc} + 0x0020);
838     ## Stay in the state
839     !!!next-input-character;
840     redo A;
841     } elsif ($self->{nc} == 0x002F) { # /
842     !!!cp (64);
843     $before_leave->();
844     $self->{state} = SELF_CLOSING_START_TAG_STATE;
845     !!!next-input-character;
846     redo A;
847     } elsif ($self->{nc} == -1) {
848     !!!parse-error (type => 'unclosed tag');
849     $before_leave->();
850     if ($self->{ct}->{type} == START_TAG_TOKEN) {
851     !!!cp (66);
852     $self->{last_stag_name} = $self->{ct}->{tag_name};
853     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
854     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
855     if ($self->{ct}->{attributes}) {
856     !!!cp (67);
857     !!!parse-error (type => 'end tag attribute');
858     } else {
859     ## NOTE: This state should never be reached.
860     !!!cp (68);
861     }
862     } else {
863     die "$0: $self->{ct}->{type}: Unknown token type";
864     }
865     $self->{state} = DATA_STATE;
866     # reconsume
867    
868     !!!emit ($self->{ct}); # start tag or end tag
869    
870     redo A;
871     } else {
872     if ($self->{nc} == 0x0022 or # "
873     $self->{nc} == 0x0027) { # '
874     !!!cp (69);
875     !!!parse-error (type => 'bad attribute name');
876     } else {
877     !!!cp (70);
878     }
879     $self->{ca}->{name} .= chr ($self->{nc});
880     ## Stay in the state
881     !!!next-input-character;
882     redo A;
883     }
884     } elsif ($self->{state} == AFTER_ATTRIBUTE_NAME_STATE) {
885     if ($is_space->{$self->{nc}}) {
886     !!!cp (71);
887     ## Stay in the state
888     !!!next-input-character;
889     redo A;
890     } elsif ($self->{nc} == 0x003D) { # =
891     !!!cp (72);
892     $self->{state} = BEFORE_ATTRIBUTE_VALUE_STATE;
893     !!!next-input-character;
894     redo A;
895     } elsif ($self->{nc} == 0x003E) { # >
896     if ($self->{ct}->{type} == START_TAG_TOKEN) {
897     !!!cp (73);
898     $self->{last_stag_name} = $self->{ct}->{tag_name};
899     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
900     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
901     if ($self->{ct}->{attributes}) {
902     !!!cp (74);
903     !!!parse-error (type => 'end tag attribute');
904     } else {
905     ## NOTE: This state should never be reached.
906     !!!cp (75);
907     }
908     } else {
909     die "$0: $self->{ct}->{type}: Unknown token type";
910     }
911     $self->{state} = DATA_STATE;
912     !!!next-input-character;
913    
914     !!!emit ($self->{ct}); # start tag or end tag
915    
916     redo A;
917     } elsif (0x0041 <= $self->{nc} and
918     $self->{nc} <= 0x005A) { # A..Z
919     !!!cp (76);
920     $self->{ca}
921     = {name => chr ($self->{nc} + 0x0020),
922     value => '',
923     line => $self->{line}, column => $self->{column}};
924     $self->{state} = ATTRIBUTE_NAME_STATE;
925     !!!next-input-character;
926     redo A;
927     } elsif ($self->{nc} == 0x002F) { # /
928     !!!cp (77);
929     $self->{state} = SELF_CLOSING_START_TAG_STATE;
930     !!!next-input-character;
931     redo A;
932     } elsif ($self->{nc} == -1) {
933     !!!parse-error (type => 'unclosed tag');
934     if ($self->{ct}->{type} == START_TAG_TOKEN) {
935     !!!cp (79);
936     $self->{last_stag_name} = $self->{ct}->{tag_name};
937     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
938     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
939     if ($self->{ct}->{attributes}) {
940     !!!cp (80);
941     !!!parse-error (type => 'end tag attribute');
942     } else {
943     ## NOTE: This state should never be reached.
944     !!!cp (81);
945     }
946     } else {
947     die "$0: $self->{ct}->{type}: Unknown token type";
948     }
949     $self->{state} = DATA_STATE;
950     # reconsume
951    
952     !!!emit ($self->{ct}); # start tag or end tag
953    
954     redo A;
955     } else {
956     if ($self->{nc} == 0x0022 or # "
957     $self->{nc} == 0x0027) { # '
958     !!!cp (78);
959     !!!parse-error (type => 'bad attribute name');
960     } else {
961     !!!cp (82);
962     }
963     $self->{ca}
964     = {name => chr ($self->{nc}),
965     value => '',
966     line => $self->{line}, column => $self->{column}};
967     $self->{state} = ATTRIBUTE_NAME_STATE;
968     !!!next-input-character;
969     redo A;
970     }
971     } elsif ($self->{state} == BEFORE_ATTRIBUTE_VALUE_STATE) {
972     if ($is_space->{$self->{nc}}) {
973     !!!cp (83);
974     ## Stay in the state
975     !!!next-input-character;
976     redo A;
977     } elsif ($self->{nc} == 0x0022) { # "
978     !!!cp (84);
979     $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
980     !!!next-input-character;
981     redo A;
982     } elsif ($self->{nc} == 0x0026) { # &
983     !!!cp (85);
984     $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
985     ## reconsume
986     redo A;
987     } elsif ($self->{nc} == 0x0027) { # '
988     !!!cp (86);
989     $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
990     !!!next-input-character;
991     redo A;
992     } elsif ($self->{nc} == 0x003E) { # >
993     !!!parse-error (type => 'empty unquoted attribute value');
994     if ($self->{ct}->{type} == START_TAG_TOKEN) {
995     !!!cp (87);
996     $self->{last_stag_name} = $self->{ct}->{tag_name};
997     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
998     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
999     if ($self->{ct}->{attributes}) {
1000     !!!cp (88);
1001     !!!parse-error (type => 'end tag attribute');
1002     } else {
1003     ## NOTE: This state should never be reached.
1004     !!!cp (89);
1005     }
1006     } else {
1007     die "$0: $self->{ct}->{type}: Unknown token type";
1008     }
1009     $self->{state} = DATA_STATE;
1010     !!!next-input-character;
1011    
1012     !!!emit ($self->{ct}); # start tag or end tag
1013    
1014     redo A;
1015     } elsif ($self->{nc} == -1) {
1016     !!!parse-error (type => 'unclosed tag');
1017     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1018     !!!cp (90);
1019     $self->{last_stag_name} = $self->{ct}->{tag_name};
1020     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1021     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1022     if ($self->{ct}->{attributes}) {
1023     !!!cp (91);
1024     !!!parse-error (type => 'end tag attribute');
1025     } else {
1026     ## NOTE: This state should never be reached.
1027     !!!cp (92);
1028     }
1029     } else {
1030     die "$0: $self->{ct}->{type}: Unknown token type";
1031     }
1032     $self->{state} = DATA_STATE;
1033     ## reconsume
1034    
1035     !!!emit ($self->{ct}); # start tag or end tag
1036    
1037     redo A;
1038     } else {
1039     if ($self->{nc} == 0x003D) { # =
1040     !!!cp (93);
1041     !!!parse-error (type => 'bad attribute value');
1042     } else {
1043     !!!cp (94);
1044     }
1045     $self->{ca}->{value} .= chr ($self->{nc});
1046     $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
1047     !!!next-input-character;
1048     redo A;
1049     }
1050     } elsif ($self->{state} == ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE) {
1051     if ($self->{nc} == 0x0022) { # "
1052     !!!cp (95);
1053     $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1054     !!!next-input-character;
1055     redo A;
1056     } elsif ($self->{nc} == 0x0026) { # &
1057     !!!cp (96);
1058     ## NOTE: In the spec, the tokenizer is switched to the
1059     ## "entity in attribute value state". In this implementation, the
1060     ## tokenizer is switched to the |ENTITY_STATE|, which is an
1061     ## implementation of the "consume a character reference" algorithm.
1062     $self->{prev_state} = $self->{state};
1063     $self->{entity_add} = 0x0022; # "
1064     $self->{state} = ENTITY_STATE;
1065     !!!next-input-character;
1066     redo A;
1067     } elsif ($self->{nc} == -1) {
1068     !!!parse-error (type => 'unclosed attribute value');
1069     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1070     !!!cp (97);
1071     $self->{last_stag_name} = $self->{ct}->{tag_name};
1072     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1073     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1074     if ($self->{ct}->{attributes}) {
1075     !!!cp (98);
1076     !!!parse-error (type => 'end tag attribute');
1077     } else {
1078     ## NOTE: This state should never be reached.
1079     !!!cp (99);
1080     }
1081     } else {
1082     die "$0: $self->{ct}->{type}: Unknown token type";
1083     }
1084     $self->{state} = DATA_STATE;
1085     ## reconsume
1086    
1087     !!!emit ($self->{ct}); # start tag or end tag
1088    
1089     redo A;
1090     } else {
1091     !!!cp (100);
1092     $self->{ca}->{value} .= chr ($self->{nc});
1093     $self->{read_until}->($self->{ca}->{value},
1094     q["&],
1095     length $self->{ca}->{value});
1096    
1097     ## Stay in the state
1098     !!!next-input-character;
1099     redo A;
1100     }
1101     } elsif ($self->{state} == ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE) {
1102     if ($self->{nc} == 0x0027) { # '
1103     !!!cp (101);
1104     $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1105     !!!next-input-character;
1106     redo A;
1107     } elsif ($self->{nc} == 0x0026) { # &
1108     !!!cp (102);
1109     ## NOTE: In the spec, the tokenizer is switched to the
1110     ## "entity in attribute value state". In this implementation, the
1111     ## tokenizer is switched to the |ENTITY_STATE|, which is an
1112     ## implementation of the "consume a character reference" algorithm.
1113     $self->{entity_add} = 0x0027; # '
1114     $self->{prev_state} = $self->{state};
1115     $self->{state} = ENTITY_STATE;
1116     !!!next-input-character;
1117     redo A;
1118     } elsif ($self->{nc} == -1) {
1119     !!!parse-error (type => 'unclosed attribute value');
1120     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1121     !!!cp (103);
1122     $self->{last_stag_name} = $self->{ct}->{tag_name};
1123     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1124     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1125     if ($self->{ct}->{attributes}) {
1126     !!!cp (104);
1127     !!!parse-error (type => 'end tag attribute');
1128     } else {
1129     ## NOTE: This state should never be reached.
1130     !!!cp (105);
1131     }
1132     } else {
1133     die "$0: $self->{ct}->{type}: Unknown token type";
1134     }
1135     $self->{state} = DATA_STATE;
1136     ## reconsume
1137    
1138     !!!emit ($self->{ct}); # start tag or end tag
1139    
1140     redo A;
1141     } else {
1142     !!!cp (106);
1143     $self->{ca}->{value} .= chr ($self->{nc});
1144     $self->{read_until}->($self->{ca}->{value},
1145     q['&],
1146     length $self->{ca}->{value});
1147    
1148     ## Stay in the state
1149     !!!next-input-character;
1150     redo A;
1151     }
1152     } elsif ($self->{state} == ATTRIBUTE_VALUE_UNQUOTED_STATE) {
1153     if ($is_space->{$self->{nc}}) {
1154     !!!cp (107);
1155     $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1156     !!!next-input-character;
1157     redo A;
1158     } elsif ($self->{nc} == 0x0026) { # &
1159     !!!cp (108);
1160     ## NOTE: In the spec, the tokenizer is switched to the
1161     ## "entity in attribute value state". In this implementation, the
1162     ## tokenizer is switched to the |ENTITY_STATE|, which is an
1163     ## implementation of the "consume a character reference" algorithm.
1164     $self->{entity_add} = -1;
1165     $self->{prev_state} = $self->{state};
1166     $self->{state} = ENTITY_STATE;
1167     !!!next-input-character;
1168     redo A;
1169     } elsif ($self->{nc} == 0x003E) { # >
1170     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1171     !!!cp (109);
1172     $self->{last_stag_name} = $self->{ct}->{tag_name};
1173     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1174     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1175     if ($self->{ct}->{attributes}) {
1176     !!!cp (110);
1177     !!!parse-error (type => 'end tag attribute');
1178     } else {
1179     ## NOTE: This state should never be reached.
1180     !!!cp (111);
1181     }
1182     } else {
1183     die "$0: $self->{ct}->{type}: Unknown token type";
1184     }
1185     $self->{state} = DATA_STATE;
1186     !!!next-input-character;
1187    
1188     !!!emit ($self->{ct}); # start tag or end tag
1189    
1190     redo A;
1191     } elsif ($self->{nc} == -1) {
1192     !!!parse-error (type => 'unclosed tag');
1193     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1194     !!!cp (112);
1195     $self->{last_stag_name} = $self->{ct}->{tag_name};
1196     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1197     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1198     if ($self->{ct}->{attributes}) {
1199     !!!cp (113);
1200     !!!parse-error (type => 'end tag attribute');
1201     } else {
1202     ## NOTE: This state should never be reached.
1203     !!!cp (114);
1204     }
1205     } else {
1206     die "$0: $self->{ct}->{type}: Unknown token type";
1207     }
1208     $self->{state} = DATA_STATE;
1209     ## reconsume
1210    
1211     !!!emit ($self->{ct}); # start tag or end tag
1212    
1213     redo A;
1214     } else {
1215     if ({
1216     0x0022 => 1, # "
1217     0x0027 => 1, # '
1218     0x003D => 1, # =
1219     }->{$self->{nc}}) {
1220     !!!cp (115);
1221     !!!parse-error (type => 'bad attribute value');
1222     } else {
1223     !!!cp (116);
1224     }
1225     $self->{ca}->{value} .= chr ($self->{nc});
1226     $self->{read_until}->($self->{ca}->{value},
1227     q["'=& >],
1228     length $self->{ca}->{value});
1229    
1230     ## Stay in the state
1231     !!!next-input-character;
1232     redo A;
1233     }
1234     } elsif ($self->{state} == AFTER_ATTRIBUTE_VALUE_QUOTED_STATE) {
1235     if ($is_space->{$self->{nc}}) {
1236     !!!cp (118);
1237     $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1238     !!!next-input-character;
1239     redo A;
1240     } elsif ($self->{nc} == 0x003E) { # >
1241     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1242     !!!cp (119);
1243     $self->{last_stag_name} = $self->{ct}->{tag_name};
1244     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1245     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1246     if ($self->{ct}->{attributes}) {
1247     !!!cp (120);
1248     !!!parse-error (type => 'end tag attribute');
1249     } else {
1250     ## NOTE: This state should never be reached.
1251     !!!cp (121);
1252     }
1253     } else {
1254     die "$0: $self->{ct}->{type}: Unknown token type";
1255     }
1256     $self->{state} = DATA_STATE;
1257     !!!next-input-character;
1258    
1259     !!!emit ($self->{ct}); # start tag or end tag
1260    
1261     redo A;
1262     } elsif ($self->{nc} == 0x002F) { # /
1263     !!!cp (122);
1264     $self->{state} = SELF_CLOSING_START_TAG_STATE;
1265     !!!next-input-character;
1266     redo A;
1267     } elsif ($self->{nc} == -1) {
1268     !!!parse-error (type => 'unclosed tag');
1269     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1270     !!!cp (122.3);
1271     $self->{last_stag_name} = $self->{ct}->{tag_name};
1272     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1273     if ($self->{ct}->{attributes}) {
1274     !!!cp (122.1);
1275     !!!parse-error (type => 'end tag attribute');
1276     } else {
1277     ## NOTE: This state should never be reached.
1278     !!!cp (122.2);
1279     }
1280     } else {
1281     die "$0: $self->{ct}->{type}: Unknown token type";
1282     }
1283     $self->{state} = DATA_STATE;
1284     ## Reconsume.
1285     !!!emit ($self->{ct}); # start tag or end tag
1286     redo A;
1287     } else {
1288     !!!cp ('124.1');
1289     !!!parse-error (type => 'no space between attributes');
1290     $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1291     ## reconsume
1292     redo A;
1293     }
1294     } elsif ($self->{state} == SELF_CLOSING_START_TAG_STATE) {
1295     if ($self->{nc} == 0x003E) { # >
1296     if ($self->{ct}->{type} == END_TAG_TOKEN) {
1297     !!!cp ('124.2');
1298     !!!parse-error (type => 'nestc', token => $self->{ct});
1299     ## TODO: Different type than slash in start tag
1300     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1301     if ($self->{ct}->{attributes}) {
1302     !!!cp ('124.4');
1303     !!!parse-error (type => 'end tag attribute');
1304     } else {
1305     !!!cp ('124.5');
1306     }
1307     ## TODO: Test |<title></title/>|
1308     } else {
1309     !!!cp ('124.3');
1310     $self->{self_closing} = 1;
1311     }
1312    
1313     $self->{state} = DATA_STATE;
1314     !!!next-input-character;
1315    
1316     !!!emit ($self->{ct}); # start tag or end tag
1317    
1318     redo A;
1319     } elsif ($self->{nc} == -1) {
1320     !!!parse-error (type => 'unclosed tag');
1321     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1322     !!!cp (124.7);
1323     $self->{last_stag_name} = $self->{ct}->{tag_name};
1324     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1325     if ($self->{ct}->{attributes}) {
1326     !!!cp (124.5);
1327     !!!parse-error (type => 'end tag attribute');
1328     } else {
1329     ## NOTE: This state should never be reached.
1330     !!!cp (124.6);
1331     }
1332     } else {
1333     die "$0: $self->{ct}->{type}: Unknown token type";
1334     }
1335     $self->{state} = DATA_STATE;
1336     ## Reconsume.
1337     !!!emit ($self->{ct}); # start tag or end tag
1338     redo A;
1339     } else {
1340     !!!cp ('124.4');
1341     !!!parse-error (type => 'nestc');
1342     ## TODO: This error type is wrong.
1343     $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1344     ## Reconsume.
1345     redo A;
1346     }
1347     } elsif ($self->{state} == BOGUS_COMMENT_STATE) {
1348     ## (only happens in the PCDATA state)
1349    
1350     ## NOTE: Unlike the spec's "bogus comment state", this implementation
1351     ## consumes characters on a one-by-one basis.
1352    
1353     if ($self->{nc} == 0x003E) { # >
1354     !!!cp (124);
1355     $self->{state} = DATA_STATE;
1356     !!!next-input-character;
1357    
1358     !!!emit ($self->{ct}); # comment
1359     redo A;
1360     } elsif ($self->{nc} == -1) {
1361     !!!cp (125);
1362     $self->{state} = DATA_STATE;
1363     ## reconsume
1364    
1365     !!!emit ($self->{ct}); # comment
1366     redo A;
1367     } else {
1368     !!!cp (126);
1369     $self->{ct}->{data} .= chr ($self->{nc}); # comment
1370     $self->{read_until}->($self->{ct}->{data},
1371     q[>],
1372     length $self->{ct}->{data});
1373    
1374     ## Stay in the state.
1375     !!!next-input-character;
1376     redo A;
1377     }
1378     } elsif ($self->{state} == MARKUP_DECLARATION_OPEN_STATE) {
1379     ## (only happens in the PCDATA state)
1380    
1381     if ($self->{nc} == 0x002D) { # -
1382     !!!cp (133);
1383     $self->{state} = MD_HYPHEN_STATE;
1384     !!!next-input-character;
1385     redo A;
1386     } elsif ($self->{nc} == 0x0044 or # D
1387     $self->{nc} == 0x0064) { # d
1388     ## ASCII case-insensitive.
1389     !!!cp (130);
1390     $self->{state} = MD_DOCTYPE_STATE;
1391     $self->{s_kwd} = chr $self->{nc};
1392     !!!next-input-character;
1393     redo A;
1394 wakaba 1.3 } elsif ((($self->{insertion_mode} & IN_FOREIGN_CONTENT_IM and
1395     $self->{open_elements}->[-1]->[1] & FOREIGN_EL) or
1396     $self->{is_xml}) and
1397 wakaba 1.1 $self->{nc} == 0x005B) { # [
1398     !!!cp (135.4);
1399     $self->{state} = MD_CDATA_STATE;
1400     $self->{s_kwd} = '[';
1401     !!!next-input-character;
1402     redo A;
1403     } else {
1404     !!!cp (136);
1405     }
1406    
1407     !!!parse-error (type => 'bogus comment',
1408     line => $self->{line_prev},
1409     column => $self->{column_prev} - 1);
1410     ## Reconsume.
1411     $self->{state} = BOGUS_COMMENT_STATE;
1412     $self->{ct} = {type => COMMENT_TOKEN, data => '',
1413     line => $self->{line_prev},
1414     column => $self->{column_prev} - 1,
1415     };
1416     redo A;
1417     } elsif ($self->{state} == MD_HYPHEN_STATE) {
1418     if ($self->{nc} == 0x002D) { # -
1419     !!!cp (127);
1420     $self->{ct} = {type => COMMENT_TOKEN, data => '',
1421     line => $self->{line_prev},
1422     column => $self->{column_prev} - 2,
1423     };
1424     $self->{state} = COMMENT_START_STATE;
1425     !!!next-input-character;
1426     redo A;
1427     } else {
1428     !!!cp (128);
1429     !!!parse-error (type => 'bogus comment',
1430     line => $self->{line_prev},
1431     column => $self->{column_prev} - 2);
1432     $self->{state} = BOGUS_COMMENT_STATE;
1433     ## Reconsume.
1434     $self->{ct} = {type => COMMENT_TOKEN,
1435     data => '-',
1436     line => $self->{line_prev},
1437     column => $self->{column_prev} - 2,
1438     };
1439     redo A;
1440     }
1441     } elsif ($self->{state} == MD_DOCTYPE_STATE) {
1442     ## ASCII case-insensitive.
1443     if ($self->{nc} == [
1444     undef,
1445     0x004F, # O
1446     0x0043, # C
1447     0x0054, # T
1448     0x0059, # Y
1449     0x0050, # P
1450     ]->[length $self->{s_kwd}] or
1451     $self->{nc} == [
1452     undef,
1453     0x006F, # o
1454     0x0063, # c
1455     0x0074, # t
1456     0x0079, # y
1457     0x0070, # p
1458     ]->[length $self->{s_kwd}]) {
1459     !!!cp (131);
1460     ## Stay in the state.
1461     $self->{s_kwd} .= chr $self->{nc};
1462     !!!next-input-character;
1463     redo A;
1464     } elsif ((length $self->{s_kwd}) == 6 and
1465     ($self->{nc} == 0x0045 or # E
1466     $self->{nc} == 0x0065)) { # e
1467     !!!cp (129);
1468     $self->{state} = DOCTYPE_STATE;
1469     $self->{ct} = {type => DOCTYPE_TOKEN,
1470     quirks => 1,
1471     line => $self->{line_prev},
1472     column => $self->{column_prev} - 7,
1473     };
1474     !!!next-input-character;
1475     redo A;
1476     } else {
1477     !!!cp (132);
1478     !!!parse-error (type => 'bogus comment',
1479     line => $self->{line_prev},
1480     column => $self->{column_prev} - 1 - length $self->{s_kwd});
1481     $self->{state} = BOGUS_COMMENT_STATE;
1482     ## Reconsume.
1483     $self->{ct} = {type => COMMENT_TOKEN,
1484     data => $self->{s_kwd},
1485     line => $self->{line_prev},
1486     column => $self->{column_prev} - 1 - length $self->{s_kwd},
1487     };
1488     redo A;
1489     }
1490     } elsif ($self->{state} == MD_CDATA_STATE) {
1491     if ($self->{nc} == {
1492     '[' => 0x0043, # C
1493     '[C' => 0x0044, # D
1494     '[CD' => 0x0041, # A
1495     '[CDA' => 0x0054, # T
1496     '[CDAT' => 0x0041, # A
1497     }->{$self->{s_kwd}}) {
1498     !!!cp (135.1);
1499     ## Stay in the state.
1500     $self->{s_kwd} .= chr $self->{nc};
1501     !!!next-input-character;
1502     redo A;
1503     } elsif ($self->{s_kwd} eq '[CDATA' and
1504     $self->{nc} == 0x005B) { # [
1505     !!!cp (135.2);
1506     $self->{ct} = {type => CHARACTER_TOKEN,
1507     data => '',
1508     line => $self->{line_prev},
1509     column => $self->{column_prev} - 7};
1510     $self->{state} = CDATA_SECTION_STATE;
1511     !!!next-input-character;
1512     redo A;
1513     } else {
1514     !!!cp (135.3);
1515     !!!parse-error (type => 'bogus comment',
1516     line => $self->{line_prev},
1517     column => $self->{column_prev} - 1 - length $self->{s_kwd});
1518     $self->{state} = BOGUS_COMMENT_STATE;
1519     ## Reconsume.
1520     $self->{ct} = {type => COMMENT_TOKEN,
1521     data => $self->{s_kwd},
1522     line => $self->{line_prev},
1523     column => $self->{column_prev} - 1 - length $self->{s_kwd},
1524     };
1525     redo A;
1526     }
1527     } elsif ($self->{state} == COMMENT_START_STATE) {
1528     if ($self->{nc} == 0x002D) { # -
1529     !!!cp (137);
1530     $self->{state} = COMMENT_START_DASH_STATE;
1531     !!!next-input-character;
1532     redo A;
1533     } elsif ($self->{nc} == 0x003E) { # >
1534     !!!cp (138);
1535     !!!parse-error (type => 'bogus comment');
1536     $self->{state} = DATA_STATE;
1537     !!!next-input-character;
1538    
1539     !!!emit ($self->{ct}); # comment
1540    
1541     redo A;
1542     } elsif ($self->{nc} == -1) {
1543     !!!cp (139);
1544     !!!parse-error (type => 'unclosed comment');
1545     $self->{state} = DATA_STATE;
1546     ## reconsume
1547    
1548     !!!emit ($self->{ct}); # comment
1549    
1550     redo A;
1551     } else {
1552     !!!cp (140);
1553     $self->{ct}->{data} # comment
1554     .= chr ($self->{nc});
1555     $self->{state} = COMMENT_STATE;
1556     !!!next-input-character;
1557     redo A;
1558     }
1559     } elsif ($self->{state} == COMMENT_START_DASH_STATE) {
1560     if ($self->{nc} == 0x002D) { # -
1561     !!!cp (141);
1562     $self->{state} = COMMENT_END_STATE;
1563     !!!next-input-character;
1564     redo A;
1565     } elsif ($self->{nc} == 0x003E) { # >
1566     !!!cp (142);
1567     !!!parse-error (type => 'bogus comment');
1568     $self->{state} = DATA_STATE;
1569     !!!next-input-character;
1570    
1571     !!!emit ($self->{ct}); # comment
1572    
1573     redo A;
1574     } elsif ($self->{nc} == -1) {
1575     !!!cp (143);
1576     !!!parse-error (type => 'unclosed comment');
1577     $self->{state} = DATA_STATE;
1578     ## reconsume
1579    
1580     !!!emit ($self->{ct}); # comment
1581    
1582     redo A;
1583     } else {
1584     !!!cp (144);
1585     $self->{ct}->{data} # comment
1586     .= '-' . chr ($self->{nc});
1587     $self->{state} = COMMENT_STATE;
1588     !!!next-input-character;
1589     redo A;
1590     }
1591     } elsif ($self->{state} == COMMENT_STATE) {
1592     if ($self->{nc} == 0x002D) { # -
1593     !!!cp (145);
1594     $self->{state} = COMMENT_END_DASH_STATE;
1595     !!!next-input-character;
1596     redo A;
1597     } elsif ($self->{nc} == -1) {
1598     !!!cp (146);
1599     !!!parse-error (type => 'unclosed comment');
1600     $self->{state} = DATA_STATE;
1601     ## reconsume
1602    
1603     !!!emit ($self->{ct}); # comment
1604    
1605     redo A;
1606     } else {
1607     !!!cp (147);
1608     $self->{ct}->{data} .= chr ($self->{nc}); # comment
1609     $self->{read_until}->($self->{ct}->{data},
1610     q[-],
1611     length $self->{ct}->{data});
1612    
1613     ## Stay in the state
1614     !!!next-input-character;
1615     redo A;
1616     }
1617     } elsif ($self->{state} == COMMENT_END_DASH_STATE) {
1618     if ($self->{nc} == 0x002D) { # -
1619     !!!cp (148);
1620     $self->{state} = COMMENT_END_STATE;
1621     !!!next-input-character;
1622     redo A;
1623     } elsif ($self->{nc} == -1) {
1624     !!!cp (149);
1625     !!!parse-error (type => 'unclosed comment');
1626     $self->{state} = DATA_STATE;
1627     ## reconsume
1628    
1629     !!!emit ($self->{ct}); # comment
1630    
1631     redo A;
1632     } else {
1633     !!!cp (150);
1634     $self->{ct}->{data} .= '-' . chr ($self->{nc}); # comment
1635     $self->{state} = COMMENT_STATE;
1636     !!!next-input-character;
1637     redo A;
1638     }
1639     } elsif ($self->{state} == COMMENT_END_STATE) {
1640     if ($self->{nc} == 0x003E) { # >
1641     !!!cp (151);
1642     $self->{state} = DATA_STATE;
1643     !!!next-input-character;
1644    
1645     !!!emit ($self->{ct}); # comment
1646    
1647     redo A;
1648     } elsif ($self->{nc} == 0x002D) { # -
1649     !!!cp (152);
1650     !!!parse-error (type => 'dash in comment',
1651     line => $self->{line_prev},
1652     column => $self->{column_prev});
1653     $self->{ct}->{data} .= '-'; # comment
1654     ## Stay in the state
1655     !!!next-input-character;
1656     redo A;
1657     } elsif ($self->{nc} == -1) {
1658     !!!cp (153);
1659     !!!parse-error (type => 'unclosed comment');
1660     $self->{state} = DATA_STATE;
1661     ## reconsume
1662    
1663     !!!emit ($self->{ct}); # comment
1664    
1665     redo A;
1666     } else {
1667     !!!cp (154);
1668     !!!parse-error (type => 'dash in comment',
1669     line => $self->{line_prev},
1670     column => $self->{column_prev});
1671     $self->{ct}->{data} .= '--' . chr ($self->{nc}); # comment
1672     $self->{state} = COMMENT_STATE;
1673     !!!next-input-character;
1674     redo A;
1675     }
1676     } elsif ($self->{state} == DOCTYPE_STATE) {
1677     if ($is_space->{$self->{nc}}) {
1678     !!!cp (155);
1679     $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
1680     !!!next-input-character;
1681     redo A;
1682     } else {
1683     !!!cp (156);
1684     !!!parse-error (type => 'no space before DOCTYPE name');
1685     $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
1686     ## reconsume
1687     redo A;
1688     }
1689     } elsif ($self->{state} == BEFORE_DOCTYPE_NAME_STATE) {
1690     if ($is_space->{$self->{nc}}) {
1691     !!!cp (157);
1692     ## Stay in the state
1693     !!!next-input-character;
1694     redo A;
1695     } elsif ($self->{nc} == 0x003E) { # >
1696     !!!cp (158);
1697     !!!parse-error (type => 'no DOCTYPE name');
1698     $self->{state} = DATA_STATE;
1699     !!!next-input-character;
1700    
1701     !!!emit ($self->{ct}); # DOCTYPE (quirks)
1702    
1703     redo A;
1704     } elsif ($self->{nc} == -1) {
1705     !!!cp (159);
1706     !!!parse-error (type => 'no DOCTYPE name');
1707     $self->{state} = DATA_STATE;
1708     ## reconsume
1709    
1710     !!!emit ($self->{ct}); # DOCTYPE (quirks)
1711    
1712     redo A;
1713     } else {
1714     !!!cp (160);
1715     $self->{ct}->{name} = chr $self->{nc};
1716     delete $self->{ct}->{quirks};
1717     $self->{state} = DOCTYPE_NAME_STATE;
1718     !!!next-input-character;
1719     redo A;
1720     }
1721     } elsif ($self->{state} == DOCTYPE_NAME_STATE) {
1722     ## ISSUE: Redundant "First," in the spec.
1723     if ($is_space->{$self->{nc}}) {
1724     !!!cp (161);
1725     $self->{state} = AFTER_DOCTYPE_NAME_STATE;
1726     !!!next-input-character;
1727     redo A;
1728     } elsif ($self->{nc} == 0x003E) { # >
1729     !!!cp (162);
1730     $self->{state} = DATA_STATE;
1731     !!!next-input-character;
1732    
1733     !!!emit ($self->{ct}); # DOCTYPE
1734    
1735     redo A;
1736     } elsif ($self->{nc} == -1) {
1737     !!!cp (163);
1738     !!!parse-error (type => 'unclosed DOCTYPE');
1739     $self->{state} = DATA_STATE;
1740     ## reconsume
1741    
1742     $self->{ct}->{quirks} = 1;
1743     !!!emit ($self->{ct}); # DOCTYPE
1744    
1745     redo A;
1746     } else {
1747     !!!cp (164);
1748     $self->{ct}->{name}
1749     .= chr ($self->{nc}); # DOCTYPE
1750     ## Stay in the state
1751     !!!next-input-character;
1752     redo A;
1753     }
1754     } elsif ($self->{state} == AFTER_DOCTYPE_NAME_STATE) {
1755     if ($is_space->{$self->{nc}}) {
1756     !!!cp (165);
1757     ## Stay in the state
1758     !!!next-input-character;
1759     redo A;
1760     } elsif ($self->{nc} == 0x003E) { # >
1761     !!!cp (166);
1762     $self->{state} = DATA_STATE;
1763     !!!next-input-character;
1764    
1765     !!!emit ($self->{ct}); # DOCTYPE
1766    
1767     redo A;
1768     } elsif ($self->{nc} == -1) {
1769     !!!cp (167);
1770     !!!parse-error (type => 'unclosed DOCTYPE');
1771     $self->{state} = DATA_STATE;
1772     ## reconsume
1773    
1774     $self->{ct}->{quirks} = 1;
1775     !!!emit ($self->{ct}); # DOCTYPE
1776    
1777     redo A;
1778     } elsif ($self->{nc} == 0x0050 or # P
1779     $self->{nc} == 0x0070) { # p
1780     $self->{state} = PUBLIC_STATE;
1781     $self->{s_kwd} = chr $self->{nc};
1782     !!!next-input-character;
1783     redo A;
1784     } elsif ($self->{nc} == 0x0053 or # S
1785     $self->{nc} == 0x0073) { # s
1786     $self->{state} = SYSTEM_STATE;
1787     $self->{s_kwd} = chr $self->{nc};
1788     !!!next-input-character;
1789     redo A;
1790     } else {
1791     !!!cp (180);
1792     !!!parse-error (type => 'string after DOCTYPE name');
1793     $self->{ct}->{quirks} = 1;
1794    
1795     $self->{state} = BOGUS_DOCTYPE_STATE;
1796     !!!next-input-character;
1797     redo A;
1798     }
1799     } elsif ($self->{state} == PUBLIC_STATE) {
1800     ## ASCII case-insensitive
1801     if ($self->{nc} == [
1802     undef,
1803     0x0055, # U
1804     0x0042, # B
1805     0x004C, # L
1806     0x0049, # I
1807     ]->[length $self->{s_kwd}] or
1808     $self->{nc} == [
1809     undef,
1810     0x0075, # u
1811     0x0062, # b
1812     0x006C, # l
1813     0x0069, # i
1814     ]->[length $self->{s_kwd}]) {
1815     !!!cp (175);
1816     ## Stay in the state.
1817     $self->{s_kwd} .= chr $self->{nc};
1818     !!!next-input-character;
1819     redo A;
1820     } elsif ((length $self->{s_kwd}) == 5 and
1821     ($self->{nc} == 0x0043 or # C
1822     $self->{nc} == 0x0063)) { # c
1823     !!!cp (168);
1824     $self->{state} = BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
1825     !!!next-input-character;
1826     redo A;
1827     } else {
1828     !!!cp (169);
1829     !!!parse-error (type => 'string after DOCTYPE name',
1830     line => $self->{line_prev},
1831     column => $self->{column_prev} + 1 - length $self->{s_kwd});
1832     $self->{ct}->{quirks} = 1;
1833    
1834     $self->{state} = BOGUS_DOCTYPE_STATE;
1835     ## Reconsume.
1836     redo A;
1837     }
1838     } elsif ($self->{state} == SYSTEM_STATE) {
1839     ## ASCII case-insensitive
1840     if ($self->{nc} == [
1841     undef,
1842     0x0059, # Y
1843     0x0053, # S
1844     0x0054, # T
1845     0x0045, # E
1846     ]->[length $self->{s_kwd}] or
1847     $self->{nc} == [
1848     undef,
1849     0x0079, # y
1850     0x0073, # s
1851     0x0074, # t
1852     0x0065, # e
1853     ]->[length $self->{s_kwd}]) {
1854     !!!cp (170);
1855     ## Stay in the state.
1856     $self->{s_kwd} .= chr $self->{nc};
1857     !!!next-input-character;
1858     redo A;
1859     } elsif ((length $self->{s_kwd}) == 5 and
1860     ($self->{nc} == 0x004D or # M
1861     $self->{nc} == 0x006D)) { # m
1862     !!!cp (171);
1863     $self->{state} = BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
1864     !!!next-input-character;
1865     redo A;
1866     } else {
1867     !!!cp (172);
1868     !!!parse-error (type => 'string after DOCTYPE name',
1869     line => $self->{line_prev},
1870     column => $self->{column_prev} + 1 - length $self->{s_kwd});
1871     $self->{ct}->{quirks} = 1;
1872    
1873     $self->{state} = BOGUS_DOCTYPE_STATE;
1874     ## Reconsume.
1875     redo A;
1876     }
1877     } elsif ($self->{state} == BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE) {
1878     if ($is_space->{$self->{nc}}) {
1879     !!!cp (181);
1880     ## Stay in the state
1881     !!!next-input-character;
1882     redo A;
1883     } elsif ($self->{nc} eq 0x0022) { # "
1884     !!!cp (182);
1885     $self->{ct}->{pubid} = ''; # DOCTYPE
1886     $self->{state} = DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE;
1887     !!!next-input-character;
1888     redo A;
1889     } elsif ($self->{nc} eq 0x0027) { # '
1890     !!!cp (183);
1891     $self->{ct}->{pubid} = ''; # DOCTYPE
1892     $self->{state} = DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE;
1893     !!!next-input-character;
1894     redo A;
1895     } elsif ($self->{nc} eq 0x003E) { # >
1896     !!!cp (184);
1897     !!!parse-error (type => 'no PUBLIC literal');
1898    
1899     $self->{state} = DATA_STATE;
1900     !!!next-input-character;
1901    
1902     $self->{ct}->{quirks} = 1;
1903     !!!emit ($self->{ct}); # DOCTYPE
1904    
1905     redo A;
1906     } elsif ($self->{nc} == -1) {
1907     !!!cp (185);
1908     !!!parse-error (type => 'unclosed DOCTYPE');
1909    
1910     $self->{state} = DATA_STATE;
1911     ## reconsume
1912    
1913     $self->{ct}->{quirks} = 1;
1914     !!!emit ($self->{ct}); # DOCTYPE
1915    
1916     redo A;
1917     } else {
1918     !!!cp (186);
1919     !!!parse-error (type => 'string after PUBLIC');
1920     $self->{ct}->{quirks} = 1;
1921    
1922     $self->{state} = BOGUS_DOCTYPE_STATE;
1923     !!!next-input-character;
1924     redo A;
1925     }
1926     } elsif ($self->{state} == DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE) {
1927     if ($self->{nc} == 0x0022) { # "
1928     !!!cp (187);
1929     $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
1930     !!!next-input-character;
1931     redo A;
1932     } elsif ($self->{nc} == 0x003E) { # >
1933     !!!cp (188);
1934     !!!parse-error (type => 'unclosed PUBLIC literal');
1935    
1936     $self->{state} = DATA_STATE;
1937     !!!next-input-character;
1938    
1939     $self->{ct}->{quirks} = 1;
1940     !!!emit ($self->{ct}); # DOCTYPE
1941    
1942     redo A;
1943     } elsif ($self->{nc} == -1) {
1944     !!!cp (189);
1945     !!!parse-error (type => 'unclosed PUBLIC literal');
1946    
1947     $self->{state} = DATA_STATE;
1948     ## reconsume
1949    
1950     $self->{ct}->{quirks} = 1;
1951     !!!emit ($self->{ct}); # DOCTYPE
1952    
1953     redo A;
1954     } else {
1955     !!!cp (190);
1956     $self->{ct}->{pubid} # DOCTYPE
1957     .= chr $self->{nc};
1958     $self->{read_until}->($self->{ct}->{pubid}, q[">],
1959     length $self->{ct}->{pubid});
1960    
1961     ## Stay in the state
1962     !!!next-input-character;
1963     redo A;
1964     }
1965     } elsif ($self->{state} == DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE) {
1966     if ($self->{nc} == 0x0027) { # '
1967     !!!cp (191);
1968     $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
1969     !!!next-input-character;
1970     redo A;
1971     } elsif ($self->{nc} == 0x003E) { # >
1972     !!!cp (192);
1973     !!!parse-error (type => 'unclosed PUBLIC literal');
1974    
1975     $self->{state} = DATA_STATE;
1976     !!!next-input-character;
1977    
1978     $self->{ct}->{quirks} = 1;
1979     !!!emit ($self->{ct}); # DOCTYPE
1980    
1981     redo A;
1982     } elsif ($self->{nc} == -1) {
1983     !!!cp (193);
1984     !!!parse-error (type => 'unclosed PUBLIC literal');
1985    
1986     $self->{state} = DATA_STATE;
1987     ## reconsume
1988    
1989     $self->{ct}->{quirks} = 1;
1990     !!!emit ($self->{ct}); # DOCTYPE
1991    
1992     redo A;
1993     } else {
1994     !!!cp (194);
1995     $self->{ct}->{pubid} # DOCTYPE
1996     .= chr $self->{nc};
1997     $self->{read_until}->($self->{ct}->{pubid}, q['>],
1998     length $self->{ct}->{pubid});
1999    
2000     ## Stay in the state
2001     !!!next-input-character;
2002     redo A;
2003     }
2004     } elsif ($self->{state} == AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE) {
2005     if ($is_space->{$self->{nc}}) {
2006     !!!cp (195);
2007     ## Stay in the state
2008     !!!next-input-character;
2009     redo A;
2010     } elsif ($self->{nc} == 0x0022) { # "
2011     !!!cp (196);
2012     $self->{ct}->{sysid} = ''; # DOCTYPE
2013     $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
2014     !!!next-input-character;
2015     redo A;
2016     } elsif ($self->{nc} == 0x0027) { # '
2017     !!!cp (197);
2018     $self->{ct}->{sysid} = ''; # DOCTYPE
2019     $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
2020     !!!next-input-character;
2021     redo A;
2022     } elsif ($self->{nc} == 0x003E) { # >
2023     !!!cp (198);
2024     $self->{state} = DATA_STATE;
2025     !!!next-input-character;
2026    
2027     !!!emit ($self->{ct}); # DOCTYPE
2028    
2029     redo A;
2030     } elsif ($self->{nc} == -1) {
2031     !!!cp (199);
2032     !!!parse-error (type => 'unclosed DOCTYPE');
2033    
2034     $self->{state} = DATA_STATE;
2035     ## reconsume
2036    
2037     $self->{ct}->{quirks} = 1;
2038     !!!emit ($self->{ct}); # DOCTYPE
2039    
2040     redo A;
2041     } else {
2042     !!!cp (200);
2043     !!!parse-error (type => 'string after PUBLIC literal');
2044     $self->{ct}->{quirks} = 1;
2045    
2046     $self->{state} = BOGUS_DOCTYPE_STATE;
2047     !!!next-input-character;
2048     redo A;
2049     }
2050     } elsif ($self->{state} == BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
2051     if ($is_space->{$self->{nc}}) {
2052     !!!cp (201);
2053     ## Stay in the state
2054     !!!next-input-character;
2055     redo A;
2056     } elsif ($self->{nc} == 0x0022) { # "
2057     !!!cp (202);
2058     $self->{ct}->{sysid} = ''; # DOCTYPE
2059     $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
2060     !!!next-input-character;
2061     redo A;
2062     } elsif ($self->{nc} == 0x0027) { # '
2063     !!!cp (203);
2064     $self->{ct}->{sysid} = ''; # DOCTYPE
2065     $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
2066     !!!next-input-character;
2067     redo A;
2068     } elsif ($self->{nc} == 0x003E) { # >
2069     !!!cp (204);
2070     !!!parse-error (type => 'no SYSTEM literal');
2071     $self->{state} = DATA_STATE;
2072     !!!next-input-character;
2073    
2074     $self->{ct}->{quirks} = 1;
2075     !!!emit ($self->{ct}); # DOCTYPE
2076    
2077     redo A;
2078     } elsif ($self->{nc} == -1) {
2079     !!!cp (205);
2080     !!!parse-error (type => 'unclosed DOCTYPE');
2081    
2082     $self->{state} = DATA_STATE;
2083     ## reconsume
2084    
2085     $self->{ct}->{quirks} = 1;
2086     !!!emit ($self->{ct}); # DOCTYPE
2087    
2088     redo A;
2089     } else {
2090     !!!cp (206);
2091     !!!parse-error (type => 'string after SYSTEM');
2092     $self->{ct}->{quirks} = 1;
2093    
2094     $self->{state} = BOGUS_DOCTYPE_STATE;
2095     !!!next-input-character;
2096     redo A;
2097     }
2098     } elsif ($self->{state} == DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE) {
2099     if ($self->{nc} == 0x0022) { # "
2100     !!!cp (207);
2101     $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
2102     !!!next-input-character;
2103     redo A;
2104     } elsif ($self->{nc} == 0x003E) { # >
2105     !!!cp (208);
2106     !!!parse-error (type => 'unclosed SYSTEM literal');
2107    
2108     $self->{state} = DATA_STATE;
2109     !!!next-input-character;
2110    
2111     $self->{ct}->{quirks} = 1;
2112     !!!emit ($self->{ct}); # DOCTYPE
2113    
2114     redo A;
2115     } elsif ($self->{nc} == -1) {
2116     !!!cp (209);
2117     !!!parse-error (type => 'unclosed SYSTEM literal');
2118    
2119     $self->{state} = DATA_STATE;
2120     ## reconsume
2121    
2122     $self->{ct}->{quirks} = 1;
2123     !!!emit ($self->{ct}); # DOCTYPE
2124    
2125     redo A;
2126     } else {
2127     !!!cp (210);
2128     $self->{ct}->{sysid} # DOCTYPE
2129     .= chr $self->{nc};
2130     $self->{read_until}->($self->{ct}->{sysid}, q[">],
2131     length $self->{ct}->{sysid});
2132    
2133     ## Stay in the state
2134     !!!next-input-character;
2135     redo A;
2136     }
2137     } elsif ($self->{state} == DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE) {
2138     if ($self->{nc} == 0x0027) { # '
2139     !!!cp (211);
2140     $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
2141     !!!next-input-character;
2142     redo A;
2143     } elsif ($self->{nc} == 0x003E) { # >
2144     !!!cp (212);
2145     !!!parse-error (type => 'unclosed SYSTEM literal');
2146    
2147     $self->{state} = DATA_STATE;
2148     !!!next-input-character;
2149    
2150     $self->{ct}->{quirks} = 1;
2151     !!!emit ($self->{ct}); # DOCTYPE
2152    
2153     redo A;
2154     } elsif ($self->{nc} == -1) {
2155     !!!cp (213);
2156     !!!parse-error (type => 'unclosed SYSTEM literal');
2157    
2158     $self->{state} = DATA_STATE;
2159     ## reconsume
2160    
2161     $self->{ct}->{quirks} = 1;
2162     !!!emit ($self->{ct}); # DOCTYPE
2163    
2164     redo A;
2165     } else {
2166     !!!cp (214);
2167     $self->{ct}->{sysid} # DOCTYPE
2168     .= chr $self->{nc};
2169     $self->{read_until}->($self->{ct}->{sysid}, q['>],
2170     length $self->{ct}->{sysid});
2171    
2172     ## Stay in the state
2173     !!!next-input-character;
2174     redo A;
2175     }
2176     } elsif ($self->{state} == AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
2177     if ($is_space->{$self->{nc}}) {
2178     !!!cp (215);
2179     ## Stay in the state
2180     !!!next-input-character;
2181     redo A;
2182     } elsif ($self->{nc} == 0x003E) { # >
2183     !!!cp (216);
2184     $self->{state} = DATA_STATE;
2185     !!!next-input-character;
2186    
2187     !!!emit ($self->{ct}); # DOCTYPE
2188    
2189     redo A;
2190     } elsif ($self->{nc} == -1) {
2191     !!!cp (217);
2192     !!!parse-error (type => 'unclosed DOCTYPE');
2193     $self->{state} = DATA_STATE;
2194     ## reconsume
2195    
2196     $self->{ct}->{quirks} = 1;
2197     !!!emit ($self->{ct}); # DOCTYPE
2198    
2199     redo A;
2200     } else {
2201     !!!cp (218);
2202     !!!parse-error (type => 'string after SYSTEM literal');
2203     #$self->{ct}->{quirks} = 1;
2204    
2205     $self->{state} = BOGUS_DOCTYPE_STATE;
2206     !!!next-input-character;
2207     redo A;
2208     }
2209     } elsif ($self->{state} == BOGUS_DOCTYPE_STATE) {
2210     if ($self->{nc} == 0x003E) { # >
2211     !!!cp (219);
2212     $self->{state} = DATA_STATE;
2213     !!!next-input-character;
2214    
2215     !!!emit ($self->{ct}); # DOCTYPE
2216    
2217     redo A;
2218     } elsif ($self->{nc} == -1) {
2219     !!!cp (220);
2220     $self->{state} = DATA_STATE;
2221     ## reconsume
2222    
2223     !!!emit ($self->{ct}); # DOCTYPE
2224    
2225     redo A;
2226     } else {
2227     !!!cp (221);
2228     my $s = '';
2229     $self->{read_until}->($s, q[>], 0);
2230    
2231     ## Stay in the state
2232     !!!next-input-character;
2233     redo A;
2234     }
2235     } elsif ($self->{state} == CDATA_SECTION_STATE) {
2236     ## NOTE: "CDATA section state" in the spec is jointly implemented
2237     ## by three states, |CDATA_SECTION_STATE|, |CDATA_SECTION_MSE1_STATE|,
2238     ## and |CDATA_SECTION_MSE2_STATE|.
2239    
2240     if ($self->{nc} == 0x005D) { # ]
2241     !!!cp (221.1);
2242     $self->{state} = CDATA_SECTION_MSE1_STATE;
2243     !!!next-input-character;
2244     redo A;
2245     } elsif ($self->{nc} == -1) {
2246     $self->{state} = DATA_STATE;
2247     !!!next-input-character;
2248     if (length $self->{ct}->{data}) { # character
2249     !!!cp (221.2);
2250     !!!emit ($self->{ct}); # character
2251     } else {
2252     !!!cp (221.3);
2253     ## No token to emit. $self->{ct} is discarded.
2254     }
2255     redo A;
2256     } else {
2257     !!!cp (221.4);
2258     $self->{ct}->{data} .= chr $self->{nc};
2259     $self->{read_until}->($self->{ct}->{data},
2260     q<]>,
2261     length $self->{ct}->{data});
2262    
2263     ## Stay in the state.
2264     !!!next-input-character;
2265     redo A;
2266     }
2267    
2268     ## ISSUE: "text tokens" in spec.
2269     } elsif ($self->{state} == CDATA_SECTION_MSE1_STATE) {
2270     if ($self->{nc} == 0x005D) { # ]
2271     !!!cp (221.5);
2272     $self->{state} = CDATA_SECTION_MSE2_STATE;
2273     !!!next-input-character;
2274     redo A;
2275     } else {
2276     !!!cp (221.6);
2277     $self->{ct}->{data} .= ']';
2278     $self->{state} = CDATA_SECTION_STATE;
2279     ## Reconsume.
2280     redo A;
2281     }
2282     } elsif ($self->{state} == CDATA_SECTION_MSE2_STATE) {
2283     if ($self->{nc} == 0x003E) { # >
2284     $self->{state} = DATA_STATE;
2285     !!!next-input-character;
2286     if (length $self->{ct}->{data}) { # character
2287     !!!cp (221.7);
2288     !!!emit ($self->{ct}); # character
2289     } else {
2290     !!!cp (221.8);
2291     ## No token to emit. $self->{ct} is discarded.
2292     }
2293     redo A;
2294     } elsif ($self->{nc} == 0x005D) { # ]
2295     !!!cp (221.9); # character
2296     $self->{ct}->{data} .= ']'; ## Add first "]" of "]]]".
2297     ## Stay in the state.
2298     !!!next-input-character;
2299     redo A;
2300     } else {
2301     !!!cp (221.11);
2302     $self->{ct}->{data} .= ']]'; # character
2303     $self->{state} = CDATA_SECTION_STATE;
2304     ## Reconsume.
2305     redo A;
2306     }
2307     } elsif ($self->{state} == ENTITY_STATE) {
2308     if ($is_space->{$self->{nc}} or
2309     {
2310     0x003C => 1, 0x0026 => 1, -1 => 1, # <, &
2311     $self->{entity_add} => 1,
2312     }->{$self->{nc}}) {
2313     !!!cp (1001);
2314     ## Don't consume
2315     ## No error
2316     ## Return nothing.
2317     #
2318     } elsif ($self->{nc} == 0x0023) { # #
2319     !!!cp (999);
2320     $self->{state} = ENTITY_HASH_STATE;
2321     $self->{s_kwd} = '#';
2322     !!!next-input-character;
2323     redo A;
2324     } elsif ((0x0041 <= $self->{nc} and
2325     $self->{nc} <= 0x005A) or # A..Z
2326     (0x0061 <= $self->{nc} and
2327     $self->{nc} <= 0x007A)) { # a..z
2328     !!!cp (998);
2329     require Whatpm::_NamedEntityList;
2330     $self->{state} = ENTITY_NAME_STATE;
2331     $self->{s_kwd} = chr $self->{nc};
2332     $self->{entity__value} = $self->{s_kwd};
2333     $self->{entity__match} = 0;
2334     !!!next-input-character;
2335     redo A;
2336     } else {
2337     !!!cp (1027);
2338     !!!parse-error (type => 'bare ero');
2339     ## Return nothing.
2340     #
2341     }
2342    
2343     ## NOTE: No character is consumed by the "consume a character
2344     ## reference" algorithm. In other words, there is an "&" character
2345     ## that does not introduce a character reference, which would be
2346     ## appended to the parent element or the attribute value in later
2347     ## processing by the tokenizer.
2348    
2349     if ($self->{prev_state} == DATA_STATE) {
2350     !!!cp (997);
2351     $self->{state} = $self->{prev_state};
2352     ## Reconsume.
2353     !!!emit ({type => CHARACTER_TOKEN, data => '&',
2354     line => $self->{line_prev},
2355     column => $self->{column_prev},
2356     });
2357     redo A;
2358     } else {
2359     !!!cp (996);
2360     $self->{ca}->{value} .= '&';
2361     $self->{state} = $self->{prev_state};
2362     ## Reconsume.
2363     redo A;
2364     }
2365     } elsif ($self->{state} == ENTITY_HASH_STATE) {
2366     if ($self->{nc} == 0x0078 or # x
2367     $self->{nc} == 0x0058) { # X
2368     !!!cp (995);
2369     $self->{state} = HEXREF_X_STATE;
2370     $self->{s_kwd} .= chr $self->{nc};
2371     !!!next-input-character;
2372     redo A;
2373     } elsif (0x0030 <= $self->{nc} and
2374     $self->{nc} <= 0x0039) { # 0..9
2375     !!!cp (994);
2376     $self->{state} = NCR_NUM_STATE;
2377     $self->{s_kwd} = $self->{nc} - 0x0030;
2378     !!!next-input-character;
2379     redo A;
2380     } else {
2381     !!!parse-error (type => 'bare nero',
2382     line => $self->{line_prev},
2383     column => $self->{column_prev} - 1);
2384    
2385     ## NOTE: According to the spec algorithm, nothing is returned,
2386     ## and then "&#" is appended to the parent element or the attribute
2387     ## value in the later processing.
2388    
2389     if ($self->{prev_state} == DATA_STATE) {
2390     !!!cp (1019);
2391     $self->{state} = $self->{prev_state};
2392     ## Reconsume.
2393     !!!emit ({type => CHARACTER_TOKEN,
2394     data => '&#',
2395     line => $self->{line_prev},
2396     column => $self->{column_prev} - 1,
2397     });
2398     redo A;
2399     } else {
2400     !!!cp (993);
2401     $self->{ca}->{value} .= '&#';
2402     $self->{state} = $self->{prev_state};
2403     ## Reconsume.
2404     redo A;
2405     }
2406     }
2407     } elsif ($self->{state} == NCR_NUM_STATE) {
2408     if (0x0030 <= $self->{nc} and
2409     $self->{nc} <= 0x0039) { # 0..9
2410     !!!cp (1012);
2411     $self->{s_kwd} *= 10;
2412     $self->{s_kwd} += $self->{nc} - 0x0030;
2413    
2414     ## Stay in the state.
2415     !!!next-input-character;
2416     redo A;
2417     } elsif ($self->{nc} == 0x003B) { # ;
2418     !!!cp (1013);
2419     !!!next-input-character;
2420     #
2421     } else {
2422     !!!cp (1014);
2423     !!!parse-error (type => 'no refc');
2424     ## Reconsume.
2425     #
2426     }
2427    
2428     my $code = $self->{s_kwd};
2429     my $l = $self->{line_prev};
2430     my $c = $self->{column_prev};
2431     if ($charref_map->{$code}) {
2432     !!!cp (1015);
2433     !!!parse-error (type => 'invalid character reference',
2434     text => (sprintf 'U+%04X', $code),
2435     line => $l, column => $c);
2436     $code = $charref_map->{$code};
2437     } elsif ($code > 0x10FFFF) {
2438     !!!cp (1016);
2439     !!!parse-error (type => 'invalid character reference',
2440     text => (sprintf 'U-%08X', $code),
2441     line => $l, column => $c);
2442     $code = 0xFFFD;
2443     }
2444    
2445     if ($self->{prev_state} == DATA_STATE) {
2446     !!!cp (992);
2447     $self->{state} = $self->{prev_state};
2448     ## Reconsume.
2449     !!!emit ({type => CHARACTER_TOKEN, data => chr $code,
2450     line => $l, column => $c,
2451     });
2452     redo A;
2453     } else {
2454     !!!cp (991);
2455     $self->{ca}->{value} .= chr $code;
2456     $self->{ca}->{has_reference} = 1;
2457     $self->{state} = $self->{prev_state};
2458     ## Reconsume.
2459     redo A;
2460     }
2461     } elsif ($self->{state} == HEXREF_X_STATE) {
2462     if ((0x0030 <= $self->{nc} and $self->{nc} <= 0x0039) or
2463     (0x0041 <= $self->{nc} and $self->{nc} <= 0x0046) or
2464     (0x0061 <= $self->{nc} and $self->{nc} <= 0x0066)) {
2465     # 0..9, A..F, a..f
2466     !!!cp (990);
2467     $self->{state} = HEXREF_HEX_STATE;
2468     $self->{s_kwd} = 0;
2469     ## Reconsume.
2470     redo A;
2471     } else {
2472     !!!parse-error (type => 'bare hcro',
2473     line => $self->{line_prev},
2474     column => $self->{column_prev} - 2);
2475    
2476     ## NOTE: According to the spec algorithm, nothing is returned,
2477     ## and then "&#" followed by "X" or "x" is appended to the parent
2478     ## element or the attribute value in the later processing.
2479    
2480     if ($self->{prev_state} == DATA_STATE) {
2481     !!!cp (1005);
2482     $self->{state} = $self->{prev_state};
2483     ## Reconsume.
2484     !!!emit ({type => CHARACTER_TOKEN,
2485     data => '&' . $self->{s_kwd},
2486     line => $self->{line_prev},
2487     column => $self->{column_prev} - length $self->{s_kwd},
2488     });
2489     redo A;
2490     } else {
2491     !!!cp (989);
2492     $self->{ca}->{value} .= '&' . $self->{s_kwd};
2493     $self->{state} = $self->{prev_state};
2494     ## Reconsume.
2495     redo A;
2496     }
2497     }
2498     } elsif ($self->{state} == HEXREF_HEX_STATE) {
2499     if (0x0030 <= $self->{nc} and $self->{nc} <= 0x0039) {
2500     # 0..9
2501     !!!cp (1002);
2502     $self->{s_kwd} *= 0x10;
2503     $self->{s_kwd} += $self->{nc} - 0x0030;
2504     ## Stay in the state.
2505     !!!next-input-character;
2506     redo A;
2507     } elsif (0x0061 <= $self->{nc} and
2508     $self->{nc} <= 0x0066) { # a..f
2509     !!!cp (1003);
2510     $self->{s_kwd} *= 0x10;
2511     $self->{s_kwd} += $self->{nc} - 0x0060 + 9;
2512     ## Stay in the state.
2513     !!!next-input-character;
2514     redo A;
2515     } elsif (0x0041 <= $self->{nc} and
2516     $self->{nc} <= 0x0046) { # A..F
2517     !!!cp (1004);
2518     $self->{s_kwd} *= 0x10;
2519     $self->{s_kwd} += $self->{nc} - 0x0040 + 9;
2520     ## Stay in the state.
2521     !!!next-input-character;
2522     redo A;
2523     } elsif ($self->{nc} == 0x003B) { # ;
2524     !!!cp (1006);
2525     !!!next-input-character;
2526     #
2527     } else {
2528     !!!cp (1007);
2529     !!!parse-error (type => 'no refc',
2530     line => $self->{line},
2531     column => $self->{column});
2532     ## Reconsume.
2533     #
2534     }
2535    
2536     my $code = $self->{s_kwd};
2537     my $l = $self->{line_prev};
2538     my $c = $self->{column_prev};
2539     if ($charref_map->{$code}) {
2540     !!!cp (1008);
2541     !!!parse-error (type => 'invalid character reference',
2542     text => (sprintf 'U+%04X', $code),
2543     line => $l, column => $c);
2544     $code = $charref_map->{$code};
2545     } elsif ($code > 0x10FFFF) {
2546     !!!cp (1009);
2547     !!!parse-error (type => 'invalid character reference',
2548     text => (sprintf 'U-%08X', $code),
2549     line => $l, column => $c);
2550     $code = 0xFFFD;
2551     }
2552    
2553     if ($self->{prev_state} == DATA_STATE) {
2554     !!!cp (988);
2555     $self->{state} = $self->{prev_state};
2556     ## Reconsume.
2557     !!!emit ({type => CHARACTER_TOKEN, data => chr $code,
2558     line => $l, column => $c,
2559     });
2560     redo A;
2561     } else {
2562     !!!cp (987);
2563     $self->{ca}->{value} .= chr $code;
2564     $self->{ca}->{has_reference} = 1;
2565     $self->{state} = $self->{prev_state};
2566     ## Reconsume.
2567     redo A;
2568     }
2569     } elsif ($self->{state} == ENTITY_NAME_STATE) {
2570     if (length $self->{s_kwd} < 30 and
2571     ## NOTE: Some number greater than the maximum length of entity name
2572     ((0x0041 <= $self->{nc} and # a
2573     $self->{nc} <= 0x005A) or # x
2574     (0x0061 <= $self->{nc} and # a
2575     $self->{nc} <= 0x007A) or # z
2576     (0x0030 <= $self->{nc} and # 0
2577     $self->{nc} <= 0x0039) or # 9
2578     $self->{nc} == 0x003B)) { # ;
2579     our $EntityChar;
2580     $self->{s_kwd} .= chr $self->{nc};
2581     if (defined $EntityChar->{$self->{s_kwd}}) {
2582     if ($self->{nc} == 0x003B) { # ;
2583     !!!cp (1020);
2584     $self->{entity__value} = $EntityChar->{$self->{s_kwd}};
2585     $self->{entity__match} = 1;
2586     !!!next-input-character;
2587     #
2588     } else {
2589     !!!cp (1021);
2590     $self->{entity__value} = $EntityChar->{$self->{s_kwd}};
2591     $self->{entity__match} = -1;
2592     ## Stay in the state.
2593     !!!next-input-character;
2594     redo A;
2595     }
2596     } else {
2597     !!!cp (1022);
2598     $self->{entity__value} .= chr $self->{nc};
2599     $self->{entity__match} *= 2;
2600     ## Stay in the state.
2601     !!!next-input-character;
2602     redo A;
2603     }
2604     }
2605    
2606     my $data;
2607     my $has_ref;
2608     if ($self->{entity__match} > 0) {
2609     !!!cp (1023);
2610     $data = $self->{entity__value};
2611     $has_ref = 1;
2612     #
2613     } elsif ($self->{entity__match} < 0) {
2614     !!!parse-error (type => 'no refc');
2615     if ($self->{prev_state} != DATA_STATE and # in attribute
2616     $self->{entity__match} < -1) {
2617     !!!cp (1024);
2618     $data = '&' . $self->{s_kwd};
2619     #
2620     } else {
2621     !!!cp (1025);
2622     $data = $self->{entity__value};
2623     $has_ref = 1;
2624     #
2625     }
2626     } else {
2627     !!!cp (1026);
2628     !!!parse-error (type => 'bare ero',
2629     line => $self->{line_prev},
2630     column => $self->{column_prev} - length $self->{s_kwd});
2631     $data = '&' . $self->{s_kwd};
2632     #
2633     }
2634    
2635     ## NOTE: In these cases, when a character reference is found,
2636     ## it is consumed and a character token is returned, or, otherwise,
2637     ## nothing is consumed and returned, according to the spec algorithm.
2638     ## In this implementation, anything that has been examined by the
2639     ## tokenizer is appended to the parent element or the attribute value
2640     ## as string, either literal string when no character reference or
2641     ## entity-replaced string otherwise, in this stage, since any characters
2642     ## that would not be consumed are appended in the data state or in an
2643     ## appropriate attribute value state anyway.
2644    
2645     if ($self->{prev_state} == DATA_STATE) {
2646     !!!cp (986);
2647     $self->{state} = $self->{prev_state};
2648     ## Reconsume.
2649     !!!emit ({type => CHARACTER_TOKEN,
2650     data => $data,
2651     line => $self->{line_prev},
2652     column => $self->{column_prev} + 1 - length $self->{s_kwd},
2653     });
2654     redo A;
2655     } else {
2656     !!!cp (985);
2657     $self->{ca}->{value} .= $data;
2658     $self->{ca}->{has_reference} = 1 if $has_ref;
2659     $self->{state} = $self->{prev_state};
2660     ## Reconsume.
2661     redo A;
2662     }
2663     } else {
2664     die "$0: $self->{state}: Unknown state";
2665     }
2666     } # A
2667    
2668     die "$0: _get_next_token: unexpected case";
2669     } # _get_next_token
2670    
2671     1;
2672 wakaba 1.3 ## $Date: 2008/10/14 04:32:49 $

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24