/[suikacvs]/markup/html/whatpm/Whatpm/HTML/Tokenizer.pm.src
Suika

Contents of /markup/html/whatpm/Whatpm/HTML/Tokenizer.pm.src

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.4 - (hide annotations) (download) (as text)
Tue Oct 14 11:46:57 2008 UTC (16 years ago) by wakaba
Branch: MAIN
Changes since 1.3: +10 -8 lines
File MIME type: application/x-wais-source
++ whatpm/t/ChangeLog	14 Oct 2008 11:46:38 -0000
	* XML-Parser.t: "xml/elements-1.dat" and "xml/doctypes-1.dat"
	added.

2008-10-14  Wakaba  <wakaba@suika.fam.cx>

++ whatpm/t/xml/ChangeLog	14 Oct 2008 11:46:52 -0000
	* elements-1.dat: New test data file.

	* doctypes-1.dat: New test data file.

	* attrs-1.dat: New test data on attribute name cases are added.

2008-10-14  Wakaba  <wakaba@suika.fam.cx>

++ whatpm/Whatpm/HTML/ChangeLog	14 Oct 2008 11:40:58 -0000
	* Tokenizer.pm.src: Support for case-insensitive XML attribute
	names.

2008-10-14  Wakaba  <wakaba@suika.fam.cx>

1 wakaba 1.1 package Whatpm::HTML::Tokenizer;
2     use strict;
3 wakaba 1.4 our $VERSION=do{my @r=(q$Revision: 1.3 $=~/\d+/g);sprintf "%d."."%02d" x $#r,@r};
4 wakaba 1.2
5     BEGIN { # Exporter setup; done at compile time because Whatpm::HTML imports |:token| from its own BEGIN block
6     require Exporter;
7     push our @ISA, 'Exporter'; # inherit |import| from Exporter
8     ## Names that a caller may import individually on request.
9     our @EXPORT_OK = qw(
10     DOCTYPE_TOKEN
11     COMMENT_TOKEN
12     START_TAG_TOKEN
13     END_TAG_TOKEN
14     END_OF_FILE_TOKEN
15     CHARACTER_TOKEN
16     PI_TOKEN
17     ABORT_TOKEN
18     );
19     ## The |:token| tag imports every token type constant at once.
20     our %EXPORT_TAGS = (
21     token => [qw(
22     DOCTYPE_TOKEN
23     COMMENT_TOKEN
24     START_TAG_TOKEN
25     END_TAG_TOKEN
26     END_OF_FILE_TOKEN
27     CHARACTER_TOKEN
28     PI_TOKEN
29     ABORT_TOKEN
30     )],
31     );
32     }
33    
34     ## Token types
35     ## Values of the |type| field of a token hash; all are exportable via the |:token| tag.
36     sub DOCTYPE_TOKEN () { 1 }
37     sub COMMENT_TOKEN () { 2 }
38     sub START_TAG_TOKEN () { 3 }
39     sub END_TAG_TOKEN () { 4 }
40     sub END_OF_FILE_TOKEN () { 5 }
41     sub CHARACTER_TOKEN () { 6 }
42     sub PI_TOKEN () { 7 } # XML5
43     sub ABORT_TOKEN () { 8 } # Not a token actually
44 wakaba 1.1
45     package Whatpm::HTML;
46    
47 wakaba 1.2 BEGIN { Whatpm::HTML::Tokenizer->import (':token') } # bring the *_TOKEN constants into this package at compile time
48    
49 wakaba 1.1 ## Content model flags
50     ## The CM_* bits are OR-ed together to build the *_CONTENT_MODEL values below.
51     sub CM_ENTITY () { 0b001 } # & markup in data
52     sub CM_LIMITED_MARKUP () { 0b010 } # < markup in data (limited)
53     sub CM_FULL_MARKUP () { 0b100 } # < markup in data (any)
54     ## Values assigned to |$self->{content_model}|; the tokenizer tests them with |&| against the CM_* bits.
55     sub PLAINTEXT_CONTENT_MODEL () { 0 } # no CM_* bit set
56     sub CDATA_CONTENT_MODEL () { CM_LIMITED_MARKUP }
57     sub RCDATA_CONTENT_MODEL () { CM_ENTITY | CM_LIMITED_MARKUP }
58     sub PCDATA_CONTENT_MODEL () { CM_ENTITY | CM_FULL_MARKUP }
59    
60     ## Tokenizer states
61     ## Values of |$self->{state}|. Numbers 1 and 12 are unused: their constants are commented out below.
62     sub DATA_STATE () { 0 }
63     #sub ENTITY_DATA_STATE () { 1 }
64     sub TAG_OPEN_STATE () { 2 }
65     sub CLOSE_TAG_OPEN_STATE () { 3 }
66     sub TAG_NAME_STATE () { 4 }
67     sub BEFORE_ATTRIBUTE_NAME_STATE () { 5 }
68     sub ATTRIBUTE_NAME_STATE () { 6 }
69     sub AFTER_ATTRIBUTE_NAME_STATE () { 7 }
70     sub BEFORE_ATTRIBUTE_VALUE_STATE () { 8 }
71     sub ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE () { 9 }
72     sub ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE () { 10 }
73     sub ATTRIBUTE_VALUE_UNQUOTED_STATE () { 11 }
74     #sub ENTITY_IN_ATTRIBUTE_VALUE_STATE () { 12 }
75     sub MARKUP_DECLARATION_OPEN_STATE () { 13 }
76     sub COMMENT_START_STATE () { 14 }
77     sub COMMENT_START_DASH_STATE () { 15 }
78     sub COMMENT_STATE () { 16 }
79     sub COMMENT_END_STATE () { 17 }
80     sub COMMENT_END_DASH_STATE () { 18 }
81     sub BOGUS_COMMENT_STATE () { 19 }
82     sub DOCTYPE_STATE () { 20 }
83     sub BEFORE_DOCTYPE_NAME_STATE () { 21 }
84     sub DOCTYPE_NAME_STATE () { 22 }
85     sub AFTER_DOCTYPE_NAME_STATE () { 23 }
86     sub BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE () { 24 }
87     sub DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE () { 25 }
88     sub DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE () { 26 }
89     sub AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE () { 27 }
90     sub BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE () { 28 }
91     sub DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE () { 29 }
92     sub DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE () { 30 }
93     sub AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE () { 31 }
94     sub BOGUS_DOCTYPE_STATE () { 32 }
95     sub AFTER_ATTRIBUTE_VALUE_QUOTED_STATE () { 33 }
96     sub SELF_CLOSING_START_TAG_STATE () { 34 }
97     sub CDATA_SECTION_STATE () { 35 }
98     sub MD_HYPHEN_STATE () { 36 } # "markup declaration open state" in the spec
99     sub MD_DOCTYPE_STATE () { 37 } # "markup declaration open state" in the spec
100     sub MD_CDATA_STATE () { 38 } # "markup declaration open state" in the spec
101     sub CDATA_RCDATA_CLOSE_TAG_STATE () { 39 } # "close tag open state" in the spec
102     sub CDATA_SECTION_MSE1_STATE () { 40 } # "CDATA section state" in the spec
103     sub CDATA_SECTION_MSE2_STATE () { 41 } # "CDATA section state" in the spec
104     sub PUBLIC_STATE () { 42 } # "after DOCTYPE name state" in the spec
105     sub SYSTEM_STATE () { 43 } # "after DOCTYPE name state" in the spec
106     ## NOTE: "Entity data state", "entity in attribute value state", and
107     ## "consume a character reference" algorithm are jointly implemented
108     ## using the following six states:
109     sub ENTITY_STATE () { 44 }
110     sub ENTITY_HASH_STATE () { 45 }
111     sub NCR_NUM_STATE () { 46 }
112     sub HEXREF_X_STATE () { 47 }
113     sub HEXREF_HEX_STATE () { 48 }
114     sub ENTITY_NAME_STATE () { 49 }
115     sub PCDATA_STATE () { 50 } # "data state" in the spec
116    
117     ## Tree constructor state constants (see Whatpm::HTML for the full
118     ## list and descriptions)
119     ## NOTE(review): both literals below evaluate to 2048 (bit 11); presumably they belong to different flag sets - confirm against Whatpm::HTML.
120     sub IN_FOREIGN_CONTENT_IM () { 0b100000000000 }
121     sub FOREIGN_EL () { 0b1_00000000000 }
122    
123     ## Character reference mappings
124     ## Hash ref keyed by code point; the value is the replacement code point. Its use is outside this excerpt - presumably consulted when a numeric character reference is resolved (confirm).
125     my $charref_map = {
126     0x0D => 0x000A, # CR is replaced by LF
127     0x80 => 0x20AC, # 0x80..0x9F: NOTE(review): values look like the Windows-1252 assignments, 0xFFFD where none - confirm
128     0x81 => 0xFFFD,
129     0x82 => 0x201A,
130     0x83 => 0x0192,
131     0x84 => 0x201E,
132     0x85 => 0x2026,
133     0x86 => 0x2020,
134     0x87 => 0x2021,
135     0x88 => 0x02C6,
136     0x89 => 0x2030,
137     0x8A => 0x0160,
138     0x8B => 0x2039,
139     0x8C => 0x0152,
140     0x8D => 0xFFFD,
141     0x8E => 0x017D,
142     0x8F => 0xFFFD,
143     0x90 => 0xFFFD,
144     0x91 => 0x2018,
145     0x92 => 0x2019,
146     0x93 => 0x201C,
147     0x94 => 0x201D,
148     0x95 => 0x2022,
149     0x96 => 0x2013,
150     0x97 => 0x2014,
151     0x98 => 0x02DC,
152     0x99 => 0x2122,
153     0x9A => 0x0161,
154     0x9B => 0x203A,
155     0x9C => 0x0153,
156     0x9D => 0xFFFD,
157     0x9E => 0x017E,
158     0x9F => 0x0178,
159     }; # $charref_map
160     $charref_map->{$_} = 0xFFFD # every code point listed below is replaced by U+FFFD
161     for 0x0000..0x0008, 0x000B, 0x000E..0x001F, 0x007F,
162     0xD800..0xDFFF, 0xFDD0..0xFDDF, ## ISSUE: 0xFDEF
163     0xFFFE, 0xFFFF, 0x1FFFE, 0x1FFFF, 0x2FFFE, 0x2FFFF, 0x3FFFE, 0x3FFFF,
164     0x4FFFE, 0x4FFFF, 0x5FFFE, 0x5FFFF, 0x6FFFE, 0x6FFFF, 0x7FFFE,
165     0x7FFFF, 0x8FFFE, 0x8FFFF, 0x9FFFE, 0x9FFFF, 0xAFFFE, 0xAFFFF,
166     0xBFFFE, 0xBFFFF, 0xCFFFE, 0xCFFFF, 0xDFFFE, 0xDFFFF, 0xEFFFE,
167     0xEFFFF, 0xFFFFE, 0xFFFFF, 0x10FFFE, 0x10FFFF;
168    
169     ## Implementations MUST act as if they used the state machine in the spec.
170    
171     sub _initialize_tokenizer ($) {
172     my $self = shift;
173     ## Resets the tokenizer fields of |$self| to their start-of-input values. No explicit return statement.
174     ## NOTE: Fields set by |new| constructor:
175     #$self->{level}
176     #$self->{set_nc}
177     #$self->{parse_error}
178 wakaba 1.3 #$self->{is_xml} (if XML)
179 wakaba 1.1
180     $self->{state} = DATA_STATE; # MUST
181     #$self->{s_kwd}; # state keyword - initialized when used
182     #$self->{entity__value}; # initialized when used
183     #$self->{entity__match}; # initialized when used
184     $self->{content_model} = PCDATA_CONTENT_MODEL; # be
185     undef $self->{ct}; # current token
186     undef $self->{ca}; # current attribute
187     undef $self->{last_stag_name}; # last emitted start tag name
188     #$self->{prev_state}; # initialized when used
189     delete $self->{self_closing}; # clear any self-closing flag left from earlier use
190     $self->{char_buffer} = '';
191     $self->{char_buffer_pos} = 0;
192     $self->{nc} = -1; # next input character
193     #$self->{next_nc}
194     !!!next-input-character; # .src preprocessor macro; presumably loads the first input character into |$self->{nc}| - confirm against the macro definition
195     $self->{token} = []; # queue of pending tokens; |_get_next_token| shifts from it before tokenizing further
196     # $self->{escape}
197     } # _initialize_tokenizer
198    
199     ## A token has:
200     ## ->{type} == DOCTYPE_TOKEN, START_TAG_TOKEN, END_TAG_TOKEN, COMMENT_TOKEN,
201     ## CHARACTER_TOKEN, or END_OF_FILE_TOKEN
202     ## ->{name} (DOCTYPE_TOKEN)
203     ## ->{tag_name} (START_TAG_TOKEN, END_TAG_TOKEN)
204     ## ->{pubid} (DOCTYPE_TOKEN)
205     ## ->{sysid} (DOCTYPE_TOKEN)
206     ## ->{quirks} == 1 or 0 (DOCTYPE_TOKEN): "force-quirks" flag
207     ## ->{attributes} isa HASH (START_TAG_TOKEN, END_TAG_TOKEN)
208     ## ->{name}
209     ## ->{value}
210     ## ->{has_reference} == 1 or 0
211     ## ->{data} (COMMENT_TOKEN, CHARACTER_TOKEN)
212     ## NOTE: The "self-closing flag" is held as |$self->{self_closing}|.
213     ## |->{self_closing}| is used to save the value of |$self->{self_closing}|
214     ## while the token is pushed back to the stack.
215    
216     ## Emitted token MUST immediately be handled by the tree construction state.
217    
218     ## Before each step, UA MAY check to see if either one of the scripts in
219     ## "list of scripts that will execute as soon as possible" or the first
220     ## script in the "list of scripts that will execute asynchronously",
221     ## has completed loading. If one has, then it MUST be executed
222     ## and removed from the list.
223    
224     ## TODO: Polytheistic slash SHOULD NOT be used. (Applied only to atheists.)
225     ## (This requirement was dropped from HTML5 spec, unfortunately.)
226    
227     my $is_space = { # lookup keyed by code point: true for characters the tag states treat as white space
228     0x0009 => 1, # CHARACTER TABULATION (HT)
229     0x000A => 1, # LINE FEED (LF)
230     #0x000B => 0, # LINE TABULATION (VT)
231     0x000C => 1, # FORM FEED (FF)
232     #0x000D => 1, # CARRIAGE RETURN (CR)
233     0x0020 => 1, # SPACE (SP)
234     }; # VT and CR are deliberately absent (their entries are commented out above)
235    
236     sub _get_next_token ($) {
237     my $self = shift;
238    
239     if ($self->{self_closing}) {
240     !!!parse-error (type => 'nestc', token => $self->{ct});
241     ## NOTE: The |self_closing| flag is only set by start tag token.
242     ## In addition, when a start tag token is emitted, it is always set to
243     ## |ct|.
244     delete $self->{self_closing};
245     }
246    
247     if (@{$self->{token}}) {
248     $self->{self_closing} = $self->{token}->[0]->{self_closing};
249     return shift @{$self->{token}};
250     }
251    
252     A: {
253     if ($self->{state} == PCDATA_STATE) {
254     ## NOTE: Same as |DATA_STATE|, but only for |PCDATA| content model.
255    
256     if ($self->{nc} == 0x0026) { # &
257     !!!cp (0.1);
258     ## NOTE: In the spec, the tokenizer is switched to the
259     ## "entity data state". In this implementation, the tokenizer
260     ## is switched to the |ENTITY_STATE|, which is an implementation
261     ## of the "consume a character reference" algorithm.
262     $self->{entity_add} = -1;
263     $self->{prev_state} = DATA_STATE;
264     $self->{state} = ENTITY_STATE;
265     !!!next-input-character;
266     redo A;
267     } elsif ($self->{nc} == 0x003C) { # <
268     !!!cp (0.2);
269     $self->{state} = TAG_OPEN_STATE;
270     !!!next-input-character;
271     redo A;
272     } elsif ($self->{nc} == -1) {
273     !!!cp (0.3);
274     !!!emit ({type => END_OF_FILE_TOKEN,
275     line => $self->{line}, column => $self->{column}});
276     last A; ## TODO: ok?
277     } else {
278     !!!cp (0.4);
279     #
280     }
281    
282     # Anything else
283     my $token = {type => CHARACTER_TOKEN,
284     data => chr $self->{nc},
285     line => $self->{line}, column => $self->{column},
286     };
287     $self->{read_until}->($token->{data}, q[<&], length $token->{data});
288    
289     ## Stay in the state.
290     !!!next-input-character;
291     !!!emit ($token);
292     redo A;
293     } elsif ($self->{state} == DATA_STATE) {
294     $self->{s_kwd} = '' unless defined $self->{s_kwd};
295     if ($self->{nc} == 0x0026) { # &
296     $self->{s_kwd} = '';
297     if ($self->{content_model} & CM_ENTITY and # PCDATA | RCDATA
298     not $self->{escape}) {
299     !!!cp (1);
300     ## NOTE: In the spec, the tokenizer is switched to the
301     ## "entity data state". In this implementation, the tokenizer
302     ## is switched to the |ENTITY_STATE|, which is an implementation
303     ## of the "consume a character reference" algorithm.
304     $self->{entity_add} = -1;
305     $self->{prev_state} = DATA_STATE;
306     $self->{state} = ENTITY_STATE;
307     !!!next-input-character;
308     redo A;
309     } else {
310     !!!cp (2);
311     #
312     }
313     } elsif ($self->{nc} == 0x002D) { # -
314     if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
315     $self->{s_kwd} .= '-';
316    
317     if ($self->{s_kwd} eq '<!--') {
318     !!!cp (3);
319     $self->{escape} = 1; # unless $self->{escape};
320     $self->{s_kwd} = '--';
321     #
322     } elsif ($self->{s_kwd} eq '---') {
323     !!!cp (4);
324     $self->{s_kwd} = '--';
325     #
326     } else {
327     !!!cp (5);
328     #
329     }
330     }
331    
332     #
333     } elsif ($self->{nc} == 0x0021) { # !
334     if (length $self->{s_kwd}) {
335     !!!cp (5.1);
336     $self->{s_kwd} .= '!';
337     #
338     } else {
339     !!!cp (5.2);
340     #$self->{s_kwd} = '';
341     #
342     }
343     #
344     } elsif ($self->{nc} == 0x003C) { # <
345     if ($self->{content_model} & CM_FULL_MARKUP or # PCDATA
346     (($self->{content_model} & CM_LIMITED_MARKUP) and # CDATA | RCDATA
347     not $self->{escape})) {
348     !!!cp (6);
349     $self->{state} = TAG_OPEN_STATE;
350     !!!next-input-character;
351     redo A;
352     } else {
353     !!!cp (7);
354     $self->{s_kwd} = '';
355     #
356     }
357     } elsif ($self->{nc} == 0x003E) { # >
358     if ($self->{escape} and
359     ($self->{content_model} & CM_LIMITED_MARKUP)) { # RCDATA | CDATA
360     if ($self->{s_kwd} eq '--') {
361     !!!cp (8);
362     delete $self->{escape};
363     } else {
364     !!!cp (9);
365     }
366     } else {
367     !!!cp (10);
368     }
369    
370     $self->{s_kwd} = '';
371     #
372     } elsif ($self->{nc} == -1) {
373     !!!cp (11);
374     $self->{s_kwd} = '';
375     !!!emit ({type => END_OF_FILE_TOKEN,
376     line => $self->{line}, column => $self->{column}});
377     last A; ## TODO: ok?
378     } else {
379     !!!cp (12);
380     $self->{s_kwd} = '';
381     #
382     }
383    
384     # Anything else
385     my $token = {type => CHARACTER_TOKEN,
386     data => chr $self->{nc},
387     line => $self->{line}, column => $self->{column},
388     };
389     if ($self->{read_until}->($token->{data}, q[-!<>&],
390     length $token->{data})) {
391     $self->{s_kwd} = '';
392     }
393    
394     ## Stay in the data state.
395     if ($self->{content_model} == PCDATA_CONTENT_MODEL) {
396     !!!cp (13);
397     $self->{state} = PCDATA_STATE;
398     } else {
399     !!!cp (14);
400     ## Stay in the state.
401     }
402     !!!next-input-character;
403     !!!emit ($token);
404     redo A;
405     } elsif ($self->{state} == TAG_OPEN_STATE) {
406     if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
407     if ($self->{nc} == 0x002F) { # /
408     !!!cp (15);
409     !!!next-input-character;
410     $self->{state} = CLOSE_TAG_OPEN_STATE;
411     redo A;
412     } elsif ($self->{nc} == 0x0021) { # !
413     !!!cp (15.1);
414     $self->{s_kwd} = '<' unless $self->{escape};
415     #
416     } else {
417     !!!cp (16);
418     #
419     }
420    
421     ## reconsume
422     $self->{state} = DATA_STATE;
423     !!!emit ({type => CHARACTER_TOKEN, data => '<',
424     line => $self->{line_prev},
425     column => $self->{column_prev},
426     });
427     redo A;
428     } elsif ($self->{content_model} & CM_FULL_MARKUP) { # PCDATA
429     if ($self->{nc} == 0x0021) { # !
430     !!!cp (17);
431     $self->{state} = MARKUP_DECLARATION_OPEN_STATE;
432     !!!next-input-character;
433     redo A;
434     } elsif ($self->{nc} == 0x002F) { # /
435     !!!cp (18);
436     $self->{state} = CLOSE_TAG_OPEN_STATE;
437     !!!next-input-character;
438     redo A;
439     } elsif (0x0041 <= $self->{nc} and
440     $self->{nc} <= 0x005A) { # A..Z
441     !!!cp (19);
442     $self->{ct}
443     = {type => START_TAG_TOKEN,
444 wakaba 1.4 tag_name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
445 wakaba 1.1 line => $self->{line_prev},
446     column => $self->{column_prev}};
447     $self->{state} = TAG_NAME_STATE;
448     !!!next-input-character;
449     redo A;
450     } elsif (0x0061 <= $self->{nc} and
451     $self->{nc} <= 0x007A) { # a..z
452     !!!cp (20);
453     $self->{ct} = {type => START_TAG_TOKEN,
454     tag_name => chr ($self->{nc}),
455     line => $self->{line_prev},
456     column => $self->{column_prev}};
457     $self->{state} = TAG_NAME_STATE;
458     !!!next-input-character;
459     redo A;
460     } elsif ($self->{nc} == 0x003E) { # >
461     !!!cp (21);
462     !!!parse-error (type => 'empty start tag',
463     line => $self->{line_prev},
464     column => $self->{column_prev});
465     $self->{state} = DATA_STATE;
466     !!!next-input-character;
467    
468     !!!emit ({type => CHARACTER_TOKEN, data => '<>',
469     line => $self->{line_prev},
470     column => $self->{column_prev},
471     });
472    
473     redo A;
474     } elsif ($self->{nc} == 0x003F) { # ?
475     !!!cp (22);
476     !!!parse-error (type => 'pio',
477     line => $self->{line_prev},
478     column => $self->{column_prev});
479     $self->{state} = BOGUS_COMMENT_STATE;
480     $self->{ct} = {type => COMMENT_TOKEN, data => '',
481     line => $self->{line_prev},
482     column => $self->{column_prev},
483     };
484     ## $self->{nc} is intentionally left as is
485     redo A;
486     } else {
487     !!!cp (23);
488     !!!parse-error (type => 'bare stago',
489     line => $self->{line_prev},
490     column => $self->{column_prev});
491     $self->{state} = DATA_STATE;
492     ## reconsume
493    
494     !!!emit ({type => CHARACTER_TOKEN, data => '<',
495     line => $self->{line_prev},
496     column => $self->{column_prev},
497     });
498    
499     redo A;
500     }
501     } else {
502     die "$0: $self->{content_model} in tag open";
503     }
504     } elsif ($self->{state} == CLOSE_TAG_OPEN_STATE) {
505     ## NOTE: The "close tag open state" in the spec is implemented as
506     ## |CLOSE_TAG_OPEN_STATE| and |CDATA_RCDATA_CLOSE_TAG_STATE|.
507    
508     my ($l, $c) = ($self->{line_prev}, $self->{column_prev} - 1); # "<"of"</"
509     if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
510     if (defined $self->{last_stag_name}) {
511     $self->{state} = CDATA_RCDATA_CLOSE_TAG_STATE;
512     $self->{s_kwd} = '';
513     ## Reconsume.
514     redo A;
515     } else {
516     ## No start tag token has ever been emitted
517     ## NOTE: See <http://krijnhoetmer.nl/irc-logs/whatwg/20070626#l-564>.
518     !!!cp (28);
519     $self->{state} = DATA_STATE;
520     ## Reconsume.
521     !!!emit ({type => CHARACTER_TOKEN, data => '</',
522     line => $l, column => $c,
523     });
524     redo A;
525     }
526     }
527    
528     if (0x0041 <= $self->{nc} and
529     $self->{nc} <= 0x005A) { # A..Z
530     !!!cp (29);
531     $self->{ct}
532     = {type => END_TAG_TOKEN,
533 wakaba 1.4 tag_name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
534 wakaba 1.1 line => $l, column => $c};
535     $self->{state} = TAG_NAME_STATE;
536     !!!next-input-character;
537     redo A;
538     } elsif (0x0061 <= $self->{nc} and
539     $self->{nc} <= 0x007A) { # a..z
540     !!!cp (30);
541     $self->{ct} = {type => END_TAG_TOKEN,
542     tag_name => chr ($self->{nc}),
543     line => $l, column => $c};
544     $self->{state} = TAG_NAME_STATE;
545     !!!next-input-character;
546     redo A;
547     } elsif ($self->{nc} == 0x003E) { # >
548     !!!cp (31);
549     !!!parse-error (type => 'empty end tag',
550     line => $self->{line_prev}, ## "<" in "</>"
551     column => $self->{column_prev} - 1);
552     $self->{state} = DATA_STATE;
553     !!!next-input-character;
554     redo A;
555     } elsif ($self->{nc} == -1) {
556     !!!cp (32);
557     !!!parse-error (type => 'bare etago');
558     $self->{state} = DATA_STATE;
559     # reconsume
560    
561     !!!emit ({type => CHARACTER_TOKEN, data => '</',
562     line => $l, column => $c,
563     });
564    
565     redo A;
566     } else {
567     !!!cp (33);
568     !!!parse-error (type => 'bogus end tag');
569     $self->{state} = BOGUS_COMMENT_STATE;
570     $self->{ct} = {type => COMMENT_TOKEN, data => '',
571     line => $self->{line_prev}, # "<" of "</"
572     column => $self->{column_prev} - 1,
573     };
574     ## NOTE: $self->{nc} is intentionally left as is.
575     ## Although the "anything else" case of the spec not explicitly
576     ## states that the next input character is to be reconsumed,
577     ## it will be included to the |data| of the comment token
578     ## generated from the bogus end tag, as defined in the
579     ## "bogus comment state" entry.
580     redo A;
581     }
582     } elsif ($self->{state} == CDATA_RCDATA_CLOSE_TAG_STATE) {
583     my $ch = substr $self->{last_stag_name}, length $self->{s_kwd}, 1;
584     if (length $ch) {
585     my $CH = $ch;
586     $ch =~ tr/a-z/A-Z/;
587     my $nch = chr $self->{nc};
588     if ($nch eq $ch or $nch eq $CH) {
589     !!!cp (24);
590     ## Stay in the state.
591     $self->{s_kwd} .= $nch;
592     !!!next-input-character;
593     redo A;
594     } else {
595     !!!cp (25);
596     $self->{state} = DATA_STATE;
597     ## Reconsume.
598     !!!emit ({type => CHARACTER_TOKEN,
599     data => '</' . $self->{s_kwd},
600     line => $self->{line_prev},
601     column => $self->{column_prev} - 1 - length $self->{s_kwd},
602     });
603     redo A;
604     }
605     } else { # after "<{tag-name}"
606     unless ($is_space->{$self->{nc}} or
607     {
608     0x003E => 1, # >
609     0x002F => 1, # /
610     -1 => 1, # EOF
611     }->{$self->{nc}}) {
612     !!!cp (26);
613     ## Reconsume.
614     $self->{state} = DATA_STATE;
615     !!!emit ({type => CHARACTER_TOKEN,
616     data => '</' . $self->{s_kwd},
617     line => $self->{line_prev},
618     column => $self->{column_prev} - 1 - length $self->{s_kwd},
619     });
620     redo A;
621     } else {
622     !!!cp (27);
623     $self->{ct}
624     = {type => END_TAG_TOKEN,
625     tag_name => $self->{last_stag_name},
626     line => $self->{line_prev},
627     column => $self->{column_prev} - 1 - length $self->{s_kwd}};
628     $self->{state} = TAG_NAME_STATE;
629     ## Reconsume.
630     redo A;
631     }
632     }
633     } elsif ($self->{state} == TAG_NAME_STATE) {
634     if ($is_space->{$self->{nc}}) {
635     !!!cp (34);
636     $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
637     !!!next-input-character;
638     redo A;
639     } elsif ($self->{nc} == 0x003E) { # >
640     if ($self->{ct}->{type} == START_TAG_TOKEN) {
641     !!!cp (35);
642     $self->{last_stag_name} = $self->{ct}->{tag_name};
643     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
644     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
645     #if ($self->{ct}->{attributes}) {
646     # ## NOTE: This should never be reached.
647     # !!! cp (36);
648     # !!! parse-error (type => 'end tag attribute');
649     #} else {
650     !!!cp (37);
651     #}
652     } else {
653     die "$0: $self->{ct}->{type}: Unknown token type";
654     }
655     $self->{state} = DATA_STATE;
656     !!!next-input-character;
657    
658     !!!emit ($self->{ct}); # start tag or end tag
659    
660     redo A;
661     } elsif (0x0041 <= $self->{nc} and
662     $self->{nc} <= 0x005A) { # A..Z
663     !!!cp (38);
664 wakaba 1.4 $self->{ct}->{tag_name}
665     .= chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020));
666 wakaba 1.1 # start tag or end tag
667     ## Stay in this state
668     !!!next-input-character;
669     redo A;
670     } elsif ($self->{nc} == -1) {
671     !!!parse-error (type => 'unclosed tag');
672     if ($self->{ct}->{type} == START_TAG_TOKEN) {
673     !!!cp (39);
674     $self->{last_stag_name} = $self->{ct}->{tag_name};
675     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
676     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
677     #if ($self->{ct}->{attributes}) {
678     # ## NOTE: This state should never be reached.
679     # !!! cp (40);
680     # !!! parse-error (type => 'end tag attribute');
681     #} else {
682     !!!cp (41);
683     #}
684     } else {
685     die "$0: $self->{ct}->{type}: Unknown token type";
686     }
687     $self->{state} = DATA_STATE;
688     # reconsume
689    
690     !!!emit ($self->{ct}); # start tag or end tag
691    
692     redo A;
693     } elsif ($self->{nc} == 0x002F) { # /
694     !!!cp (42);
695     $self->{state} = SELF_CLOSING_START_TAG_STATE;
696     !!!next-input-character;
697     redo A;
698     } else {
699     !!!cp (44);
700     $self->{ct}->{tag_name} .= chr $self->{nc};
701     # start tag or end tag
702     ## Stay in the state
703     !!!next-input-character;
704     redo A;
705     }
706     } elsif ($self->{state} == BEFORE_ATTRIBUTE_NAME_STATE) {
707     if ($is_space->{$self->{nc}}) {
708     !!!cp (45);
709     ## Stay in the state
710     !!!next-input-character;
711     redo A;
712     } elsif ($self->{nc} == 0x003E) { # >
713     if ($self->{ct}->{type} == START_TAG_TOKEN) {
714     !!!cp (46);
715     $self->{last_stag_name} = $self->{ct}->{tag_name};
716     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
717     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
718     if ($self->{ct}->{attributes}) {
719     !!!cp (47);
720     !!!parse-error (type => 'end tag attribute');
721     } else {
722     !!!cp (48);
723     }
724     } else {
725     die "$0: $self->{ct}->{type}: Unknown token type";
726     }
727     $self->{state} = DATA_STATE;
728     !!!next-input-character;
729    
730     !!!emit ($self->{ct}); # start tag or end tag
731    
732     redo A;
733     } elsif (0x0041 <= $self->{nc} and
734     $self->{nc} <= 0x005A) { # A..Z
735     !!!cp (49);
736     $self->{ca}
737 wakaba 1.4 = {name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
738 wakaba 1.1 value => '',
739     line => $self->{line}, column => $self->{column}};
740     $self->{state} = ATTRIBUTE_NAME_STATE;
741     !!!next-input-character;
742     redo A;
743     } elsif ($self->{nc} == 0x002F) { # /
744     !!!cp (50);
745     $self->{state} = SELF_CLOSING_START_TAG_STATE;
746     !!!next-input-character;
747     redo A;
748     } elsif ($self->{nc} == -1) {
749     !!!parse-error (type => 'unclosed tag');
750     if ($self->{ct}->{type} == START_TAG_TOKEN) {
751     !!!cp (52);
752     $self->{last_stag_name} = $self->{ct}->{tag_name};
753     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
754     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
755     if ($self->{ct}->{attributes}) {
756     !!!cp (53);
757     !!!parse-error (type => 'end tag attribute');
758     } else {
759     !!!cp (54);
760     }
761     } else {
762     die "$0: $self->{ct}->{type}: Unknown token type";
763     }
764     $self->{state} = DATA_STATE;
765     # reconsume
766    
767     !!!emit ($self->{ct}); # start tag or end tag
768    
769     redo A;
770     } else {
771     if ({
772     0x0022 => 1, # "
773     0x0027 => 1, # '
774     0x003D => 1, # =
775     }->{$self->{nc}}) {
776     !!!cp (55);
777     !!!parse-error (type => 'bad attribute name');
778     } else {
779     !!!cp (56);
780     }
781     $self->{ca}
782     = {name => chr ($self->{nc}),
783     value => '',
784     line => $self->{line}, column => $self->{column}};
785     $self->{state} = ATTRIBUTE_NAME_STATE;
786     !!!next-input-character;
787     redo A;
788     }
789     } elsif ($self->{state} == ATTRIBUTE_NAME_STATE) {
790     my $before_leave = sub {
791     if (exists $self->{ct}->{attributes} # start tag or end tag
792     ->{$self->{ca}->{name}}) { # MUST
793     !!!cp (57);
794     !!!parse-error (type => 'duplicate attribute', text => $self->{ca}->{name}, line => $self->{ca}->{line}, column => $self->{ca}->{column});
795     ## Discard $self->{ca} # MUST
796     } else {
797     !!!cp (58);
798     $self->{ct}->{attributes}->{$self->{ca}->{name}}
799     = $self->{ca};
800     }
801     }; # $before_leave
802    
803     if ($is_space->{$self->{nc}}) {
804     !!!cp (59);
805     $before_leave->();
806     $self->{state} = AFTER_ATTRIBUTE_NAME_STATE;
807     !!!next-input-character;
808     redo A;
809     } elsif ($self->{nc} == 0x003D) { # =
810     !!!cp (60);
811     $before_leave->();
812     $self->{state} = BEFORE_ATTRIBUTE_VALUE_STATE;
813     !!!next-input-character;
814     redo A;
815     } elsif ($self->{nc} == 0x003E) { # >
816     $before_leave->();
817     if ($self->{ct}->{type} == START_TAG_TOKEN) {
818     !!!cp (61);
819     $self->{last_stag_name} = $self->{ct}->{tag_name};
820     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
821     !!!cp (62);
822     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
823     if ($self->{ct}->{attributes}) {
824     !!!parse-error (type => 'end tag attribute');
825     }
826     } else {
827     die "$0: $self->{ct}->{type}: Unknown token type";
828     }
829     $self->{state} = DATA_STATE;
830     !!!next-input-character;
831    
832     !!!emit ($self->{ct}); # start tag or end tag
833    
834     redo A;
835     } elsif (0x0041 <= $self->{nc} and
836     $self->{nc} <= 0x005A) { # A..Z
837     !!!cp (63);
838 wakaba 1.4 $self->{ca}->{name}
839     .= chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020));
840 wakaba 1.1 ## Stay in the state
841     !!!next-input-character;
842     redo A;
843     } elsif ($self->{nc} == 0x002F) { # /
844     !!!cp (64);
845     $before_leave->();
846     $self->{state} = SELF_CLOSING_START_TAG_STATE;
847     !!!next-input-character;
848     redo A;
849     } elsif ($self->{nc} == -1) {
850     !!!parse-error (type => 'unclosed tag');
851     $before_leave->();
852     if ($self->{ct}->{type} == START_TAG_TOKEN) {
853     !!!cp (66);
854     $self->{last_stag_name} = $self->{ct}->{tag_name};
855     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
856     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
857     if ($self->{ct}->{attributes}) {
858     !!!cp (67);
859     !!!parse-error (type => 'end tag attribute');
860     } else {
861     ## NOTE: This state should never be reached.
862     !!!cp (68);
863     }
864     } else {
865     die "$0: $self->{ct}->{type}: Unknown token type";
866     }
867     $self->{state} = DATA_STATE;
868     # reconsume
869    
870     !!!emit ($self->{ct}); # start tag or end tag
871    
872     redo A;
873     } else {
874     if ($self->{nc} == 0x0022 or # "
875     $self->{nc} == 0x0027) { # '
876     !!!cp (69);
877     !!!parse-error (type => 'bad attribute name');
878     } else {
879     !!!cp (70);
880     }
881     $self->{ca}->{name} .= chr ($self->{nc});
882     ## Stay in the state
883     !!!next-input-character;
884     redo A;
885     }
886     } elsif ($self->{state} == AFTER_ATTRIBUTE_NAME_STATE) {
887     if ($is_space->{$self->{nc}}) {
888     !!!cp (71);
889     ## Stay in the state
890     !!!next-input-character;
891     redo A;
892     } elsif ($self->{nc} == 0x003D) { # =
893     !!!cp (72);
894     $self->{state} = BEFORE_ATTRIBUTE_VALUE_STATE;
895     !!!next-input-character;
896     redo A;
897     } elsif ($self->{nc} == 0x003E) { # >
898     if ($self->{ct}->{type} == START_TAG_TOKEN) {
899     !!!cp (73);
900     $self->{last_stag_name} = $self->{ct}->{tag_name};
901     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
902     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
903     if ($self->{ct}->{attributes}) {
904     !!!cp (74);
905     !!!parse-error (type => 'end tag attribute');
906     } else {
907     ## NOTE: This state should never be reached.
908     !!!cp (75);
909     }
910     } else {
911     die "$0: $self->{ct}->{type}: Unknown token type";
912     }
913     $self->{state} = DATA_STATE;
914     !!!next-input-character;
915    
916     !!!emit ($self->{ct}); # start tag or end tag
917    
918     redo A;
919     } elsif (0x0041 <= $self->{nc} and
920     $self->{nc} <= 0x005A) { # A..Z
921     !!!cp (76);
922     $self->{ca}
923 wakaba 1.4 = {name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
924 wakaba 1.1 value => '',
925     line => $self->{line}, column => $self->{column}};
926     $self->{state} = ATTRIBUTE_NAME_STATE;
927     !!!next-input-character;
928     redo A;
929     } elsif ($self->{nc} == 0x002F) { # /
930     !!!cp (77);
931     $self->{state} = SELF_CLOSING_START_TAG_STATE;
932     !!!next-input-character;
933     redo A;
934     } elsif ($self->{nc} == -1) {
935     !!!parse-error (type => 'unclosed tag');
936     if ($self->{ct}->{type} == START_TAG_TOKEN) {
937     !!!cp (79);
938     $self->{last_stag_name} = $self->{ct}->{tag_name};
939     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
940     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
941     if ($self->{ct}->{attributes}) {
942     !!!cp (80);
943     !!!parse-error (type => 'end tag attribute');
944     } else {
945     ## NOTE: This state should never be reached.
946     !!!cp (81);
947     }
948     } else {
949     die "$0: $self->{ct}->{type}: Unknown token type";
950     }
951     $self->{state} = DATA_STATE;
952     # reconsume
953    
954     !!!emit ($self->{ct}); # start tag or end tag
955    
956     redo A;
957     } else {
958     if ($self->{nc} == 0x0022 or # "
959     $self->{nc} == 0x0027) { # '
960     !!!cp (78);
961     !!!parse-error (type => 'bad attribute name');
962     } else {
963     !!!cp (82);
964     }
965     $self->{ca}
966     = {name => chr ($self->{nc}),
967     value => '',
968     line => $self->{line}, column => $self->{column}};
969     $self->{state} = ATTRIBUTE_NAME_STATE;
970     !!!next-input-character;
971     redo A;
972     }
973     } elsif ($self->{state} == BEFORE_ATTRIBUTE_VALUE_STATE) {
974     if ($is_space->{$self->{nc}}) {
975     !!!cp (83);
976     ## Stay in the state
977     !!!next-input-character;
978     redo A;
979     } elsif ($self->{nc} == 0x0022) { # "
980     !!!cp (84);
981     $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
982     !!!next-input-character;
983     redo A;
984     } elsif ($self->{nc} == 0x0026) { # &
985     !!!cp (85);
986     $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
987     ## reconsume
988     redo A;
989     } elsif ($self->{nc} == 0x0027) { # '
990     !!!cp (86);
991     $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
992     !!!next-input-character;
993     redo A;
994     } elsif ($self->{nc} == 0x003E) { # >
995     !!!parse-error (type => 'empty unquoted attribute value');
996     if ($self->{ct}->{type} == START_TAG_TOKEN) {
997     !!!cp (87);
998     $self->{last_stag_name} = $self->{ct}->{tag_name};
999     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1000     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1001     if ($self->{ct}->{attributes}) {
1002     !!!cp (88);
1003     !!!parse-error (type => 'end tag attribute');
1004     } else {
1005     ## NOTE: This state should never be reached.
1006     !!!cp (89);
1007     }
1008     } else {
1009     die "$0: $self->{ct}->{type}: Unknown token type";
1010     }
1011     $self->{state} = DATA_STATE;
1012     !!!next-input-character;
1013    
1014     !!!emit ($self->{ct}); # start tag or end tag
1015    
1016     redo A;
1017     } elsif ($self->{nc} == -1) {
1018     !!!parse-error (type => 'unclosed tag');
1019     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1020     !!!cp (90);
1021     $self->{last_stag_name} = $self->{ct}->{tag_name};
1022     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1023     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1024     if ($self->{ct}->{attributes}) {
1025     !!!cp (91);
1026     !!!parse-error (type => 'end tag attribute');
1027     } else {
1028     ## NOTE: This state should never be reached.
1029     !!!cp (92);
1030     }
1031     } else {
1032     die "$0: $self->{ct}->{type}: Unknown token type";
1033     }
1034     $self->{state} = DATA_STATE;
1035     ## reconsume
1036    
1037     !!!emit ($self->{ct}); # start tag or end tag
1038    
1039     redo A;
1040     } else {
1041     if ($self->{nc} == 0x003D) { # =
1042     !!!cp (93);
1043     !!!parse-error (type => 'bad attribute value');
1044     } else {
1045     !!!cp (94);
1046     }
1047     $self->{ca}->{value} .= chr ($self->{nc});
1048     $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
1049     !!!next-input-character;
1050     redo A;
1051     }
1052     } elsif ($self->{state} == ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE) {
1053     if ($self->{nc} == 0x0022) { # "
1054     !!!cp (95);
1055     $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1056     !!!next-input-character;
1057     redo A;
1058     } elsif ($self->{nc} == 0x0026) { # &
1059     !!!cp (96);
1060     ## NOTE: In the spec, the tokenizer is switched to the
1061     ## "entity in attribute value state". In this implementation, the
1062     ## tokenizer is switched to the |ENTITY_STATE|, which is an
1063     ## implementation of the "consume a character reference" algorithm.
1064     $self->{prev_state} = $self->{state};
1065     $self->{entity_add} = 0x0022; # "
1066     $self->{state} = ENTITY_STATE;
1067     !!!next-input-character;
1068     redo A;
1069     } elsif ($self->{nc} == -1) {
1070     !!!parse-error (type => 'unclosed attribute value');
1071     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1072     !!!cp (97);
1073     $self->{last_stag_name} = $self->{ct}->{tag_name};
1074     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1075     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1076     if ($self->{ct}->{attributes}) {
1077     !!!cp (98);
1078     !!!parse-error (type => 'end tag attribute');
1079     } else {
1080     ## NOTE: This state should never be reached.
1081     !!!cp (99);
1082     }
1083     } else {
1084     die "$0: $self->{ct}->{type}: Unknown token type";
1085     }
1086     $self->{state} = DATA_STATE;
1087     ## reconsume
1088    
1089     !!!emit ($self->{ct}); # start tag or end tag
1090    
1091     redo A;
1092     } else {
1093     !!!cp (100);
1094     $self->{ca}->{value} .= chr ($self->{nc});
1095     $self->{read_until}->($self->{ca}->{value},
1096     q["&],
1097     length $self->{ca}->{value});
1098    
1099     ## Stay in the state
1100     !!!next-input-character;
1101     redo A;
1102     }
1103     } elsif ($self->{state} == ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE) {
1104     if ($self->{nc} == 0x0027) { # '
1105     !!!cp (101);
1106     $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1107     !!!next-input-character;
1108     redo A;
1109     } elsif ($self->{nc} == 0x0026) { # &
1110     !!!cp (102);
1111     ## NOTE: In the spec, the tokenizer is switched to the
1112     ## "entity in attribute value state". In this implementation, the
1113     ## tokenizer is switched to the |ENTITY_STATE|, which is an
1114     ## implementation of the "consume a character reference" algorithm.
1115     $self->{entity_add} = 0x0027; # '
1116     $self->{prev_state} = $self->{state};
1117     $self->{state} = ENTITY_STATE;
1118     !!!next-input-character;
1119     redo A;
1120     } elsif ($self->{nc} == -1) {
1121     !!!parse-error (type => 'unclosed attribute value');
1122     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1123     !!!cp (103);
1124     $self->{last_stag_name} = $self->{ct}->{tag_name};
1125     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1126     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1127     if ($self->{ct}->{attributes}) {
1128     !!!cp (104);
1129     !!!parse-error (type => 'end tag attribute');
1130     } else {
1131     ## NOTE: This state should never be reached.
1132     !!!cp (105);
1133     }
1134     } else {
1135     die "$0: $self->{ct}->{type}: Unknown token type";
1136     }
1137     $self->{state} = DATA_STATE;
1138     ## reconsume
1139    
1140     !!!emit ($self->{ct}); # start tag or end tag
1141    
1142     redo A;
1143     } else {
1144     !!!cp (106);
1145     $self->{ca}->{value} .= chr ($self->{nc});
1146     $self->{read_until}->($self->{ca}->{value},
1147     q['&],
1148     length $self->{ca}->{value});
1149    
1150     ## Stay in the state
1151     !!!next-input-character;
1152     redo A;
1153     }
1154     } elsif ($self->{state} == ATTRIBUTE_VALUE_UNQUOTED_STATE) {
         ## Unquoted attribute value.  Characters are accumulated into
         ## |$self->{ca}->{value}| until a space character, |>| or the end
         ## of the input is seen.
1155     if ($is_space->{$self->{nc}}) {
1156     !!!cp (107);
         ## A space character ends the value.
1157     $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1158     !!!next-input-character;
1159     redo A;
1160     } elsif ($self->{nc} == 0x0026) { # &
1161     !!!cp (108);
1162     ## NOTE: In the spec, the tokenizer is switched to the
1163     ## "entity in attribute value state". In this implementation, the
1164     ## tokenizer is switched to the |ENTITY_STATE|, which is an
1165     ## implementation of the "consume a character reference" algorithm.
         ## The quoted-value states store their quote character in
         ## |entity_add|; here it is set to -1 (presumably meaning "no
         ## additional character" -- confirm against |ENTITY_STATE|).
1166     $self->{entity_add} = -1;
         ## Remember the current state in |prev_state| before switching.
1167     $self->{prev_state} = $self->{state};
1168     $self->{state} = ENTITY_STATE;
1169     !!!next-input-character;
1170     redo A;
1171     } elsif ($self->{nc} == 0x003E) { # >
         ## End of the tag: do the per-token-type bookkeeping, then emit.
1172     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1173     !!!cp (109);
1174     $self->{last_stag_name} = $self->{ct}->{tag_name};
1175     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1176     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1177     if ($self->{ct}->{attributes}) {
1178     !!!cp (110);
1179     !!!parse-error (type => 'end tag attribute');
1180     } else {
1181     ## NOTE: This state should never be reached.
1182     !!!cp (111);
1183     }
1184     } else {
1185     die "$0: $self->{ct}->{type}: Unknown token type";
1186     }
1187     $self->{state} = DATA_STATE;
1188     !!!next-input-character;
1189    
1190     !!!emit ($self->{ct}); # start tag or end tag
1191    
1192     redo A;
1193     } elsif ($self->{nc} == -1) {
         ## End of input inside the tag: report it, emit the token built
         ## so far, and leave the EOF for the data state to process.
1194     !!!parse-error (type => 'unclosed tag');
1195     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1196     !!!cp (112);
1197     $self->{last_stag_name} = $self->{ct}->{tag_name};
1198     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1199     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1200     if ($self->{ct}->{attributes}) {
1201     !!!cp (113);
1202     !!!parse-error (type => 'end tag attribute');
1203     } else {
1204     ## NOTE: This state should never be reached.
1205     !!!cp (114);
1206     }
1207     } else {
1208     die "$0: $self->{ct}->{type}: Unknown token type";
1209     }
1210     $self->{state} = DATA_STATE;
1211     ## reconsume
1212    
1213     !!!emit ($self->{ct}); # start tag or end tag
1214    
1215     redo A;
1216     } else {
         ## Any other character belongs to the value.  |"|, |'| and |=|
         ## are reported as errors but are still appended below.
1217     if ({
1218     0x0022 => 1, # "
1219     0x0027 => 1, # '
1220     0x003D => 1, # =
1221     }->{$self->{nc}}) {
1222     !!!cp (115);
1223     !!!parse-error (type => 'bad attribute value');
1224     } else {
1225     !!!cp (116);
1226     }
1227     $self->{ca}->{value} .= chr ($self->{nc});
         ## Bulk read: the arguments are the value buffer, a list of
         ## delimiter characters, and the current length of the buffer.
         ## NOTE(review): The delimiter list names only U+0020 as
         ## white space, while the first branch of this state tests
         ## |$is_space|.  Confirm that |read_until| itself stops at the
         ## other space characters; otherwise they would be appended to
         ## the attribute value.
1228     $self->{read_until}->($self->{ca}->{value},
1229     q["'=& >],
1230     length $self->{ca}->{value});
1231    
1232     ## Stay in the state
1233     !!!next-input-character;
1234     redo A;
1235     }
1236     } elsif ($self->{state} == AFTER_ATTRIBUTE_VALUE_QUOTED_STATE) {
         ## Entered from the quoted attribute value states when the
         ## closing quote has been consumed.
1237     if ($is_space->{$self->{nc}}) {
1238     !!!cp (118);
1239     $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1240     !!!next-input-character;
1241     redo A;
1242     } elsif ($self->{nc} == 0x003E) { # >
         ## End of the tag: do the per-token-type bookkeeping, then emit.
1243     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1244     !!!cp (119);
1245     $self->{last_stag_name} = $self->{ct}->{tag_name};
1246     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1247     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1248     if ($self->{ct}->{attributes}) {
1249     !!!cp (120);
1250     !!!parse-error (type => 'end tag attribute');
1251     } else {
1252     ## NOTE: This state should never be reached.
1253     !!!cp (121);
1254     }
1255     } else {
1256     die "$0: $self->{ct}->{type}: Unknown token type";
1257     }
1258     $self->{state} = DATA_STATE;
1259     !!!next-input-character;
1260    
1261     !!!emit ($self->{ct}); # start tag or end tag
1262    
1263     redo A;
1264     } elsif ($self->{nc} == 0x002F) { # /
1265     !!!cp (122);
1266     $self->{state} = SELF_CLOSING_START_TAG_STATE;
1267     !!!next-input-character;
1268     redo A;
1269     } elsif ($self->{nc} == -1) {
         ## End of input inside the tag: report it, emit the token built
         ## so far, and leave the EOF for the data state to process.
1270     !!!parse-error (type => 'unclosed tag');
1271     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1272     !!!cp (122.3);
1273     $self->{last_stag_name} = $self->{ct}->{tag_name};
1274     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
         ## NOTE(review): Unlike the |>| branch of this state, the
         ## content model is not reset to |PCDATA_CONTENT_MODEL| here.
         ## Confirm whether the omission is intentional.
1275     if ($self->{ct}->{attributes}) {
1276     !!!cp (122.1);
1277     !!!parse-error (type => 'end tag attribute');
1278     } else {
1279     ## NOTE: This state should never be reached.
1280     !!!cp (122.2);
1281     }
1282     } else {
1283     die "$0: $self->{ct}->{type}: Unknown token type";
1284     }
1285     $self->{state} = DATA_STATE;
1286     ## Reconsume.
1287     !!!emit ($self->{ct}); # start tag or end tag
1288     redo A;
1289     } else {
         ## Anything else directly after the closing quote is an error.
         ## The character is not consumed here; it is processed again in
         ## the "before attribute name" state.
1290     !!!cp ('124.1');
1291     !!!parse-error (type => 'no space between attributes');
1292     $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1293     ## reconsume
1294     redo A;
1295     }
1296     } elsif ($self->{state} == SELF_CLOSING_START_TAG_STATE) {
         ## Entered after a |/| has been consumed inside a tag.
         ## NOTE(review): The checkpoint ids '124.4' and 124.5 are each
         ## used twice within this state.  Confirm that the coverage
         ## tooling does not need them to be unique.
1297     if ($self->{nc} == 0x003E) { # >
1298     if ($self->{ct}->{type} == END_TAG_TOKEN) {
         ## |/>| at the end of an end tag is an error.
1299     !!!cp ('124.2');
1300     !!!parse-error (type => 'nestc', token => $self->{ct});
1301     ## TODO: Different type than slash in start tag
1302     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1303     if ($self->{ct}->{attributes}) {
1304     !!!cp ('124.4');
1305     !!!parse-error (type => 'end tag attribute');
1306     } else {
1307     !!!cp ('124.5');
1308     }
1309     ## TODO: Test |<title></title/>|
1310     } else {
         ## Any other token type: record the self-closing flag on the
         ## tokenizer object.
         ## NOTE(review): |last_stag_name| is not updated in this
         ## branch, unlike the |>| handling of the other tag states.
         ## Confirm whether that is intentional.
1311     !!!cp ('124.3');
1312     $self->{self_closing} = 1;
1313     }
1314    
1315     $self->{state} = DATA_STATE;
1316     !!!next-input-character;
1317    
1318     !!!emit ($self->{ct}); # start tag or end tag
1319    
1320     redo A;
1321     } elsif ($self->{nc} == -1) {
         ## End of input inside the tag: report it, emit the token built
         ## so far, and leave the EOF for the data state to process.
1322     !!!parse-error (type => 'unclosed tag');
1323     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1324     !!!cp (124.7);
1325     $self->{last_stag_name} = $self->{ct}->{tag_name};
1326     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
         ## NOTE(review): Unlike the |>| branch of this state, the
         ## content model is not reset to |PCDATA_CONTENT_MODEL| here.
         ## Confirm whether the omission is intentional.
1327     if ($self->{ct}->{attributes}) {
1328     !!!cp (124.5);
1329     !!!parse-error (type => 'end tag attribute');
1330     } else {
1331     ## NOTE: This state should never be reached.
1332     !!!cp (124.6);
1333     }
1334     } else {
1335     die "$0: $self->{ct}->{type}: Unknown token type";
1336     }
1337     $self->{state} = DATA_STATE;
1338     ## Reconsume.
1339     !!!emit ($self->{ct}); # start tag or end tag
1340     redo A;
1341     } else {
         ## The |/| was not followed by |>|.  Report an error and process
         ## the current character again as the start of an attribute.
1342     !!!cp ('124.4');
1343     !!!parse-error (type => 'nestc');
1344     ## TODO: This error type is wrong.
1345     $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1346     ## Reconsume.
1347     redo A;
1348     }
1349     } elsif ($self->{state} == BOGUS_COMMENT_STATE) {
1350     ## (only happen if PCDATA state)
1351    
1352     ## NOTE: Unlike spec's "bogus comment state", this implementation
1353     ## consumes characters one-by-one basis.
1354    
1355     if ($self->{nc} == 0x003E) { # >
1356     !!!cp (124);
1357     $self->{state} = DATA_STATE;
1358     !!!next-input-character;
1359    
1360     !!!emit ($self->{ct}); # comment
1361     redo A;
1362     } elsif ($self->{nc} == -1) {
1363     !!!cp (125);
1364     $self->{state} = DATA_STATE;
1365     ## reconsume
1366    
1367     !!!emit ($self->{ct}); # comment
1368     redo A;
1369     } else {
1370     !!!cp (126);
1371     $self->{ct}->{data} .= chr ($self->{nc}); # comment
1372     $self->{read_until}->($self->{ct}->{data},
1373     q[>],
1374     length $self->{ct}->{data});
1375    
1376     ## Stay in the state.
1377     !!!next-input-character;
1378     redo A;
1379     }
1380     } elsif ($self->{state} == MARKUP_DECLARATION_OPEN_STATE) {
1381     ## (only happen if PCDATA state)
1382    
1383     if ($self->{nc} == 0x002D) { # -
1384     !!!cp (133);
1385     $self->{state} = MD_HYPHEN_STATE;
1386     !!!next-input-character;
1387     redo A;
1388     } elsif ($self->{nc} == 0x0044 or # D
1389     $self->{nc} == 0x0064) { # d
1390     ## ASCII case-insensitive.
1391     !!!cp (130);
1392     $self->{state} = MD_DOCTYPE_STATE;
1393     $self->{s_kwd} = chr $self->{nc};
1394     !!!next-input-character;
1395     redo A;
1396 wakaba 1.3 } elsif ((($self->{insertion_mode} & IN_FOREIGN_CONTENT_IM and
1397     $self->{open_elements}->[-1]->[1] & FOREIGN_EL) or
1398     $self->{is_xml}) and
1399 wakaba 1.1 $self->{nc} == 0x005B) { # [
1400     !!!cp (135.4);
1401     $self->{state} = MD_CDATA_STATE;
1402     $self->{s_kwd} = '[';
1403     !!!next-input-character;
1404     redo A;
1405     } else {
1406     !!!cp (136);
1407     }
1408    
1409     !!!parse-error (type => 'bogus comment',
1410     line => $self->{line_prev},
1411     column => $self->{column_prev} - 1);
1412     ## Reconsume.
1413     $self->{state} = BOGUS_COMMENT_STATE;
1414     $self->{ct} = {type => COMMENT_TOKEN, data => '',
1415     line => $self->{line_prev},
1416     column => $self->{column_prev} - 1,
1417     };
1418     redo A;
1419     } elsif ($self->{state} == MD_HYPHEN_STATE) {
1420     if ($self->{nc} == 0x002D) { # -
1421     !!!cp (127);
1422     $self->{ct} = {type => COMMENT_TOKEN, data => '',
1423     line => $self->{line_prev},
1424     column => $self->{column_prev} - 2,
1425     };
1426     $self->{state} = COMMENT_START_STATE;
1427     !!!next-input-character;
1428     redo A;
1429     } else {
1430     !!!cp (128);
1431     !!!parse-error (type => 'bogus comment',
1432     line => $self->{line_prev},
1433     column => $self->{column_prev} - 2);
1434     $self->{state} = BOGUS_COMMENT_STATE;
1435     ## Reconsume.
1436     $self->{ct} = {type => COMMENT_TOKEN,
1437     data => '-',
1438     line => $self->{line_prev},
1439     column => $self->{column_prev} - 2,
1440     };
1441     redo A;
1442     }
1443     } elsif ($self->{state} == MD_DOCTYPE_STATE) {
         ## Matches the rest of the keyword |DOCTYPE| after |<!|.
         ## |$self->{s_kwd}| holds the characters matched so far; the
         ## "markup declaration open" state seeds it with the |D| or |d|
         ## that was seen.  The two tables below are indexed by
         ## |length $self->{s_kwd}|, i.e. by the position of the character
         ## expected next, so index 0 is never used and holds |undef|.
1444     ## ASCII case-insensitive.
1445     if ($self->{nc} == [
1446     undef,
1447     0x004F, # O
1448     0x0043, # C
1449     0x0054, # T
1450     0x0059, # Y
1451     0x0050, # P
1452     ]->[length $self->{s_kwd}] or
1453     $self->{nc} == [
1454     undef,
1455     0x006F, # o
1456     0x0063, # c
1457     0x0074, # t
1458     0x0079, # y
1459     0x0070, # p
1460     ]->[length $self->{s_kwd}]) {
1461     !!!cp (131);
1462     ## Stay in the state.
1463     $self->{s_kwd} .= chr $self->{nc};
1464     !!!next-input-character;
1465     redo A;
1466     } elsif ((length $self->{s_kwd}) == 6 and
1467     ($self->{nc} == 0x0045 or # E
1468     $self->{nc} == 0x0065)) { # e
         ## Six characters are already matched and this is the final
         ## |E|/|e|: the keyword is complete.  The token starts with
         ## |quirks => 1|; the "before DOCTYPE name" state deletes the
         ## flag once a name character is seen.
1469     !!!cp (129);
1470     $self->{state} = DOCTYPE_STATE;
1471     $self->{ct} = {type => DOCTYPE_TOKEN,
1472     quirks => 1,
1473     line => $self->{line_prev},
         ## NOTE(review): The offset of 7 presumably moves the column
         ## back to the start of |<!DOCTYPE| -- confirm.
1474     column => $self->{column_prev} - 7,
1475     };
1476     !!!next-input-character;
1477     redo A;
1478     } else {
         ## Not the keyword after all.  The characters collected in
         ## |s_kwd| become the initial data of a bogus comment, and the
         ## current character is processed again in that state.
1479     !!!cp (132);
1480     !!!parse-error (type => 'bogus comment',
1481     line => $self->{line_prev},
1482     column => $self->{column_prev} - 1 - length $self->{s_kwd});
1483     $self->{state} = BOGUS_COMMENT_STATE;
1484     ## Reconsume.
1485     $self->{ct} = {type => COMMENT_TOKEN,
1486     data => $self->{s_kwd},
1487     line => $self->{line_prev},
1488     column => $self->{column_prev} - 1 - length $self->{s_kwd},
1489     };
1490     redo A;
1491     }
1492     } elsif ($self->{state} == MD_CDATA_STATE) {
1493     if ($self->{nc} == {
1494     '[' => 0x0043, # C
1495     '[C' => 0x0044, # D
1496     '[CD' => 0x0041, # A
1497     '[CDA' => 0x0054, # T
1498     '[CDAT' => 0x0041, # A
1499     }->{$self->{s_kwd}}) {
1500     !!!cp (135.1);
1501     ## Stay in the state.
1502     $self->{s_kwd} .= chr $self->{nc};
1503     !!!next-input-character;
1504     redo A;
1505     } elsif ($self->{s_kwd} eq '[CDATA' and
1506     $self->{nc} == 0x005B) { # [
1507     !!!cp (135.2);
1508     $self->{ct} = {type => CHARACTER_TOKEN,
1509     data => '',
1510     line => $self->{line_prev},
1511     column => $self->{column_prev} - 7};
1512     $self->{state} = CDATA_SECTION_STATE;
1513     !!!next-input-character;
1514     redo A;
1515     } else {
1516     !!!cp (135.3);
1517     !!!parse-error (type => 'bogus comment',
1518     line => $self->{line_prev},
1519     column => $self->{column_prev} - 1 - length $self->{s_kwd});
1520     $self->{state} = BOGUS_COMMENT_STATE;
1521     ## Reconsume.
1522     $self->{ct} = {type => COMMENT_TOKEN,
1523     data => $self->{s_kwd},
1524     line => $self->{line_prev},
1525     column => $self->{column_prev} - 1 - length $self->{s_kwd},
1526     };
1527     redo A;
1528     }
1529     } elsif ($self->{state} == COMMENT_START_STATE) {
1530     if ($self->{nc} == 0x002D) { # -
1531     !!!cp (137);
1532     $self->{state} = COMMENT_START_DASH_STATE;
1533     !!!next-input-character;
1534     redo A;
1535     } elsif ($self->{nc} == 0x003E) { # >
1536     !!!cp (138);
1537     !!!parse-error (type => 'bogus comment');
1538     $self->{state} = DATA_STATE;
1539     !!!next-input-character;
1540    
1541     !!!emit ($self->{ct}); # comment
1542    
1543     redo A;
1544     } elsif ($self->{nc} == -1) {
1545     !!!cp (139);
1546     !!!parse-error (type => 'unclosed comment');
1547     $self->{state} = DATA_STATE;
1548     ## reconsume
1549    
1550     !!!emit ($self->{ct}); # comment
1551    
1552     redo A;
1553     } else {
1554     !!!cp (140);
1555     $self->{ct}->{data} # comment
1556     .= chr ($self->{nc});
1557     $self->{state} = COMMENT_STATE;
1558     !!!next-input-character;
1559     redo A;
1560     }
1561     } elsif ($self->{state} == COMMENT_START_DASH_STATE) {
1562     if ($self->{nc} == 0x002D) { # -
1563     !!!cp (141);
1564     $self->{state} = COMMENT_END_STATE;
1565     !!!next-input-character;
1566     redo A;
1567     } elsif ($self->{nc} == 0x003E) { # >
1568     !!!cp (142);
1569     !!!parse-error (type => 'bogus comment');
1570     $self->{state} = DATA_STATE;
1571     !!!next-input-character;
1572    
1573     !!!emit ($self->{ct}); # comment
1574    
1575     redo A;
1576     } elsif ($self->{nc} == -1) {
1577     !!!cp (143);
1578     !!!parse-error (type => 'unclosed comment');
1579     $self->{state} = DATA_STATE;
1580     ## reconsume
1581    
1582     !!!emit ($self->{ct}); # comment
1583    
1584     redo A;
1585     } else {
1586     !!!cp (144);
1587     $self->{ct}->{data} # comment
1588     .= '-' . chr ($self->{nc});
1589     $self->{state} = COMMENT_STATE;
1590     !!!next-input-character;
1591     redo A;
1592     }
1593     } elsif ($self->{state} == COMMENT_STATE) {
1594     if ($self->{nc} == 0x002D) { # -
1595     !!!cp (145);
1596     $self->{state} = COMMENT_END_DASH_STATE;
1597     !!!next-input-character;
1598     redo A;
1599     } elsif ($self->{nc} == -1) {
1600     !!!cp (146);
1601     !!!parse-error (type => 'unclosed comment');
1602     $self->{state} = DATA_STATE;
1603     ## reconsume
1604    
1605     !!!emit ($self->{ct}); # comment
1606    
1607     redo A;
1608     } else {
1609     !!!cp (147);
1610     $self->{ct}->{data} .= chr ($self->{nc}); # comment
1611     $self->{read_until}->($self->{ct}->{data},
1612     q[-],
1613     length $self->{ct}->{data});
1614    
1615     ## Stay in the state
1616     !!!next-input-character;
1617     redo A;
1618     }
1619     } elsif ($self->{state} == COMMENT_END_DASH_STATE) {
1620     if ($self->{nc} == 0x002D) { # -
1621     !!!cp (148);
1622     $self->{state} = COMMENT_END_STATE;
1623     !!!next-input-character;
1624     redo A;
1625     } elsif ($self->{nc} == -1) {
1626     !!!cp (149);
1627     !!!parse-error (type => 'unclosed comment');
1628     $self->{state} = DATA_STATE;
1629     ## reconsume
1630    
1631     !!!emit ($self->{ct}); # comment
1632    
1633     redo A;
1634     } else {
1635     !!!cp (150);
1636     $self->{ct}->{data} .= '-' . chr ($self->{nc}); # comment
1637     $self->{state} = COMMENT_STATE;
1638     !!!next-input-character;
1639     redo A;
1640     }
1641     } elsif ($self->{state} == COMMENT_END_STATE) {
1642     if ($self->{nc} == 0x003E) { # >
1643     !!!cp (151);
1644     $self->{state} = DATA_STATE;
1645     !!!next-input-character;
1646    
1647     !!!emit ($self->{ct}); # comment
1648    
1649     redo A;
1650     } elsif ($self->{nc} == 0x002D) { # -
1651     !!!cp (152);
1652     !!!parse-error (type => 'dash in comment',
1653     line => $self->{line_prev},
1654     column => $self->{column_prev});
1655     $self->{ct}->{data} .= '-'; # comment
1656     ## Stay in the state
1657     !!!next-input-character;
1658     redo A;
1659     } elsif ($self->{nc} == -1) {
1660     !!!cp (153);
1661     !!!parse-error (type => 'unclosed comment');
1662     $self->{state} = DATA_STATE;
1663     ## reconsume
1664    
1665     !!!emit ($self->{ct}); # comment
1666    
1667     redo A;
1668     } else {
1669     !!!cp (154);
1670     !!!parse-error (type => 'dash in comment',
1671     line => $self->{line_prev},
1672     column => $self->{column_prev});
1673     $self->{ct}->{data} .= '--' . chr ($self->{nc}); # comment
1674     $self->{state} = COMMENT_STATE;
1675     !!!next-input-character;
1676     redo A;
1677     }
1678     } elsif ($self->{state} == DOCTYPE_STATE) {
1679     if ($is_space->{$self->{nc}}) {
1680     !!!cp (155);
1681     $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
1682     !!!next-input-character;
1683     redo A;
1684     } else {
1685     !!!cp (156);
1686     !!!parse-error (type => 'no space before DOCTYPE name');
1687     $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
1688     ## reconsume
1689     redo A;
1690     }
1691     } elsif ($self->{state} == BEFORE_DOCTYPE_NAME_STATE) {
1692     if ($is_space->{$self->{nc}}) {
1693     !!!cp (157);
1694     ## Stay in the state
1695     !!!next-input-character;
1696     redo A;
1697     } elsif ($self->{nc} == 0x003E) { # >
1698     !!!cp (158);
1699     !!!parse-error (type => 'no DOCTYPE name');
1700     $self->{state} = DATA_STATE;
1701     !!!next-input-character;
1702    
1703     !!!emit ($self->{ct}); # DOCTYPE (quirks)
1704    
1705     redo A;
1706     } elsif ($self->{nc} == -1) {
1707     !!!cp (159);
1708     !!!parse-error (type => 'no DOCTYPE name');
1709     $self->{state} = DATA_STATE;
1710     ## reconsume
1711    
1712     !!!emit ($self->{ct}); # DOCTYPE (quirks)
1713    
1714     redo A;
1715     } else {
1716     !!!cp (160);
1717     $self->{ct}->{name} = chr $self->{nc};
1718     delete $self->{ct}->{quirks};
1719     $self->{state} = DOCTYPE_NAME_STATE;
1720     !!!next-input-character;
1721     redo A;
1722     }
1723     } elsif ($self->{state} == DOCTYPE_NAME_STATE) {
1724     ## ISSUE: Redundant "First," in the spec.
1725     if ($is_space->{$self->{nc}}) {
1726     !!!cp (161);
1727     $self->{state} = AFTER_DOCTYPE_NAME_STATE;
1728     !!!next-input-character;
1729     redo A;
1730     } elsif ($self->{nc} == 0x003E) { # >
1731     !!!cp (162);
1732     $self->{state} = DATA_STATE;
1733     !!!next-input-character;
1734    
1735     !!!emit ($self->{ct}); # DOCTYPE
1736    
1737     redo A;
1738     } elsif ($self->{nc} == -1) {
1739     !!!cp (163);
1740     !!!parse-error (type => 'unclosed DOCTYPE');
1741     $self->{state} = DATA_STATE;
1742     ## reconsume
1743    
1744     $self->{ct}->{quirks} = 1;
1745     !!!emit ($self->{ct}); # DOCTYPE
1746    
1747     redo A;
1748     } else {
1749     !!!cp (164);
1750     $self->{ct}->{name}
1751     .= chr ($self->{nc}); # DOCTYPE
1752     ## Stay in the state
1753     !!!next-input-character;
1754     redo A;
1755     }
1756     } elsif ($self->{state} == AFTER_DOCTYPE_NAME_STATE) {
1757     if ($is_space->{$self->{nc}}) {
1758     !!!cp (165);
1759     ## Stay in the state
1760     !!!next-input-character;
1761     redo A;
1762     } elsif ($self->{nc} == 0x003E) { # >
1763     !!!cp (166);
1764     $self->{state} = DATA_STATE;
1765     !!!next-input-character;
1766    
1767     !!!emit ($self->{ct}); # DOCTYPE
1768    
1769     redo A;
1770     } elsif ($self->{nc} == -1) {
1771     !!!cp (167);
1772     !!!parse-error (type => 'unclosed DOCTYPE');
1773     $self->{state} = DATA_STATE;
1774     ## reconsume
1775    
1776     $self->{ct}->{quirks} = 1;
1777     !!!emit ($self->{ct}); # DOCTYPE
1778    
1779     redo A;
1780     } elsif ($self->{nc} == 0x0050 or # P
1781     $self->{nc} == 0x0070) { # p
1782     $self->{state} = PUBLIC_STATE;
1783     $self->{s_kwd} = chr $self->{nc};
1784     !!!next-input-character;
1785     redo A;
1786     } elsif ($self->{nc} == 0x0053 or # S
1787     $self->{nc} == 0x0073) { # s
1788     $self->{state} = SYSTEM_STATE;
1789     $self->{s_kwd} = chr $self->{nc};
1790     !!!next-input-character;
1791     redo A;
1792     } else {
1793     !!!cp (180);
1794     !!!parse-error (type => 'string after DOCTYPE name');
1795     $self->{ct}->{quirks} = 1;
1796    
1797     $self->{state} = BOGUS_DOCTYPE_STATE;
1798     !!!next-input-character;
1799     redo A;
1800     }
1801     } elsif ($self->{state} == PUBLIC_STATE) {
1802     ## ASCII case-insensitive
1803     if ($self->{nc} == [
1804     undef,
1805     0x0055, # U
1806     0x0042, # B
1807     0x004C, # L
1808     0x0049, # I
1809     ]->[length $self->{s_kwd}] or
1810     $self->{nc} == [
1811     undef,
1812     0x0075, # u
1813     0x0062, # b
1814     0x006C, # l
1815     0x0069, # i
1816     ]->[length $self->{s_kwd}]) {
1817     !!!cp (175);
1818     ## Stay in the state.
1819     $self->{s_kwd} .= chr $self->{nc};
1820     !!!next-input-character;
1821     redo A;
1822     } elsif ((length $self->{s_kwd}) == 5 and
1823     ($self->{nc} == 0x0043 or # C
1824     $self->{nc} == 0x0063)) { # c
1825     !!!cp (168);
1826     $self->{state} = BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
1827     !!!next-input-character;
1828     redo A;
1829     } else {
1830     !!!cp (169);
1831     !!!parse-error (type => 'string after DOCTYPE name',
1832     line => $self->{line_prev},
1833     column => $self->{column_prev} + 1 - length $self->{s_kwd});
1834     $self->{ct}->{quirks} = 1;
1835    
1836     $self->{state} = BOGUS_DOCTYPE_STATE;
1837     ## Reconsume.
1838     redo A;
1839     }
1840     } elsif ($self->{state} == SYSTEM_STATE) {
1841     ## ASCII case-insensitive
1842     if ($self->{nc} == [
1843     undef,
1844     0x0059, # Y
1845     0x0053, # S
1846     0x0054, # T
1847     0x0045, # E
1848     ]->[length $self->{s_kwd}] or
1849     $self->{nc} == [
1850     undef,
1851     0x0079, # y
1852     0x0073, # s
1853     0x0074, # t
1854     0x0065, # e
1855     ]->[length $self->{s_kwd}]) {
1856     !!!cp (170);
1857     ## Stay in the state.
1858     $self->{s_kwd} .= chr $self->{nc};
1859     !!!next-input-character;
1860     redo A;
1861     } elsif ((length $self->{s_kwd}) == 5 and
1862     ($self->{nc} == 0x004D or # M
1863     $self->{nc} == 0x006D)) { # m
1864     !!!cp (171);
1865     $self->{state} = BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
1866     !!!next-input-character;
1867     redo A;
1868     } else {
1869     !!!cp (172);
1870     !!!parse-error (type => 'string after DOCTYPE name',
1871     line => $self->{line_prev},
1872     column => $self->{column_prev} + 1 - length $self->{s_kwd});
1873     $self->{ct}->{quirks} = 1;
1874    
1875     $self->{state} = BOGUS_DOCTYPE_STATE;
1876     ## Reconsume.
1877     redo A;
1878     }
1879     } elsif ($self->{state} == BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE) {
1880     if ($is_space->{$self->{nc}}) {
1881     !!!cp (181);
1882     ## Stay in the state
1883     !!!next-input-character;
1884     redo A;
1885     } elsif ($self->{nc} eq 0x0022) { # "
1886     !!!cp (182);
1887     $self->{ct}->{pubid} = ''; # DOCTYPE
1888     $self->{state} = DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE;
1889     !!!next-input-character;
1890     redo A;
1891     } elsif ($self->{nc} eq 0x0027) { # '
1892     !!!cp (183);
1893     $self->{ct}->{pubid} = ''; # DOCTYPE
1894     $self->{state} = DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE;
1895     !!!next-input-character;
1896     redo A;
1897     } elsif ($self->{nc} eq 0x003E) { # >
1898     !!!cp (184);
1899     !!!parse-error (type => 'no PUBLIC literal');
1900    
1901     $self->{state} = DATA_STATE;
1902     !!!next-input-character;
1903    
1904     $self->{ct}->{quirks} = 1;
1905     !!!emit ($self->{ct}); # DOCTYPE
1906    
1907     redo A;
1908     } elsif ($self->{nc} == -1) {
1909     !!!cp (185);
1910     !!!parse-error (type => 'unclosed DOCTYPE');
1911    
1912     $self->{state} = DATA_STATE;
1913     ## reconsume
1914    
1915     $self->{ct}->{quirks} = 1;
1916     !!!emit ($self->{ct}); # DOCTYPE
1917    
1918     redo A;
1919     } else {
1920     !!!cp (186);
1921     !!!parse-error (type => 'string after PUBLIC');
1922     $self->{ct}->{quirks} = 1;
1923    
1924     $self->{state} = BOGUS_DOCTYPE_STATE;
1925     !!!next-input-character;
1926     redo A;
1927     }
1928     } elsif ($self->{state} == DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE) {
1929     if ($self->{nc} == 0x0022) { # "
1930     !!!cp (187);
1931     $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
1932     !!!next-input-character;
1933     redo A;
1934     } elsif ($self->{nc} == 0x003E) { # >
1935     !!!cp (188);
1936     !!!parse-error (type => 'unclosed PUBLIC literal');
1937    
1938     $self->{state} = DATA_STATE;
1939     !!!next-input-character;
1940    
1941     $self->{ct}->{quirks} = 1;
1942     !!!emit ($self->{ct}); # DOCTYPE
1943    
1944     redo A;
1945     } elsif ($self->{nc} == -1) {
1946     !!!cp (189);
1947     !!!parse-error (type => 'unclosed PUBLIC literal');
1948    
1949     $self->{state} = DATA_STATE;
1950     ## reconsume
1951    
1952     $self->{ct}->{quirks} = 1;
1953     !!!emit ($self->{ct}); # DOCTYPE
1954    
1955     redo A;
1956     } else {
1957     !!!cp (190);
1958     $self->{ct}->{pubid} # DOCTYPE
1959     .= chr $self->{nc};
1960     $self->{read_until}->($self->{ct}->{pubid}, q[">],
1961     length $self->{ct}->{pubid});
1962    
1963     ## Stay in the state
1964     !!!next-input-character;
1965     redo A;
1966     }
1967     } elsif ($self->{state} == DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE) {
1968     if ($self->{nc} == 0x0027) { # '
1969     !!!cp (191);
1970     $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
1971     !!!next-input-character;
1972     redo A;
1973     } elsif ($self->{nc} == 0x003E) { # >
1974     !!!cp (192);
1975     !!!parse-error (type => 'unclosed PUBLIC literal');
1976    
1977     $self->{state} = DATA_STATE;
1978     !!!next-input-character;
1979    
1980     $self->{ct}->{quirks} = 1;
1981     !!!emit ($self->{ct}); # DOCTYPE
1982    
1983     redo A;
1984     } elsif ($self->{nc} == -1) {
1985     !!!cp (193);
1986     !!!parse-error (type => 'unclosed PUBLIC literal');
1987    
1988     $self->{state} = DATA_STATE;
1989     ## reconsume
1990    
1991     $self->{ct}->{quirks} = 1;
1992     !!!emit ($self->{ct}); # DOCTYPE
1993    
1994     redo A;
1995     } else {
1996     !!!cp (194);
1997     $self->{ct}->{pubid} # DOCTYPE
1998     .= chr $self->{nc};
1999     $self->{read_until}->($self->{ct}->{pubid}, q['>],
2000     length $self->{ct}->{pubid});
2001    
2002     ## Stay in the state
2003     !!!next-input-character;
2004     redo A;
2005     }
2006     } elsif ($self->{state} == AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE) {
2007     if ($is_space->{$self->{nc}}) {
2008     !!!cp (195);
2009     ## Stay in the state
2010     !!!next-input-character;
2011     redo A;
2012     } elsif ($self->{nc} == 0x0022) { # "
2013     !!!cp (196);
2014     $self->{ct}->{sysid} = ''; # DOCTYPE
2015     $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
2016     !!!next-input-character;
2017     redo A;
2018     } elsif ($self->{nc} == 0x0027) { # '
2019     !!!cp (197);
2020     $self->{ct}->{sysid} = ''; # DOCTYPE
2021     $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
2022     !!!next-input-character;
2023     redo A;
2024     } elsif ($self->{nc} == 0x003E) { # >
2025     !!!cp (198);
2026     $self->{state} = DATA_STATE;
2027     !!!next-input-character;
2028    
2029     !!!emit ($self->{ct}); # DOCTYPE
2030    
2031     redo A;
2032     } elsif ($self->{nc} == -1) {
2033     !!!cp (199);
2034     !!!parse-error (type => 'unclosed DOCTYPE');
2035    
2036     $self->{state} = DATA_STATE;
2037     ## reconsume
2038    
2039     $self->{ct}->{quirks} = 1;
2040     !!!emit ($self->{ct}); # DOCTYPE
2041    
2042     redo A;
2043     } else {
2044     !!!cp (200);
2045     !!!parse-error (type => 'string after PUBLIC literal');
2046     $self->{ct}->{quirks} = 1;
2047    
2048     $self->{state} = BOGUS_DOCTYPE_STATE;
2049     !!!next-input-character;
2050     redo A;
2051     }
2052     } elsif ($self->{state} == BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
2053     if ($is_space->{$self->{nc}}) {
2054     !!!cp (201);
2055     ## Stay in the state
2056     !!!next-input-character;
2057     redo A;
2058     } elsif ($self->{nc} == 0x0022) { # "
2059     !!!cp (202);
2060     $self->{ct}->{sysid} = ''; # DOCTYPE
2061     $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
2062     !!!next-input-character;
2063     redo A;
2064     } elsif ($self->{nc} == 0x0027) { # '
2065     !!!cp (203);
2066     $self->{ct}->{sysid} = ''; # DOCTYPE
2067     $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
2068     !!!next-input-character;
2069     redo A;
2070     } elsif ($self->{nc} == 0x003E) { # >
2071     !!!cp (204);
2072     !!!parse-error (type => 'no SYSTEM literal');
2073     $self->{state} = DATA_STATE;
2074     !!!next-input-character;
2075    
2076     $self->{ct}->{quirks} = 1;
2077     !!!emit ($self->{ct}); # DOCTYPE
2078    
2079     redo A;
2080     } elsif ($self->{nc} == -1) {
2081     !!!cp (205);
2082     !!!parse-error (type => 'unclosed DOCTYPE');
2083    
2084     $self->{state} = DATA_STATE;
2085     ## reconsume
2086    
2087     $self->{ct}->{quirks} = 1;
2088     !!!emit ($self->{ct}); # DOCTYPE
2089    
2090     redo A;
2091     } else {
2092     !!!cp (206);
2093     !!!parse-error (type => 'string after SYSTEM');
2094     $self->{ct}->{quirks} = 1;
2095    
2096     $self->{state} = BOGUS_DOCTYPE_STATE;
2097     !!!next-input-character;
2098     redo A;
2099     }
2100     } elsif ($self->{state} == DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE) {
2101     if ($self->{nc} == 0x0022) { # "
2102     !!!cp (207);
2103     $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
2104     !!!next-input-character;
2105     redo A;
2106     } elsif ($self->{nc} == 0x003E) { # >
2107     !!!cp (208);
2108     !!!parse-error (type => 'unclosed SYSTEM literal');
2109    
2110     $self->{state} = DATA_STATE;
2111     !!!next-input-character;
2112    
2113     $self->{ct}->{quirks} = 1;
2114     !!!emit ($self->{ct}); # DOCTYPE
2115    
2116     redo A;
2117     } elsif ($self->{nc} == -1) {
2118     !!!cp (209);
2119     !!!parse-error (type => 'unclosed SYSTEM literal');
2120    
2121     $self->{state} = DATA_STATE;
2122     ## reconsume
2123    
2124     $self->{ct}->{quirks} = 1;
2125     !!!emit ($self->{ct}); # DOCTYPE
2126    
2127     redo A;
2128     } else {
2129     !!!cp (210);
2130     $self->{ct}->{sysid} # DOCTYPE
2131     .= chr $self->{nc};
2132     $self->{read_until}->($self->{ct}->{sysid}, q[">],
2133     length $self->{ct}->{sysid});
2134    
2135     ## Stay in the state
2136     !!!next-input-character;
2137     redo A;
2138     }
2139     } elsif ($self->{state} == DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE) {
2140     if ($self->{nc} == 0x0027) { # '
2141     !!!cp (211);
2142     $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
2143     !!!next-input-character;
2144     redo A;
2145     } elsif ($self->{nc} == 0x003E) { # >
2146     !!!cp (212);
2147     !!!parse-error (type => 'unclosed SYSTEM literal');
2148    
2149     $self->{state} = DATA_STATE;
2150     !!!next-input-character;
2151    
2152     $self->{ct}->{quirks} = 1;
2153     !!!emit ($self->{ct}); # DOCTYPE
2154    
2155     redo A;
2156     } elsif ($self->{nc} == -1) {
2157     !!!cp (213);
2158     !!!parse-error (type => 'unclosed SYSTEM literal');
2159    
2160     $self->{state} = DATA_STATE;
2161     ## reconsume
2162    
2163     $self->{ct}->{quirks} = 1;
2164     !!!emit ($self->{ct}); # DOCTYPE
2165    
2166     redo A;
2167     } else {
2168     !!!cp (214);
2169     $self->{ct}->{sysid} # DOCTYPE
2170     .= chr $self->{nc};
2171     $self->{read_until}->($self->{ct}->{sysid}, q['>],
2172     length $self->{ct}->{sysid});
2173    
2174     ## Stay in the state
2175     !!!next-input-character;
2176     redo A;
2177     }
2178     } elsif ($self->{state} == AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
2179     if ($is_space->{$self->{nc}}) {
2180     !!!cp (215);
2181     ## Stay in the state
2182     !!!next-input-character;
2183     redo A;
2184     } elsif ($self->{nc} == 0x003E) { # >
2185     !!!cp (216);
2186     $self->{state} = DATA_STATE;
2187     !!!next-input-character;
2188    
2189     !!!emit ($self->{ct}); # DOCTYPE
2190    
2191     redo A;
2192     } elsif ($self->{nc} == -1) {
2193     !!!cp (217);
2194     !!!parse-error (type => 'unclosed DOCTYPE');
2195     $self->{state} = DATA_STATE;
2196     ## reconsume
2197    
2198     $self->{ct}->{quirks} = 1;
2199     !!!emit ($self->{ct}); # DOCTYPE
2200    
2201     redo A;
2202     } else {
2203     !!!cp (218);
2204     !!!parse-error (type => 'string after SYSTEM literal');
2205     #$self->{ct}->{quirks} = 1;
2206    
2207     $self->{state} = BOGUS_DOCTYPE_STATE;
2208     !!!next-input-character;
2209     redo A;
2210     }
2211     } elsif ($self->{state} == BOGUS_DOCTYPE_STATE) {
2212     if ($self->{nc} == 0x003E) { # >
2213     !!!cp (219);
2214     $self->{state} = DATA_STATE;
2215     !!!next-input-character;
2216    
2217     !!!emit ($self->{ct}); # DOCTYPE
2218    
2219     redo A;
2220     } elsif ($self->{nc} == -1) {
2221     !!!cp (220);
2222     $self->{state} = DATA_STATE;
2223     ## reconsume
2224    
2225     !!!emit ($self->{ct}); # DOCTYPE
2226    
2227     redo A;
2228     } else {
2229     !!!cp (221);
2230     my $s = '';
2231     $self->{read_until}->($s, q[>], 0);
2232    
2233     ## Stay in the state
2234     !!!next-input-character;
2235     redo A;
2236     }
2237     } elsif ($self->{state} == CDATA_SECTION_STATE) {
2238     ## NOTE: "CDATA section state" in the spec is jointly implemented
2239     ## by three states, |CDATA_SECTION_STATE|, |CDATA_SECTION_MSE1_STATE|,
2240     ## and |CDATA_SECTION_MSE2_STATE|.
2241    
2242     if ($self->{nc} == 0x005D) { # ]
2243     !!!cp (221.1);
2244     $self->{state} = CDATA_SECTION_MSE1_STATE;
2245     !!!next-input-character;
2246     redo A;
2247     } elsif ($self->{nc} == -1) {
2248     $self->{state} = DATA_STATE;
2249     !!!next-input-character;
2250     if (length $self->{ct}->{data}) { # character
2251     !!!cp (221.2);
2252     !!!emit ($self->{ct}); # character
2253     } else {
2254     !!!cp (221.3);
2255     ## No token to emit. $self->{ct} is discarded.
2256     }
2257     redo A;
2258     } else {
2259     !!!cp (221.4);
2260     $self->{ct}->{data} .= chr $self->{nc};
2261     $self->{read_until}->($self->{ct}->{data},
2262     q<]>,
2263     length $self->{ct}->{data});
2264    
2265     ## Stay in the state.
2266     !!!next-input-character;
2267     redo A;
2268     }
2269    
2270     ## ISSUE: "text tokens" in spec.
2271     } elsif ($self->{state} == CDATA_SECTION_MSE1_STATE) {
2272     if ($self->{nc} == 0x005D) { # ]
2273     !!!cp (221.5);
2274     $self->{state} = CDATA_SECTION_MSE2_STATE;
2275     !!!next-input-character;
2276     redo A;
2277     } else {
2278     !!!cp (221.6);
2279     $self->{ct}->{data} .= ']';
2280     $self->{state} = CDATA_SECTION_STATE;
2281     ## Reconsume.
2282     redo A;
2283     }
2284     } elsif ($self->{state} == CDATA_SECTION_MSE2_STATE) {
2285     if ($self->{nc} == 0x003E) { # >
2286     $self->{state} = DATA_STATE;
2287     !!!next-input-character;
2288     if (length $self->{ct}->{data}) { # character
2289     !!!cp (221.7);
2290     !!!emit ($self->{ct}); # character
2291     } else {
2292     !!!cp (221.8);
2293     ## No token to emit. $self->{ct} is discarded.
2294     }
2295     redo A;
2296     } elsif ($self->{nc} == 0x005D) { # ]
2297     !!!cp (221.9); # character
2298     $self->{ct}->{data} .= ']'; ## Add first "]" of "]]]".
2299     ## Stay in the state.
2300     !!!next-input-character;
2301     redo A;
2302     } else {
2303     !!!cp (221.11);
2304     $self->{ct}->{data} .= ']]'; # character
2305     $self->{state} = CDATA_SECTION_STATE;
2306     ## Reconsume.
2307     redo A;
2308     }
2309     } elsif ($self->{state} == ENTITY_STATE) {
2310     if ($is_space->{$self->{nc}} or
2311     {
2312     0x003C => 1, 0x0026 => 1, -1 => 1, # <, &
2313     $self->{entity_add} => 1,
2314     }->{$self->{nc}}) {
2315     !!!cp (1001);
2316     ## Don't consume
2317     ## No error
2318     ## Return nothing.
2319     #
2320     } elsif ($self->{nc} == 0x0023) { # #
2321     !!!cp (999);
2322     $self->{state} = ENTITY_HASH_STATE;
2323     $self->{s_kwd} = '#';
2324     !!!next-input-character;
2325     redo A;
2326     } elsif ((0x0041 <= $self->{nc} and
2327     $self->{nc} <= 0x005A) or # A..Z
2328     (0x0061 <= $self->{nc} and
2329     $self->{nc} <= 0x007A)) { # a..z
2330     !!!cp (998);
2331     require Whatpm::_NamedEntityList;
2332     $self->{state} = ENTITY_NAME_STATE;
2333     $self->{s_kwd} = chr $self->{nc};
2334     $self->{entity__value} = $self->{s_kwd};
2335     $self->{entity__match} = 0;
2336     !!!next-input-character;
2337     redo A;
2338     } else {
2339     !!!cp (1027);
2340     !!!parse-error (type => 'bare ero');
2341     ## Return nothing.
2342     #
2343     }
2344    
2345     ## NOTE: No character is consumed by the "consume a character
2346     ## reference" algorithm. In other words, there is an "&" character
2347     ## that does not introduce a character reference, which would be
2348     ## appended to the parent element or the attribute value in later
2349     ## process of the tokenizer.
2350    
2351     if ($self->{prev_state} == DATA_STATE) {
2352     !!!cp (997);
2353     $self->{state} = $self->{prev_state};
2354     ## Reconsume.
2355     !!!emit ({type => CHARACTER_TOKEN, data => '&',
2356     line => $self->{line_prev},
2357     column => $self->{column_prev},
2358     });
2359     redo A;
2360     } else {
2361     !!!cp (996);
2362     $self->{ca}->{value} .= '&';
2363     $self->{state} = $self->{prev_state};
2364     ## Reconsume.
2365     redo A;
2366     }
2367     } elsif ($self->{state} == ENTITY_HASH_STATE) {
2368     if ($self->{nc} == 0x0078 or # x
2369     $self->{nc} == 0x0058) { # X
2370     !!!cp (995);
2371     $self->{state} = HEXREF_X_STATE;
2372     $self->{s_kwd} .= chr $self->{nc};
2373     !!!next-input-character;
2374     redo A;
2375     } elsif (0x0030 <= $self->{nc} and
2376     $self->{nc} <= 0x0039) { # 0..9
2377     !!!cp (994);
2378     $self->{state} = NCR_NUM_STATE;
2379     $self->{s_kwd} = $self->{nc} - 0x0030;
2380     !!!next-input-character;
2381     redo A;
2382     } else {
2383     !!!parse-error (type => 'bare nero',
2384     line => $self->{line_prev},
2385     column => $self->{column_prev} - 1);
2386    
2387     ## NOTE: According to the spec algorithm, nothing is returned,
2388     ## and then "&#" is appended to the parent element or the attribute
2389     ## value in the later processing.
2390    
2391     if ($self->{prev_state} == DATA_STATE) {
2392     !!!cp (1019);
2393     $self->{state} = $self->{prev_state};
2394     ## Reconsume.
2395     !!!emit ({type => CHARACTER_TOKEN,
2396     data => '&#',
2397     line => $self->{line_prev},
2398     column => $self->{column_prev} - 1,
2399     });
2400     redo A;
2401     } else {
2402     !!!cp (993);
2403     $self->{ca}->{value} .= '&#';
2404     $self->{state} = $self->{prev_state};
2405     ## Reconsume.
2406     redo A;
2407     }
2408     }
2409     } elsif ($self->{state} == NCR_NUM_STATE) {
2410     if (0x0030 <= $self->{nc} and
2411     $self->{nc} <= 0x0039) { # 0..9
2412     !!!cp (1012);
2413     $self->{s_kwd} *= 10;
2414     $self->{s_kwd} += $self->{nc} - 0x0030;
2415    
2416     ## Stay in the state.
2417     !!!next-input-character;
2418     redo A;
2419     } elsif ($self->{nc} == 0x003B) { # ;
2420     !!!cp (1013);
2421     !!!next-input-character;
2422     #
2423     } else {
2424     !!!cp (1014);
2425     !!!parse-error (type => 'no refc');
2426     ## Reconsume.
2427     #
2428     }
2429    
2430     my $code = $self->{s_kwd};
2431     my $l = $self->{line_prev};
2432     my $c = $self->{column_prev};
2433     if ($charref_map->{$code}) {
2434     !!!cp (1015);
2435     !!!parse-error (type => 'invalid character reference',
2436     text => (sprintf 'U+%04X', $code),
2437     line => $l, column => $c);
2438     $code = $charref_map->{$code};
2439     } elsif ($code > 0x10FFFF) {
2440     !!!cp (1016);
2441     !!!parse-error (type => 'invalid character reference',
2442     text => (sprintf 'U-%08X', $code),
2443     line => $l, column => $c);
2444     $code = 0xFFFD;
2445     }
2446    
2447     if ($self->{prev_state} == DATA_STATE) {
2448     !!!cp (992);
2449     $self->{state} = $self->{prev_state};
2450     ## Reconsume.
2451     !!!emit ({type => CHARACTER_TOKEN, data => chr $code,
2452     line => $l, column => $c,
2453     });
2454     redo A;
2455     } else {
2456     !!!cp (991);
2457     $self->{ca}->{value} .= chr $code;
2458     $self->{ca}->{has_reference} = 1;
2459     $self->{state} = $self->{prev_state};
2460     ## Reconsume.
2461     redo A;
2462     }
2463     } elsif ($self->{state} == HEXREF_X_STATE) {
2464     if ((0x0030 <= $self->{nc} and $self->{nc} <= 0x0039) or
2465     (0x0041 <= $self->{nc} and $self->{nc} <= 0x0046) or
2466     (0x0061 <= $self->{nc} and $self->{nc} <= 0x0066)) {
2467     # 0..9, A..F, a..f
2468     !!!cp (990);
2469     $self->{state} = HEXREF_HEX_STATE;
2470     $self->{s_kwd} = 0;
2471     ## Reconsume.
2472     redo A;
2473     } else {
2474     !!!parse-error (type => 'bare hcro',
2475     line => $self->{line_prev},
2476     column => $self->{column_prev} - 2);
2477    
2478     ## NOTE: According to the spec algorithm, nothing is returned,
2479     ## and then "&#" followed by "X" or "x" is appended to the parent
2480     ## element or the attribute value in the later processing.
2481    
2482     if ($self->{prev_state} == DATA_STATE) {
2483     !!!cp (1005);
2484     $self->{state} = $self->{prev_state};
2485     ## Reconsume.
2486     !!!emit ({type => CHARACTER_TOKEN,
2487     data => '&' . $self->{s_kwd},
2488     line => $self->{line_prev},
2489     column => $self->{column_prev} - length $self->{s_kwd},
2490     });
2491     redo A;
2492     } else {
2493     !!!cp (989);
2494     $self->{ca}->{value} .= '&' . $self->{s_kwd};
2495     $self->{state} = $self->{prev_state};
2496     ## Reconsume.
2497     redo A;
2498     }
2499     }
2500     } elsif ($self->{state} == HEXREF_HEX_STATE) {
2501     if (0x0030 <= $self->{nc} and $self->{nc} <= 0x0039) {
2502     # 0..9
2503     !!!cp (1002);
2504     $self->{s_kwd} *= 0x10;
2505     $self->{s_kwd} += $self->{nc} - 0x0030;
2506     ## Stay in the state.
2507     !!!next-input-character;
2508     redo A;
2509     } elsif (0x0061 <= $self->{nc} and
2510     $self->{nc} <= 0x0066) { # a..f
2511     !!!cp (1003);
2512     $self->{s_kwd} *= 0x10;
2513     $self->{s_kwd} += $self->{nc} - 0x0060 + 9;
2514     ## Stay in the state.
2515     !!!next-input-character;
2516     redo A;
2517     } elsif (0x0041 <= $self->{nc} and
2518     $self->{nc} <= 0x0046) { # A..F
2519     !!!cp (1004);
2520     $self->{s_kwd} *= 0x10;
2521     $self->{s_kwd} += $self->{nc} - 0x0040 + 9;
2522     ## Stay in the state.
2523     !!!next-input-character;
2524     redo A;
2525     } elsif ($self->{nc} == 0x003B) { # ;
2526     !!!cp (1006);
2527     !!!next-input-character;
2528     #
2529     } else {
2530     !!!cp (1007);
2531     !!!parse-error (type => 'no refc',
2532     line => $self->{line},
2533     column => $self->{column});
2534     ## Reconsume.
2535     #
2536     }
2537    
2538     my $code = $self->{s_kwd};
2539     my $l = $self->{line_prev};
2540     my $c = $self->{column_prev};
2541     if ($charref_map->{$code}) {
2542     !!!cp (1008);
2543     !!!parse-error (type => 'invalid character reference',
2544     text => (sprintf 'U+%04X', $code),
2545     line => $l, column => $c);
2546     $code = $charref_map->{$code};
2547     } elsif ($code > 0x10FFFF) {
2548     !!!cp (1009);
2549     !!!parse-error (type => 'invalid character reference',
2550     text => (sprintf 'U-%08X', $code),
2551     line => $l, column => $c);
2552     $code = 0xFFFD;
2553     }
2554    
2555     if ($self->{prev_state} == DATA_STATE) {
2556     !!!cp (988);
2557     $self->{state} = $self->{prev_state};
2558     ## Reconsume.
2559     !!!emit ({type => CHARACTER_TOKEN, data => chr $code,
2560     line => $l, column => $c,
2561     });
2562     redo A;
2563     } else {
2564     !!!cp (987);
2565     $self->{ca}->{value} .= chr $code;
2566     $self->{ca}->{has_reference} = 1;
2567     $self->{state} = $self->{prev_state};
2568     ## Reconsume.
2569     redo A;
2570     }
2571     } elsif ($self->{state} == ENTITY_NAME_STATE) {
2572     if (length $self->{s_kwd} < 30 and
2573     ## NOTE: Some number greater than the maximum length of entity name
2574     ((0x0041 <= $self->{nc} and # a
2575     $self->{nc} <= 0x005A) or # x
2576     (0x0061 <= $self->{nc} and # a
2577     $self->{nc} <= 0x007A) or # z
2578     (0x0030 <= $self->{nc} and # 0
2579     $self->{nc} <= 0x0039) or # 9
2580     $self->{nc} == 0x003B)) { # ;
2581     our $EntityChar;
2582     $self->{s_kwd} .= chr $self->{nc};
2583     if (defined $EntityChar->{$self->{s_kwd}}) {
2584     if ($self->{nc} == 0x003B) { # ;
2585     !!!cp (1020);
2586     $self->{entity__value} = $EntityChar->{$self->{s_kwd}};
2587     $self->{entity__match} = 1;
2588     !!!next-input-character;
2589     #
2590     } else {
2591     !!!cp (1021);
2592     $self->{entity__value} = $EntityChar->{$self->{s_kwd}};
2593     $self->{entity__match} = -1;
2594     ## Stay in the state.
2595     !!!next-input-character;
2596     redo A;
2597     }
2598     } else {
2599     !!!cp (1022);
2600     $self->{entity__value} .= chr $self->{nc};
2601     $self->{entity__match} *= 2;
2602     ## Stay in the state.
2603     !!!next-input-character;
2604     redo A;
2605     }
2606     }
2607    
2608     my $data;
2609     my $has_ref;
2610     if ($self->{entity__match} > 0) {
2611     !!!cp (1023);
2612     $data = $self->{entity__value};
2613     $has_ref = 1;
2614     #
2615     } elsif ($self->{entity__match} < 0) {
2616     !!!parse-error (type => 'no refc');
2617     if ($self->{prev_state} != DATA_STATE and # in attribute
2618     $self->{entity__match} < -1) {
2619     !!!cp (1024);
2620     $data = '&' . $self->{s_kwd};
2621     #
2622     } else {
2623     !!!cp (1025);
2624     $data = $self->{entity__value};
2625     $has_ref = 1;
2626     #
2627     }
2628     } else {
2629     !!!cp (1026);
2630     !!!parse-error (type => 'bare ero',
2631     line => $self->{line_prev},
2632     column => $self->{column_prev} - length $self->{s_kwd});
2633     $data = '&' . $self->{s_kwd};
2634     #
2635     }
2636    
2637     ## NOTE: In these cases, when a character reference is found,
2638     ## it is consumed and a character token is returned, or, otherwise,
2639     ## nothing is consumed and returned, according to the spec algorithm.
2640     ## In this implementation, anything that has been examined by the
2641     ## tokenizer is appended to the parent element or the attribute value
2642     ## as string, either literal string when no character reference or
2643     ## entity-replaced string otherwise, in this stage, since any characters
2644     ## that would not be consumed are appended in the data state or in an
2645     ## appropriate attribute value state anyway.
2646    
2647     if ($self->{prev_state} == DATA_STATE) {
2648     !!!cp (986);
2649     $self->{state} = $self->{prev_state};
2650     ## Reconsume.
2651     !!!emit ({type => CHARACTER_TOKEN,
2652     data => $data,
2653     line => $self->{line_prev},
2654     column => $self->{column_prev} + 1 - length $self->{s_kwd},
2655     });
2656     redo A;
2657     } else {
2658     !!!cp (985);
2659     $self->{ca}->{value} .= $data;
2660     $self->{ca}->{has_reference} = 1 if $has_ref;
2661     $self->{state} = $self->{prev_state};
2662     ## Reconsume.
2663     redo A;
2664     }
2665     } else {
2666     die "$0: $self->{state}: Unknown state";
2667     }
2668     } # A
2669    
2670     die "$0: _get_next_token: unexpected case";
2671     } # _get_next_token
2672    
2673     1;
2674 wakaba 1.4 ## $Date: 2008/10/14 05:34:05 $

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24