/[suikacvs]/markup/html/whatpm/What/HTML.pm.src
Suika

Contents of /markup/html/whatpm/What/HTML.pm.src

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.1 - (hide annotations) (download) (as text)
Sat Apr 28 14:29:01 2007 UTC (17 years, 7 months ago) by wakaba
Branch: MAIN
File MIME type: application/x-wais-source
++ whatpm/What/ChangeLog	28 Apr 2007 14:28:15 -0000
2007-04-28  Wakaba  <wakaba@suika.fam.cx>

	* HTML.pm.src, HTML-consume-entity.src: New files.

	* Makefile (HTML.pm): New rule.

	* mkhtmlparser.pl: New script.

1 wakaba 1.1 package What::HTML;
2     use strict;
3     our $VERSION=do{my @r=(q$Revision: 1.4 $=~/\d+/g);sprintf "%d."."%02d" x $#r,@r};
4    
5     ## This is a very, very early version of an HTML parser.
6    
7     my $permitted_slash_tag_name = {
8     base => 1,
9     link => 1,
10     meta => 1,
11     hr => 1,
12     br => 1,
13     img=> 1,
14     embed => 1,
15     param => 1,
16     area => 1,
17     col => 1,
18     input => 1,
19     };
20    
21     sub new ($) {
22     my $class = shift;
23     my $self = bless {}, $class;
24     $self->{set_next_input_character} = sub {
25     $self->{next_input_character} = -1;
26     };
27     $self->{parse_error} = sub {
28     #
29     };
30     return $self;
31     } # new
32    
33     ## Implementations MUST act as if state machine in the spec
34    
35     sub _initialize_tokenizer ($) {
36     my $self = shift;
37     $self->{state} = 'data'; # MUST
38     $self->{content_model_flag} = 'PCDATA'; # be
39     undef $self->{current_token}; # start tag, end tag, comment, or DOCTYPE
40     undef $self->{current_attribute};
41     undef $self->{last_emitted_start_tag_name};
42     undef $self->{last_attribute_value_state};
43     $self->{char} = [];
44     # $self->{next_input_character}
45     !!!next-input-character;
46     $self->{token} = [];
47     } # _initialize_tokenizer
48    
49     ## A token has:
50     ## ->{type} eq 'DOCTYPE', 'start tag', 'end tag', 'comment',
51     ## 'character', or 'end-of-file'
52     ## ->{name} (DOCTYPE, start tag (tagname), end tag (tagname))
53     ## ISSUE: the spec need s/tagname/tag name/
54     ## ->{error} == 1 or 0 (DOCTYPE)
55     ## ->{attributes} isa HASH (start tag, end tag)
56     ## ->{data} (comment, character)
57    
58     ## Macros
59     ## Macros MUST be preceded by three EXCLAMATION MARKs.
60     ## emit ($token)
61     ## Emits the specified token.
62    
63     ## Emitted token MUST immediately be handled by the tree construction state.
64    
65     ## Before each step, UA MAY check to see if either one of the scripts in
66     ## "list of scripts that will execute as soon as possible" or the first
67     ## script in the "list of scripts that will execute asynchronously",
68     ## has completed loading. If one has, then it MUST be executed
69     ## and removed from the list.
70    
71     sub _get_next_token ($) {
72     my $self = shift;
73     if (@{$self->{token}}) {
74     return shift @{$self->{token}};
75     }
76    
77     A: {
78     if ($self->{state} eq 'data') {
79     if ($self->{next_input_character} == 0x0026) { # &
80     if ($self->{content_model_flag} eq 'PCDATA' or
81     $self->{content_model_flag} eq 'RCDATA') {
82     $self->{state} = 'entity data';
83     !!!next-input-character;
84     redo A;
85     } else {
86     #
87     }
88     } elsif ($self->{next_input_character} == 0x003C) { # <
89     if ($self->{content_model_flag} ne 'PLAINTEXT') {
90     $self->{state} = 'tag open';
91     !!!next-input-character;
92     redo A;
93     } else {
94     #
95     }
96     } elsif ($self->{next_input_character} == -1) {
97     !!!emit ({type => 'end-of-file'});
98     last A; ## TODO: ok?
99     }
100     # Anything else
101     my $token = {type => 'character',
102     data => chr $self->{next_input_character}};
103     ## Stay in the data state
104     !!!next-input-character;
105    
106     !!!emit ($token);
107    
108     redo A;
109     } elsif ($self->{state} eq 'entity data') {
110     ## (cannot happen in CDATA state)
111    
112     my $token = $self->_tokenize_attempt_to_consume_an_entity;
113    
114     $self->{state} = 'data';
115     # next-input-character is already done
116    
117     unless (defined $token) {
118     !!!emit ({type => 'character', data => '&'});
119     } else {
120     !!!emit ($token);
121     }
122    
123     redo A;
124     } elsif ($self->{state} eq 'tag open') {
125     if ($self->{content_model_flag} eq 'RCDATA' or
126     $self->{content_model_flag} eq 'CDATA') {
127     if ($self->{next_input_character} == 0x002F) { # /
128     !!!next-input-character;
129     $self->{state} = 'close tag open';
130     redo A;
131     } else {
132     ## reconsume
133     $self->{state} = 'data';
134    
135     !!!emit (type => 'character', data => {'/'});
136    
137     redo A;
138     }
139     } elsif ($self->{content_model_flag} eq 'PCDATA') {
140     if ($self->{next_input_character} == 0x0021) { # !
141     $self->{state} = 'markup declaration open';
142     !!!next-input-character;
143     redo A;
144     } elsif ($self->{next_input_character} == 0x002F) { # /
145     $self->{state} = 'close tag open';
146     !!!next-input-character;
147     redo A;
148     } elsif (0x0041 <= $self->{next_input_character} and
149     $self->{next_input_character} <= 0x005A) { # A..Z
150     $self->{current_token}
151     = {type => 'start tag',
152     tag_name => chr ($self->{next_input_character} + 0x0020)};
153     $self->{state} = 'tag name';
154     !!!next-input-character;
155     redo A;
156     } elsif (0x0061 <= $self->{next_input_character} and
157     $self->{next_input_character} <= 0x007A) { # a..z
158     $self->{current_token} = {type => 'start tag',
159     tag_name => chr ($self->{next_input_character})};
160     $self->{state} = 'tag name';
161     !!!next-input-character;
162     redo A;
163     } elsif ($self->{next_input_character} == 0x003E) { # >
164     !!!parse-error;
165     $self->{state} = 'data';
166     !!!next-input-character;
167    
168     !!!emit ({type => 'character', data => '>'});
169    
170     redo A;
171     } elsif ($self->{next_input_character} == 0x003F) { # ?
172     !!!parse-error;
173     $self->{state} = 'bogus comment';
174     ## $self->{next_input_character} is intentionally left as is
175     redo A;
176     } else {
177     !!!parse-error;
178     $self->{state} = 'data';
179     ## reconsume
180    
181     !!!emit ({type => 'character', data => '<'});
182    
183     redo A;
184     }
185     } else {
186     die "$0: $self->{content_model_flag}: Unknown content model flag";
187     }
188     } elsif ($self->{state} eq 'close tag open') {
189     if ($self->{content_model_flag} eq 'RCDATA' or
190     $self->{content_model_flag} eq 'CDATA') {
191     my @next_char;
192     TAGNAME: for (my $i = 0; $i < length $self->{last_emitted_start_tag_name}; $i++) {
193     push @next_char, $self->{next_input_character};
194     my $c = ord substr ($self->{last_emitted_start_tag_name}, $i, 1);
195     my $C = 0x0061 <= $c && $c <= 0x007A ? $c - 0x0020 : $c;
196     if ($self->{next_input_character} == $c or $self->{next_input_character} == $C) {
197     !!!next-input-character;
198     next TAGNAME;
199     } else {
200     !!!parse-error;
201     $self->{next_input_character} = shift @next_char; # reconsume
202     !!!back-next-input-character (@next_char);
203     $self->{state} = 'data';
204    
205     !!!emit ({type => 'character', data => '</'});
206    
207     redo A;
208     }
209     }
210    
211     unless ($self->{next_input_character} == 0x0009 or
212     $self->{next_input_character} == 0x000A or
213     $self->{next_input_character} == 0x000B or
214     $self->{next_input_character} == 0x000C or
215     $self->{next_input_character} == 0x0020 or
216     $self->{next_input_character} == 0x003E or
217     $self->{next_input_character} == 0x002F or
218     $self->{next_input_character} == 0x003C or
219     $self->{next_input_character} == -1) {
220     !!!parse-error;
221     $self->{next_input_character} = shift @next_char; # reconsume
222     !!!back-next-input-character (@next_char);
223     $self->{state} = 'data';
224    
225     !!!emit ({type => 'character', data => '</'});
226    
227     redo A;
228     } else {
229     $self->{next_input_character} = shift @next_char;
230     !!!back-next-input-character (@next_char);
231     # and consume...
232     }
233     }
234    
235     if (0x0041 <= $self->{next_input_character} and
236     $self->{next_input_character} <= 0x005A) { # A..Z
237     $self->{current_token} = {type => 'end tag',
238     tag_name => chr ($self->{next_input_character} + 0x0020)};
239     $self->{state} = 'tag name';
240     !!!next-input-character;
241     redo A;
242     } elsif (0x0061 <= $self->{next_input_character} and
243     $self->{next_input_character} <= 0x007A) { # a..z
244     $self->{current_token} = {type => 'end tag',
245     tag_name => chr ($self->{next_input_character})};
246     $self->{state} = 'tag name';
247     !!!next-input-character;
248     redo A;
249     } elsif ($self->{next_input_character} == 0x003E) { # >
250     !!!parse-error;
251     $self->{state} = 'data';
252     !!!next-input-character;
253     redo A;
254     } elsif ($self->{next_input_character} == -1) {
255     !!!parse-error;
256     $self->{state} = 'data';
257     # reconsume
258    
259     !!!emit ({type => 'character', data => '</'});
260    
261     redo A;
262     } else {
263     !!!parse-error;
264     $self->{state} = 'bogus comment';
265     ## $self->{next_input_character} is intentionally left as is
266     redo A;
267     }
268     } elsif ($self->{state} eq 'tag name') {
269     if ($self->{next_input_character} == 0x0009 or # HT
270     $self->{next_input_character} == 0x000A or # LF
271     $self->{next_input_character} == 0x000B or # VT
272     $self->{next_input_character} == 0x000C or # FF
273     $self->{next_input_character} == 0x0020) { # SP
274     $self->{state} = 'before attribute name';
275     !!!next-input-character;
276     redo A;
277     } elsif ($self->{next_input_character} == 0x003E) { # >
278     if ($self->{current_token}->{type} eq 'start tag') {
279     $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
280     } elsif ($self->{current_token}->{type} eq 'end tag') {
281     $self->{content_model_flag} = 'PCDATA'; # MUST
282     if ($self->{current_token}->{attribute}) {
283     !!!parse-error;
284     }
285     } else {
286     die "$0: $self->{current_token}->{type}: Unknown token type";
287     }
288     $self->{state} = 'data';
289     !!!next-input-character;
290    
291     !!!emit ($self->{current_token}); # start tag or end tag
292     undef $self->{current_token};
293    
294     redo A;
295     } elsif (0x0041 <= $self->{next_input_character} and
296     $self->{next_input_character} <= 0x005A) { # A..Z
297     $self->{current_token}->{tag_name} .= chr ($self->{next_input_character} + 0x0020);
298     # start tag or end tag
299     ## Stay in this state
300     !!!next-input-character;
301     redo A;
302     } elsif ($self->{next_input_character} == 0x003C or # <
303     $self->{next_input_character} == -1) {
304     !!!parse-error;
305     if ($self->{current_token}->{type} eq 'start tag') {
306     $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
307     } elsif ($self->{current_token}->{type} eq 'end tag') {
308     $self->{content_model_flag} = 'PCDATA'; # MUST
309     if ($self->{current_token}->{attribute}) {
310     !!!parse-error;
311     }
312     } else {
313     die "$0: $self->{current_token}->{type}: Unknown token type";
314     }
315     $self->{state} = 'data';
316     # reconsume
317    
318     !!!emit ($self->{current_token}); # start tag or end tag
319     undef $self->{current_token};
320    
321     redo A;
322     } elsif ($self->{next_input_character} == 0x002F) { # /
323     !!!next-input-character;
324     if ($self->{next_input_character} == 0x003E and # >
325     $self->{current_token}->{type} eq 'start tag' and
326     $permitted_slash_tag_name->{$self->{current_token}->{tag_name}}) {
327     # permitted slash
328     #
329     } else {
330     !!!parse-error;
331     }
332     $self->{state} = 'before attribute name';
333     # next-input-character is already done
334     redo A;
335     } else {
336     $self->{current_token}->{tag_name} .= chr $self->{next_input_character};
337     # start tag or end tag
338     ## Stay in the state
339     !!!next-input-character;
340     redo A;
341     }
342     } elsif ($self->{state} eq 'before attribute name') {
343     if ($self->{next_input_character} == 0x0009 or # HT
344     $self->{next_input_character} == 0x000A or # LF
345     $self->{next_input_character} == 0x000B or # VT
346     $self->{next_input_character} == 0x000C or # FF
347     $self->{next_input_character} == 0x0020) { # SP
348     ## Stay in the state
349     !!!next-input-character;
350     redo A;
351     } elsif ($self->{next_input_character} == 0x003E) { # >
352     if ($self->{current_token}->{type} eq 'start tag') {
353     $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
354     } elsif ($self->{current_token}->{type} eq 'end tag') {
355     $self->{content_model_flag} = 'PCDATA'; # MUST
356     if ($self->{current_token}->{attribute}) {
357     !!!parse-error;
358     }
359     } else {
360     die "$0: $self->{current_token}->{type}: Unknown token type";
361     }
362     $self->{state} = 'data';
363     !!!next-input-character;
364    
365     !!!emit ($self->{current_token}); # start tag or end tag
366     undef $self->{current_token};
367    
368     redo A;
369     } elsif (0x0041 <= $self->{next_input_character} and
370     $self->{next_input_character} <= 0x005A) { # A..Z
371     $self->{current_attribute} = {name => chr ($self->{next_input_character} + 0x0020),
372     value => ''};
373     $self->{state} = 'attribute name';
374     !!!next-input-character;
375     redo A;
376     } elsif ($self->{next_input_character} == 0x002F) { # /
377     !!!next-input-character;
378     if ($self->{next_input_character} == 0x003E and # >
379     $self->{current_token}->{type} eq 'start tag' and
380     $permitted_slash_tag_name->{$self->{current_token}->{tag_name}}) {
381     # permitted slash
382     #
383     } else {
384     !!!parse-error;
385     }
386     ## Stay in the state
387     # next-input-character is already done
388     redo A;
389     } elsif ($self->{next_input_character} == 0x003C or # <
390     $self->{next_input_character} == -1) {
391     !!!parse-error;
392     if ($self->{current_token}->{type} eq 'start tag') {
393     $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
394     } elsif ($self->{current_token}->{type} eq 'end tag') {
395     $self->{content_model_flag} = 'PCDATA'; # MUST
396     if ($self->{current_token}->{attribute}) {
397     !!!parse-error;
398     }
399     } else {
400     die "$0: $self->{current_token}->{type}: Unknown token type";
401     }
402     $self->{state} = 'data';
403     # reconsume
404    
405     !!!emit ($self->{current_token}); # start tag or end tag
406     undef $self->{current_token};
407    
408     redo A;
409     } else {
410     $self->{current_attribute} = {name => chr ($self->{next_input_character}),
411     value => ''};
412     $self->{state} = 'attribute name';
413     !!!next-input-character;
414     redo A;
415     }
416     } elsif ($self->{state} eq 'attribute name') {
417     my $before_leave = sub {
418     if (exists $self->{current_token}->{attribute} # start tag or end tag
419     ->{$self->{current_attribute}->{name}}) { # MUST
420     !!!parse-error;
421     ## Discard $self->{current_attribute} # MUST
422     } else {
423     $self->{current_token}->{attribute}->{$self->{current_attribute}->{name}}
424     = $self->{current_attribute};
425     }
426     undef $self->{current_attribute};
427     }; # $before_leave
428    
429     if ($self->{next_input_character} == 0x0009 or # HT
430     $self->{next_input_character} == 0x000A or # LF
431     $self->{next_input_character} == 0x000B or # VT
432     $self->{next_input_character} == 0x000C or # FF
433     $self->{next_input_character} == 0x0020) { # SP
434     $before_leave->();
435     $self->{state} = 'after attribute name';
436     !!!next-input-character;
437     redo A;
438     } elsif ($self->{next_input_character} == 0x003D) { # =
439     $before_leave->();
440     $self->{state} = 'before attribute value';
441     !!!next-input-character;
442     redo A;
443     } elsif ($self->{next_input_character} == 0x003E) { # >
444     $before_leave->();
445     if ($self->{current_token}->{type} eq 'start tag') {
446     $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
447     } elsif ($self->{current_token}->{type} eq 'end tag') {
448     $self->{content_model_flag} = 'PCDATA'; # MUST
449     if ($self->{current_token}->{attribute}) {
450     !!!parse-error;
451     }
452     } else {
453     die "$0: $self->{current_token}->{type}: Unknown token type";
454     }
455     $self->{state} = 'data';
456     !!!next-input-character;
457    
458     !!!emit ($self->{current_token}); # start tag or end tag
459     undef $self->{current_token};
460    
461     redo A;
462     } elsif (0x0041 <= $self->{next_input_character} and
463     $self->{next_input_character} <= 0x005A) { # A..Z
464     $self->{current_attribute}->{name} .= chr ($self->{next_input_character} + 0x0020);
465     ## Stay in the state
466     !!!next-input-character;
467     redo A;
468     } elsif ($self->{next_input_character} == 0x002F) { # /
469     $before_leave->();
470     !!!next-input-character;
471     if ($self->{next_input_character} == 0x003E and # >
472     $self->{current_token}->{type} eq 'start tag' and
473     $permitted_slash_tag_name->{$self->{current_token}->{tag_name}}) {
474     # permitted slash
475     #
476     } else {
477     !!!parse-error;
478     }
479     $self->{state} = 'before attribute name';
480     # next-input-character is already done
481     redo A;
482     } elsif ($self->{next_input_character} == 0x003C or # <
483     $self->{next_input_character} == -1) {
484     !!!parse-error;
485     $before_leave->();
486     if ($self->{current_token}->{type} eq 'start tag') {
487     $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
488     } elsif ($self->{current_token}->{type} eq 'end tag') {
489     $self->{content_model_flag} = 'PCDATA'; # MUST
490     if ($self->{current_token}->{attribute}) {
491     !!!parse-error;
492     }
493     } else {
494     die "$0: $self->{current_token}->{type}: Unknown token type";
495     }
496     $self->{state} = 'data';
497     # reconsume
498    
499     !!!emit ($self->{current_token}); # start tag or end tag
500     undef $self->{current_token};
501    
502     redo A;
503     } else {
504     $self->{current_attribute}->{name} .= chr ($self->{next_input_character});
505     ## Stay in the state
506     !!!next-input-character;
507     redo A;
508     }
509     } elsif ($self->{state} eq 'after attribute name') {
510     if ($self->{next_input_character} == 0x0009 or # HT
511     $self->{next_input_character} == 0x000A or # LF
512     $self->{next_input_character} == 0x000B or # VT
513     $self->{next_input_character} == 0x000C or # FF
514     $self->{next_input_character} == 0x0020) { # SP
515     ## Stay in the state
516     !!!next-input-character;
517     redo A;
518     } elsif ($self->{next_input_character} == 0x003D) { # =
519     $self->{state} = 'before attribute value';
520     !!!next-input-character;
521     redo A;
522     } elsif ($self->{next_input_character} == 0x003E) { # >
523     if ($self->{current_token}->{type} eq 'start tag') {
524     $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
525     } elsif ($self->{current_token}->{type} eq 'end tag') {
526     $self->{content_model_flag} = 'PCDATA'; # MUST
527     if ($self->{current_token}->{attribute}) {
528     !!!parse-error;
529     }
530     } else {
531     die "$0: $self->{current_token}->{type}: Unknown token type";
532     }
533     $self->{state} = 'data';
534     !!!next-input-character;
535    
536     !!!emit ($self->{current_token}); # start tag or end tag
537     undef $self->{current_token};
538    
539     redo A;
540     } elsif (0x0041 <= $self->{next_input_character} and
541     $self->{next_input_character} <= 0x005A) { # A..Z
542     $self->{current_attribute} = {name => chr ($self->{next_input_character} + 0x0020),
543     value => ''};
544     $self->{state} = 'attribute name';
545     !!!next-input-character;
546     redo A;
547     } elsif ($self->{next_input_character} == 0x002F) { # /
548     !!!next-input-character;
549     if ($self->{next_input_character} == 0x003E and # >
550     $self->{current_token}->{type} eq 'start tag' and
551     $permitted_slash_tag_name->{$self->{current_token}->{tag_name}}) {
552     # permitted slash
553     #
554     } else {
555     !!!parse-error;
556     }
557     $self->{state} = 'before attribute name';
558     # next-input-character is already done
559     redo A;
560     } elsif ($self->{next_input_character} == 0x003C or # <
561     $self->{next_input_character} == -1) {
562     !!!parse-error;
563     if ($self->{current_token}->{type} eq 'start tag') {
564     $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
565     } elsif ($self->{current_token}->{type} eq 'end tag') {
566     $self->{content_model_flag} = 'PCDATA'; # MUST
567     if ($self->{current_token}->{attribute}) {
568     !!!parse-error;
569     }
570     } else {
571     die "$0: $self->{current_token}->{type}: Unknown token type";
572     }
573     $self->{state} = 'data';
574     # reconsume
575    
576     !!!emit ($self->{current_token}); # start tag or end tag
577     undef $self->{current_token};
578    
579     redo A;
580     } else {
581     $self->{current_attribute} = {name => chr ($self->{next_input_character}),
582     value => ''};
583     $self->{state} = 'attribute name';
584     !!!next-input-character;
585     redo A;
586     }
587     } elsif ($self->{state} eq 'before attribute value') {
588     if ($self->{next_input_character} == 0x0009 or # HT
589     $self->{next_input_character} == 0x000A or # LF
590     $self->{next_input_character} == 0x000B or # VT
591     $self->{next_input_character} == 0x000C or # FF
592     $self->{next_input_character} == 0x0020) { # SP
593     ## Stay in the state
594     !!!next-input-character;
595     redo A;
596     } elsif ($self->{next_input_character} == 0x0022) { # "
597     $self->{state} = 'attribute value (double-quoted)';
598     !!!next-input-character;
599     redo A;
600     } elsif ($self->{next_input_character} == 0x0026) { # &
601     $self->{state} = 'attribute value (unquoted)';
602     ## reconsume
603     redo A;
604     } elsif ($self->{next_input_character} == 0x0027) { # '
605     $self->{state} = 'attribute value (single-quoted)';
606     !!!next-input-character;
607     redo A;
608     } elsif ($self->{next_input_character} == 0x003E) { # >
609     if ($self->{current_token}->{type} eq 'start tag') {
610     $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
611     } elsif ($self->{current_token}->{type} eq 'end tag') {
612     $self->{content_model_flag} = 'PCDATA'; # MUST
613     if ($self->{current_token}->{attribute}) {
614     !!!parse-error;
615     }
616     } else {
617     die "$0: $self->{current_token}->{type}: Unknown token type";
618     }
619     $self->{state} = 'data';
620     !!!next-input-character;
621    
622     !!!emit ($self->{current_token}); # start tag or end tag
623     undef $self->{current_token};
624    
625     redo A;
626     } elsif ($self->{next_input_character} == 0x003C or # <
627     $self->{next_input_character} == -1) {
628     !!!parse-error;
629     if ($self->{current_token}->{type} eq 'start tag') {
630     $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
631     } elsif ($self->{current_token}->{type} eq 'end tag') {
632     $self->{content_model_flag} = 'PCDATA'; # MUST
633     if ($self->{current_token}->{attribute}) {
634     !!!parse-error;
635     }
636     } else {
637     die "$0: $self->{current_token}->{type}: Unknown token type";
638     }
639     $self->{state} = 'data';
640     ## reconsume
641    
642     !!!emit ($self->{current_token}); # start tag or end tag
643     undef $self->{current_token};
644    
645     redo A;
646     } else {
647     $self->{current_attribute}->{value} .= chr ($self->{next_input_character});
648     $self->{state} = 'attribute value (unquoted)';
649     !!!next-input-character;
650     redo A;
651     }
652     } elsif ($self->{state} eq 'attribute value (double-quoted)') {
653     if ($self->{next_input_character} == 0x0022) { # "
654     $self->{state} = 'before attribute name';
655     !!!next-input-character;
656     redo A;
657     } elsif ($self->{next_input_character} == 0x0026) { # &
658     $self->{last_attribute_value_state} = 'attribute value (double-quoted)';
659     $self->{state} = 'entity in attribute value';
660     !!!next-input-character;
661     redo A;
662     } elsif ($self->{next_input_character} == -1) {
663     !!!parse-error;
664     if ($self->{current_token}->{type} eq 'start tag') {
665     $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
666     } elsif ($self->{current_token}->{type} eq 'end tag') {
667     $self->{content_model_flag} = 'PCDATA'; # MUST
668     if ($self->{current_token}->{attribute}) {
669     !!!parse-error;
670     }
671     } else {
672     die "$0: $self->{current_token}->{type}: Unknown token type";
673     }
674     $self->{state} = 'data';
675     ## reconsume
676    
677     !!!emit ($self->{current_token}); # start tag or end tag
678     undef $self->{current_token};
679    
680     redo A;
681     } else {
682     $self->{current_attribute}->{value} .= chr ($self->{next_input_character});
683     ## Stay in the state
684     !!!next-input-character;
685     redo A;
686     }
687     } elsif ($self->{state} eq 'attribute value (single-quoted)') {
688     if ($self->{next_input_character} == 0x0027) { # '
689     $self->{state} = 'before attribute name';
690     !!!next-input-character;
691     redo A;
692     } elsif ($self->{next_input_character} == 0x0026) { # &
693     $self->{last_attribute_value_state} = 'attribute value (single-quoted)';
694     $self->{state} = 'entity in attribute value';
695     !!!next-input-character;
696     redo A;
697     } elsif ($self->{next_input_character} == -1) {
698     !!!parse-error;
699     if ($self->{current_token}->{type} eq 'start tag') {
700     $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
701     } elsif ($self->{current_token}->{type} eq 'end tag') {
702     $self->{content_model_flag} = 'PCDATA'; # MUST
703     if ($self->{current_token}->{attribute}) {
704     !!!parse-error;
705     }
706     } else {
707     die "$0: $self->{current_token}->{type}: Unknown token type";
708     }
709     $self->{state} = 'data';
710     ## reconsume
711    
712     !!!emit ($self->{current_token}); # start tag or end tag
713     undef $self->{current_token};
714    
715     redo A;
716     } else {
717     $self->{current_attribute}->{value} .= chr ($self->{next_input_character});
718     ## Stay in the state
719     !!!next-input-character;
720     redo A;
721     }
722     } elsif ($self->{state} eq 'attribute value (unquoted)') {
723     if ($self->{next_input_character} == 0x0009 or # HT
724     $self->{next_input_character} == 0x000A or # LF
725     $self->{next_input_character} == 0x000B or # HT
726     $self->{next_input_character} == 0x000C or # FF
727     $self->{next_input_character} == 0x0020) { # SP
728     $self->{state} = 'before attribute name';
729     !!!next-input-character;
730     redo A;
731     } elsif ($self->{next_input_character} == 0x0026) { # &
732     $self->{last_attribute_value_state} = 'attribute value (unquoted)';
733     $self->{state} = 'entity in attribute value';
734     !!!next-input-character;
735     redo A;
736     } elsif ($self->{next_input_character} == 0x003E) { # >
737     if ($self->{current_token}->{type} eq 'start tag') {
738     $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
739     } elsif ($self->{current_token}->{type} eq 'end tag') {
740     $self->{content_model_flag} = 'PCDATA'; # MUST
741     if ($self->{current_token}->{attribute}) {
742     !!!parse-error;
743     }
744     } else {
745     die "$0: $self->{current_token}->{type}: Unknown token type";
746     }
747     $self->{state} = 'data';
748     !!!next-input-character;
749    
750     !!!emit ($self->{current_token}); # start tag or end tag
751     undef $self->{current_token};
752    
753     redo A;
754     } elsif ($self->{next_input_character} == 0x003C or # <
755     $self->{next_input_character} == -1) {
756     !!!parse-error;
757     if ($self->{current_token}->{type} eq 'start tag') {
758     $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
759     } elsif ($self->{current_token}->{type} eq 'end tag') {
760     $self->{content_model_flag} = 'PCDATA'; # MUST
761     if ($self->{current_token}->{attribute}) {
762     !!!parse-error;
763     }
764     } else {
765     die "$0: $self->{current_token}->{type}: Unknown token type";
766     }
767     $self->{state} = 'data';
768     ## reconsume
769    
770     !!!emit ($self->{current_token}); # start tag or end tag
771     undef $self->{current_token};
772    
773     redo A;
774     } else {
775     $self->{current_attribute}->{value} .= chr ($self->{next_input_character});
776     ## Stay in the state
777     !!!next-input-character;
778     redo A;
779     }
780     } elsif ($self->{state} eq 'entity in attribute value') {
781     my $token = $self->_tokenize_attempt_to_consume_an_entity;
782    
783     unless (defined $token) {
784     $self->{current_attribute}->{value} .= '&';
785     } else {
786     $self->{current_attribute}->{value} .= $token->{data};
787     ## ISSUE: spec says "append the returned character token to the current attribute's value"
788     }
789    
790     $self->{state} = $self->{last_attribute_value_state};
791     # next-input-character is already done
792     redo A;
793     } elsif ($self->{state} eq 'bogus comment') {
794     ## (only happen if PCDATA state)
795    
796     my $token = {type => 'comment', data => ''};
797    
798     BC: {
799     if ($self->{next_input_character} == 0x003E) { # >
800     $self->{state} = 'data';
801     !!!next-input-character;
802    
803     !!!emit ($token);
804    
805     redo A;
806     } elsif ($self->{next_input_character} == -1) {
807     $self->{state} = 'data';
808     ## reconsume
809    
810     !!!emit ($token);
811    
812     redo A;
813     } else {
814     $token->{data} .= chr ($self->{next_input_character});
815     !!!next-input-character;
816     redo BC;
817     }
818     } # BC
819     } elsif ($self->{state} eq 'markup declaration open') {
820     ## (only happen if PCDATA state)
821    
822     my @next_char;
823     push @next_char, $self->{next_input_character};
824    
825     if ($self->{next_input_character} == 0x002D) { # -
826     !!!next-input-character;
827     push @next_char, $self->{next_input_character};
828     if ($self->{next_input_character} == 0x002D) { # -
829     $self->{current_token} = {type => 'comment', data => ''};
830     $self->{state} = 'comment';
831     !!!next-input-character;
832     redo A;
833     }
834     } elsif ($self->{next_input_character} == 0x0044 or # D
835     $self->{next_input_character} == 0x0064) { # d
836     !!!next-input-character;
837     push @next_char, $self->{next_input_character};
838     if ($self->{next_input_character} == 0x004F or # O
839     $self->{next_input_character} == 0x006F) { # o
840     !!!next-input-character;
841     push @next_char, $self->{next_input_character};
842     if ($self->{next_input_character} == 0x0043 or # C
843     $self->{next_input_character} == 0x0063) { # c
844     !!!next-input-character;
845     push @next_char, $self->{next_input_character};
846     if ($self->{next_input_character} == 0x0054 or # T
847     $self->{next_input_character} == 0x0074) { # t
848     !!!next-input-character;
849     push @next_char, $self->{next_input_character};
850     if ($self->{next_input_character} == 0x0059 or # Y
851     $self->{next_input_character} == 0x0079) { # y
852     !!!next-input-character;
853     push @next_char, $self->{next_input_character};
854     if ($self->{next_input_character} == 0x0050 or # P
855     $self->{next_input_character} == 0x0070) { # p
856     !!!next-input-character;
857     push @next_char, $self->{next_input_character};
858     if ($self->{next_input_character} == 0x0045 or # E
859     $self->{next_input_character} == 0x0065) { # e
860     ## ISSUE: What a stupid code this is!
861     $self->{state} = 'DOCTYPE';
862     !!!next-input-character;
863     redo A;
864     }
865     }
866     }
867     }
868     }
869     }
870     }
871    
872     !!!parse-error;
873     $self->{next_input_character} = shift @next_char;
874     !!!back-next-input-character (@next_char);
875     $self->{state} = 'bogus comment';
876     redo A;
877    
878     ## ISSUE: typos in spec: chacacters, is is a parse error
879     ## ISSUE: spec is somewhat unclear on "is the first character that will be in the comment"; what is "that will be in the comment" is what the algorithm defines, isn't it?
880     } elsif ($self->{state} eq 'comment') {
881     if ($self->{next_input_character} == 0x002D) { # -
882     $self->{state} = 'comment dash';
883     !!!next-input-character;
884     redo A;
885     } elsif ($self->{next_input_character} == -1) {
886     !!!parse-error;
887     $self->{state} = 'data';
888     ## reconsume
889    
890     !!!emit ($self->{current_token}); # comment
891     undef $self->{current_token};
892    
893     redo A;
894     } else {
895     $self->{current_token}->{data} .= chr ($self->{next_input_character}); # comment
896     ## Stay in the state
897     !!!next-input-character;
898     redo A;
899     }
900     } elsif ($self->{state} eq 'comment dash') {
901     if ($self->{next_input_character} == 0x002D) { # -
902     $self->{state} = 'comment end';
903     !!!next-input-character;
904     redo A;
905     } elsif ($self->{next_input_character} == -1) {
906     !!!parse-error;
907     $self->{state} = 'data';
908     ## reconsume
909    
910     !!!emit ($self->{current_token}); # comment
911     undef $self->{current_token};
912    
913     redo A;
914     } else {
915     $self->{current_token}->{data} .= '-' . chr ($self->{next_input_character}); # comment
916     $self->{state} = 'comment';
917     !!!next-input-character;
918     redo A;
919     }
920     } elsif ($self->{state} eq 'comment end') {
921     if ($self->{next_input_character} == 0x003E) { # >
922     $self->{state} = 'data';
923     !!!next-input-character;
924    
925     !!!emit ($self->{current_token}); # comment
926     undef $self->{current_token};
927    
928     redo A;
929     } elsif ($self->{next_input_character} == 0x002D) { # -
930     !!!parse-error;
931     $self->{current_token}->{data} .= '-'; # comment
932     ## Stay in the state
933     !!!next-input-character;
934     redo A;
935     } elsif ($self->{next_input_character} == -1) {
936     !!!parse-error;
937     $self->{state} = 'data';
938     ## reconsume
939    
940     !!!emit ($self->{current_token}); # comment
941     undef $self->{current_token};
942    
943     redo A;
944     } else {
945     !!!parse-error;
946     $self->{current_token}->{data} .= '--' . chr ($self->{next_input_character}); # comment
947     $self->{state} = 'comment';
948     !!!next-input-character;
949     redo A;
950     }
951     } elsif ($self->{state} eq 'DOCTYPE') {
952     if ($self->{next_input_character} == 0x0009 or # HT
953     $self->{next_input_character} == 0x000A or # LF
954     $self->{next_input_character} == 0x000B or # VT
955     $self->{next_input_character} == 0x000C or # FF
956     $self->{next_input_character} == 0x0020) { # SP
957     $self->{state} = 'before DOCTYPE name';
958     !!!next-input-character;
959     redo A;
960     } else {
961     !!!parse-error;
962     $self->{state} = 'before DOCTYPE name';
963     ## reconsume
964     redo A;
965     }
966     } elsif ($self->{state} eq 'before DOCTYPE name') {
967     if ($self->{next_input_character} == 0x0009 or # HT
968     $self->{next_input_character} == 0x000A or # LF
969     $self->{next_input_character} == 0x000B or # VT
970     $self->{next_input_character} == 0x000C or # FF
971     $self->{next_input_character} == 0x0020) { # SP
972     ## Stay in the state
973     !!!next-input-character;
974     redo A;
975     } elsif (0x0061 <= $self->{next_input_character} and
976     $self->{next_input_character} <= 0x007A) { # a..z
977     $self->{current_token} = {type => 'DOCTYPE',
978     name => chr ($self->{next_input_character} - 0x0020),
979     error => 1};
980     $self->{state} = 'DOCTYPE name';
981     !!!next-input-character;
982     redo A;
983     } elsif ($self->{next_input_character} == 0x003E) { # >
984     !!!parse-error;
985     $self->{state} = 'data';
986     !!!next-input-character;
987    
988     !!!emit ({type => 'DOCTYPE', name => '', error => 1});
989    
990     redo A;
991     } elsif ($self->{next_input_character} == -1) {
992     !!!parse-error;
993     $self->{state} = 'data';
994     ## reconsume
995    
996     !!!emit ({type => 'DOCTYPE', name => '', error => 1});
997    
998     redo A;
999     } else {
1000     $self->{current_token} = {type => 'DOCTYPE',
1001     name => chr ($self->{next_input_character}),
1002     error => 1};
1003     $self->{state} = 'DOCTYPE name';
1004     !!!next-input-character;
1005     redo A;
1006     }
1007     } elsif ($self->{state} eq 'DOCTYPE name') {
1008     if ($self->{next_input_character} == 0x0009 or # HT
1009     $self->{next_input_character} == 0x000A or # LF
1010     $self->{next_input_character} == 0x000B or # VT
1011     $self->{next_input_character} == 0x000C or # FF
1012     $self->{next_input_character} == 0x0020) { # SP
1013     $self->{current_token}->{error} = ($self->{current_token}->{name} ne 'HTML'); # DOCTYPE
1014     $self->{state} = 'after DOCTYPE name';
1015     !!!next-input-character;
1016     redo A;
1017     } elsif ($self->{next_input_character} == 0x003E) { # >
1018     $self->{current_token}->{error} = ($self->{current_token}->{name} ne 'HTML'); # DOCTYPE
1019     $self->{state} = 'data';
1020     !!!next-input-character;
1021    
1022     !!!emit ($self->{current_token}); # DOCTYPE
1023     undef $self->{current_token};
1024    
1025     redo A;
1026     } elsif (0x0061 <= $self->{next_input_character} and
1027     $self->{next_input_character} <= 0x007A) { # a..z
1028     $self->{current_token}->{name} .= chr ($self->{next_input_character} - 0x0020); # DOCTYPE
1029     #$self->{current_token}->{error} = ($self->{current_token}->{name} ne 'HTML');
1030     ## Stay in the state
1031     !!!next-input-character;
1032     redo A;
1033     } elsif ($self->{next_input_character} == -1) {
1034     !!!parse-error;
1035     $self->{current_token}->{error} = ($self->{current_token}->{name} ne 'HTML'); # DOCTYPE
1036     $self->{state} = 'data';
1037     ## reconsume
1038    
1039     !!!emit ($self->{current_token});
1040     undef $self->{current_token};
1041    
1042     redo A;
1043     } else {
1044     $self->{current_token}->{name} .= chr ($self->{next_input_character} - 0x0020); # DOCTYPE
1045     #$self->{current_token}->{error} = ($self->{current_token}->{name} ne 'HTML');
1046     ## Stay in the state
1047     !!!next-input-character;
1048     redo A;
1049     }
1050     } elsif ($self->{state} eq 'after DOCTYPE name') {
1051     if ($self->{next_input_character} == 0x0009 or # HT
1052     $self->{next_input_character} == 0x000A or # LF
1053     $self->{next_input_character} == 0x000B or # VT
1054     $self->{next_input_character} == 0x000C or # FF
1055     $self->{next_input_character} == 0x0020) { # SP
1056     ## Stay in the state
1057     !!!next-input-character;
1058     redo A;
1059     } elsif ($self->{next_input_character} == 0x003E) { # >
1060     $self->{state} = 'data';
1061     !!!next-input-character;
1062    
1063     !!!emit ($self->{current_token}); # DOCTYPE
1064     undef $self->{current_token};
1065    
1066     redo A;
1067     } elsif ($self->{next_input_character} == -1) {
1068     !!!parse-error;
1069     $self->{state} = 'data';
1070     ## reconsume
1071    
1072     !!!emit ($self->{current_token}); # DOCTYPE
1073     undef $self->{current_token};
1074    
1075     redo A;
1076     } else {
1077     !!!parse-error;
1078     $self->{current_token}->{error} = 1; # DOCTYPE
1079     $self->{state} = 'bogus DOCTYPE';
1080     !!!next-input-character;
1081     redo A;
1082     }
1083     } elsif ($self->{state} eq 'bogus DOCTYPE') {
1084     if ($self->{next_input_character} == 0x003E) { # >
1085     $self->{state} = 'data';
1086     !!!next-input-character;
1087    
1088     !!!emit ($self->{current_token}); # DOCTYPE
1089     undef $self->{current_token};
1090    
1091     redo A;
1092     } elsif ($self->{next_input_character} == -1) {
1093     !!!parse-error;
1094     $self->{state} = 'data';
1095     ## reconsume
1096    
1097     !!!emit ($self->{current_token}); # DOCTYPE
1098     undef $self->{current_token};
1099    
1100     redo A;
1101     } else {
1102     ## Stay in the state
1103     !!!next-input-character;
1104     redo A;
1105     }
1106     } else {
1107     die "$0: $self->{state}: Unknown state";
1108     }
1109     } # A
1110    
1111     die "$0: _get_next_token: unexpected case";
1112     } # _get_next_token
1113    
1114     sub _tokenize_attempt_to_consume_an_entity ($) {
1115     my $self = shift;
1116     my $r;
1117    
1118     if ($self->{next_input_character} == 0x0023) { # #
1119     !!!next-input-character;
1120     my $num;
1121     if ($self->{next_input_character} == 0x0078 or # x
1122     $self->{next_input_character} == 0x0058) { # X
1123     X: {
1124     my $x_char = $self->{next_input_character};
1125     !!!next-input-character;
1126     if (0x0030 <= $self->{next_input_character} and
1127     $self->{next_input_character} <= 0x0039) { # 0..9
1128     $num ||= 0;
1129     $num *= 0x10;
1130     $num += $self->{next_input_character} - 0x0030;
1131     redo X;
1132     } elsif (0x0061 <= $self->{next_input_character} and
1133     $self->{next_input_character} <= 0x0066) { # a..f
1134     ## ISSUE: the spec says U+0078, which is apparently incorrect
1135     $num ||= 0;
1136     $num *= 0x10;
1137     $num += $self->{next_input_character} - 0x0060 + 9;
1138     redo X;
1139     } elsif (0x0041 <= $self->{next_input_character} and
1140     $self->{next_input_character} <= 0x0046) { # A..F
1141     ## ISSUE: the spec says U+0058, which is apparently incorrect
1142     $num ||= 0;
1143     $num *= 0x10;
1144     $num += $self->{next_input_character} - 0x0040 + 9;
1145     redo X;
1146     } elsif (not defined $num) { # no hexadecimal digit
1147     !!!parse-error;
1148     $self->{next_input_character} = 0x0023; # #
1149     !!!back-next-input-character ($x_char);
1150     last X; ## nothing is returned
1151     } elsif ($self->{next_input_character} == 0x003B) { # ;
1152     !!!next-input-character;
1153     } else {
1154     !!!parse-error;
1155     }
1156    
1157     ## TODO: check the definition for |a valid Unicode character|.
1158     if ($num > 1114111 or $num == 0) {
1159     $num = 0xFFFD; # REPLACEMENT CHARACTER
1160     ## ISSUE: Why this is not an error?
1161     }
1162    
1163     $r = {type => 'character', data => chr $num};
1164     } # X
1165     } else {
1166     D: {
1167     if (0x0030 <= $self->{next_input_character} and
1168     $self->{next_input_character} <= 0x0039) { # 0..9
1169     $num *= 10;
1170     $num += $self->{next_input_character} - 0x0030;
1171     !!!next-input-character;
1172     redo D;
1173     } else {
1174     !!!parse-error;
1175     !!!back-next-input-character ($self->{next_input_character});
1176     $self->{next_input_character} = 0x0023; # #
1177     last D; ## nothing is returned
1178     }
1179    
1180     if ($self->{next_input_character} == 0x003B) { # ;
1181     !!!next-input-character;
1182     } else {
1183     !!!parse-error;
1184     }
1185    
1186     ## TODO: check the definition for |a valid Unicode character|.
1187     if ($num > 1114111 or $num == 0) {
1188     $num = 0xFFFD; # REPLACEMENT CHARACTER
1189     ## ISSUE: Why this is not an error?
1190     }
1191    
1192     $r = {type => 'character', data => chr $num};
1193     } # D
1194     }
1195     !!!consume-entity}
1196     return $r;
1197     } # _tokenize_attempt_to_consume_an_entity
1198    
1199     1;
1200     # $Date:$

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24