/[suikacvs]/markup/html/whatpm/Whatpm/XML/Parser.pm.src
Suika

Contents of /markup/html/whatpm/Whatpm/XML/Parser.pm.src

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.1 - (hide annotations) (download) (as text)
Tue Oct 14 04:32:50 2008 UTC (16 years, 9 months ago) by wakaba
Branch: MAIN
File MIME type: application/x-wais-source
++ whatpm/Whatpm/HTML/ChangeLog	14 Oct 2008 04:28:43 -0000
	* Tokenizer.pm.src: Make *_TOKEN (token type constants)
	exportable.  New token types, PI_TOKEN for XML and ABORT_TOKEN for
	document.write() or incremental parsing, are added for future
	extensions.

2008-10-14  Wakaba  <wakaba@suika.fam.cx>

++ whatpm/Whatpm/XML/ChangeLog	14 Oct 2008 04:27:29 -0000
2008-10-14  Wakaba  <wakaba@suika.fam.cx>

	* Makefile, Parser.pm.src: New files.

1 wakaba 1.1 package Whatpm::XML::Parser;
2     use strict;
3    
4     push our @ISA, 'Whatpm::HTML';
5     use Whatpm::HTML::Tokenizer qw/:token/;
6    
7     sub parse_char_string ($$$;$$) {
8     #my ($self, $s, $doc, $onerror, $get_wrapper) = @_;
9     my $self = shift;
10     my $s = ref $_[0] ? $_[0] : \($_[0]);
11     require Whatpm::Charset::DecodeHandle;
12     my $input = Whatpm::Charset::DecodeHandle::CharString->new ($s);
13     return $self->parse_char_stream ($input, @_[1..$#_]);
14     } # parse_char_string
15    
16     sub parse_char_stream ($$$;$$) {
17     my $self = ref $_[0] ? shift : shift->new;
18     my $input = $_[0];
19     $self->{document} = $_[1];
20     @{$self->{document}->child_nodes} = ();
21    
22     ## NOTE: |set_inner_html| copies most of this method's code
23    
24     $self->{confident} = 1 unless exists $self->{confident};
25     $self->{document}->input_encoding ($self->{input_encoding})
26     if defined $self->{input_encoding};
27     ## TODO: |{input_encoding}| is needless?
28    
29     $self->{line_prev} = $self->{line} = 1;
30     $self->{column_prev} = -1;
31     $self->{column} = 0;
32     $self->{set_nc} = sub {
33     my $self = shift;
34    
35     my $char = '';
36     if (defined $self->{next_nc}) {
37     $char = $self->{next_nc};
38     delete $self->{next_nc};
39     $self->{nc} = ord $char;
40     } else {
41     $self->{char_buffer} = '';
42     $self->{char_buffer_pos} = 0;
43    
44     my $count = $input->manakai_read_until
45     ($self->{char_buffer}, qr/[^\x00\x0A\x0D]/, $self->{char_buffer_pos});
46     if ($count) {
47     $self->{line_prev} = $self->{line};
48     $self->{column_prev} = $self->{column};
49     $self->{column}++;
50     $self->{nc}
51     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
52     return;
53     }
54    
55     if ($input->read ($char, 1)) {
56     $self->{nc} = ord $char;
57     } else {
58     $self->{nc} = -1;
59     return;
60     }
61     }
62    
63     ($self->{line_prev}, $self->{column_prev})
64     = ($self->{line}, $self->{column});
65     $self->{column}++;
66    
67     if ($self->{nc} == 0x000A) { # LF
68     !!!cp ('j1');
69     $self->{line}++;
70     $self->{column} = 0;
71     } elsif ($self->{nc} == 0x000D) { # CR
72     !!!cp ('j2');
73     ## TODO: support for abort/streaming
74     my $next = '';
75     if ($input->read ($next, 1) and $next ne "\x0A") {
76     $self->{next_nc} = $next;
77     }
78     $self->{nc} = 0x000A; # LF # MUST
79     $self->{line}++;
80     $self->{column} = 0;
81     } elsif ($self->{nc} == 0x0000) { # NULL
82     !!!cp ('j4');
83     !!!parse-error (type => 'NULL');
84     $self->{nc} = 0xFFFD; # REPLACEMENT CHARACTER # MUST
85     }
86     };
87    
88     $self->{read_until} = sub {
89     #my ($scalar, $specials_range, $offset) = @_;
90     return 0 if defined $self->{next_nc};
91    
92     my $pattern = qr/[^$_[1]\x00\x0A\x0D]/;
93     my $offset = $_[2] || 0;
94    
95     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
96     pos ($self->{char_buffer}) = $self->{char_buffer_pos};
97     if ($self->{char_buffer} =~ /\G(?>$pattern)+/) {
98     substr ($_[0], $offset)
99     = substr ($self->{char_buffer}, $-[0], $+[0] - $-[0]);
100     my $count = $+[0] - $-[0];
101     if ($count) {
102     $self->{column} += $count;
103     $self->{char_buffer_pos} += $count;
104     $self->{line_prev} = $self->{line};
105     $self->{column_prev} = $self->{column} - 1;
106     $self->{nc} = -1;
107     }
108     return $count;
109     } else {
110     return 0;
111     }
112     } else {
113     my $count = $input->manakai_read_until ($_[0], $pattern, $_[2]);
114     if ($count) {
115     $self->{column} += $count;
116     $self->{line_prev} = $self->{line};
117     $self->{column_prev} = $self->{column} - 1;
118     $self->{nc} = -1;
119     }
120     return $count;
121     }
122     }; # $self->{read_until}
123    
124     my $onerror = $_[2] || sub {
125     my (%opt) = @_;
126     my $line = $opt{token} ? $opt{token}->{line} : $opt{line};
127     my $column = $opt{token} ? $opt{token}->{column} : $opt{column};
128     warn "Parse error ($opt{type}) at line $line column $column\n";
129     };
130     $self->{parse_error} = sub {
131     $onerror->(line => $self->{line}, column => $self->{column}, @_);
132     };
133    
134     my $char_onerror = sub {
135     my (undef, $type, %opt) = @_;
136     !!!parse-error (layer => 'encode',
137     line => $self->{line}, column => $self->{column} + 1,
138     %opt, type => $type);
139     }; # $char_onerror
140    
141     if ($_[3]) {
142     $input = $_[3]->($input);
143     $input->onerror ($char_onerror);
144     } else {
145     $input->onerror ($char_onerror) unless defined $input->onerror;
146     }
147    
148     $self->_initialize_tokenizer;
149     $self->_initialize_tree_constructor;
150     $self->_construct_tree;
151     $self->_terminate_tree_constructor;
152    
153     delete $self->{parse_error}; # remove loop
154    
155     return $self->{document};
156     } # parse_char_stream
157    
158     sub new ($) {
159     my $class = shift;
160     my $self = bless {
161     level => {must => 'm',
162     should => 's',
163     warn => 'w',
164     info => 'i',
165     uncertain => 'u'},
166     }, $class;
167     $self->{set_nc} = sub {
168     $self->{nc} = -1;
169     };
170     $self->{parse_error} = sub {
171     #
172     };
173     $self->{change_encoding} = sub {
174     # if ($_[0] is a supported encoding) {
175     # run "change the encoding" algorithm;
176     # throw Whatpm::HTML::RestartParser (charset => $new_encoding);
177     # }
178     };
179     $self->{application_cache_selection} = sub {
180     #
181     };
182     return $self;
183     } # new
184    
185     sub _initialize_tree_constructor ($) {
186     my $self = shift;
187     ## NOTE: $self->{document} MUST be specified before this method is called
188     $self->{document}->strict_error_checking (0);
189     ## TODO: Turn mutation events off # MUST
190     $self->{document}->dom_config
191     ->{'http://suika.fam.cx/www/2006/dom-config/strict-document-children'}
192     = 0;
193     $self->{document}->manakai_is_html (0);
194     $self->{document}->set_user_data (manakai_source_line => 1);
195     $self->{document}->set_user_data (manakai_source_column => 1);
196     } # _initialize_tree_constructor
197    
198     sub _terminate_tree_constructor ($) {
199     my $self = shift;
200     $self->{document}->strict_error_checking (1);
201     $self->{document}->dom_config
202     ->{'http://suika.fam.cx/www/2006/dom-config/strict-document-children'}
203     = 1;
204     ## TODO: Turn mutation events on
205     } # _terminate_tree_constructor
206    
207     ## Tree construction stage
208    
209    
210     ## NOTE: Differences from the XML5 draft are marked as "XML5:".
211    
212     ## XML5: No namespace support.
213    
214     ## XML5: XML5 has "empty tag token". In this implementation, it is
215     ## represented as a start tag token with $token->{self_closing} flag
216     ## set to true.
217    
218     ## XML5: XML5 has "short end tag token". In this implementation, it
219     ## is represented as an end tag token with $token->{tag_name} flag set
220     ## to an empty string.
221    
222     ## XML5: Start, main, end phases. In this implementation, they are
223     ## represented by insertion modes.
224    
225     ## Insertion modes
226     sub INITIAL_IM () { 0 }
227     sub BEFORE_ROOT_ELEMENT_IM () { 1 }
228     sub IN_ELEMENT_IM () { 2 }
229     sub AFTER_ROOT_ELEMENT_IM () { 3 }
230    
231     {
232     my $token; ## TODO: change to $self->{t}
233    
234     sub _construct_tree ($) {
235     my ($self) = @_;
236    
237     !!!next-token;
238    
239     delete $self->{tainted};
240     $self->{open_elements} = [];
241     $self->{insertion_mode} = INITIAL_IM;
242    
243     while (1) {
244     if ($self->{insertion_mode} == IN_ELEMENT_IM) {
245     $self->_tree_in_element;
246     } elsif ($self->{insertion_mode} == AFTER_ROOT_ELEMENT_IM) {
247     $self->_tree_after_root_element;
248     } elsif ($self->{insertion_mode} == BEFORE_ROOT_ELEMENT_IM) {
249     $self->_tree_before_root_element;
250     } elsif ($self->{insertion_mode} == INITIAL_IM) {
251     $self->_tree_initial;
252     } else {
253     die "$0: Unknown XML insertion mode: $self->{insertion_mode}";
254     }
255    
256     last if $token->{type} == ABORT_TOKEN;
257     }
258     } # _construct_tree
259    
260     sub _tree_initial ($) {
261     my $self = shift;
262    
263     B: while (1) {
264     if ($token->{type} == DOCTYPE_TOKEN) {
265     ## XML5: No "DOCTYPE" token.
266    
267     my $doctype = $self->{document}->create_document_type_definition
268     (defined $token->{name} ? $token->{name} : '');
269    
270     ## NOTE: Default value for both |public_id| and |system_id| attributes
271     ## are empty strings, so that we don't set any value in missing cases.
272     $doctype->public_id ($token->{public_identifier})
273     if defined $token->{public_identifier};
274     $doctype->system_id ($token->{system_identifier})
275     if defined $token->{system_identifier};
276    
277     ## TODO: internal_subset
278    
279     $self->{document}->append_child ($doctype);
280    
281     $self->{insertion_mode} = BEFORE_ROOT_ELEMENT_IM;
282     !!!next-token;
283     return;
284     } elsif ($token->{type} == START_TAG_TOKEN or
285     $token->{type} == END_OF_FILE_TOKEN) {
286     $self->{insertion_mode} = BEFORE_ROOT_ELEMENT_IM;
287     ## Reprocess.
288     return;
289     } elsif ($token->{type} == COMMENT_TOKEN) {
290     my $comment = $self->{document}->create_comment ($token->{data});
291     $self->{document}->append_child ($comment);
292    
293     ## Stay in the mode.
294     !!!next-token;
295     next B;
296     } elsif ($token->{type} == PI_TOKEN) {
297     my $pi = $self->{document}->create_processing_instruction
298     ($token->{target}, $token->{data});
299     $self->{document}->append_child ($pi);
300    
301     ## Stay in the mode.
302     !!!next-token;
303     next B;
304     } elsif ($token->{type} == CHARACTER_TOKEN) {
305     if (not $self->{tainted} and
306     $token->{data} =~ s/^([\x09\x0A\x0C\x20]+)//) {
307     #
308     }
309    
310     if (length $token->{data}) {
311     ## XML5: Ignore the token.
312    
313     unless ($self->{tainted}) {
314     !!!parse-error (type => 'text outside of root element',
315     token => $token);
316     $self->{tainted} = 1;
317     }
318    
319     $self->{document}->manakai_append_text ($token->{data});
320     }
321    
322     ## Stay in the mode.
323     !!!next-token;
324     next B;
325     } elsif ($token->{type} == END_TAG_TOKEN) {
326     !!!parse-error (type => 'unmatched end tag',
327     text => $token->{tag_name},
328     token => $token);
329     ## Ignore the token.
330    
331     ## Stay in the mode.
332     !!!next-token;
333     next B;
334     } elsif ($token->{type} == ABORT_TOKEN) {
335     return;
336     } else {
337     die "$0: XML parser initial: Unknown token type $token->{type}";
338     }
339     } # B
340     } # _tree_initial
341    
342     sub _tree_before_root_element ($) {
343     my $self = shift;
344    
345     B: while (1) {
346     if ($token->{type} == START_TAG_TOKEN) {
347     my $ns; ## TODO:
348     my $el = $self->{document}->create_element_ns ($ns, $token->{tag_name});
349     $self->{document}->append_child ($el);
350    
351     if ($token->{self_closing}) {
352     !!!ack ('ack');
353     $self->{insertion_mode} = AFTER_ROOT_ELEMENT_IM;
354     } else {
355     push @{$self->{open_elements}}, [$el, $token->{tag_name}];
356     $self->{insertion_mode} = IN_ELEMENT_IM;
357     }
358    
359     #delete $self->{tainted};
360    
361     !!!next-token;
362     return;
363     } elsif ($token->{type} == COMMENT_TOKEN) {
364     my $comment = $self->{document}->create_comment ($token->{data});
365     $self->{document}->append_child ($comment);
366    
367     ## Stay in the mode.
368     !!!next-token;
369     next B;
370     } elsif ($token->{type} == PI_TOKEN) {
371     my $pi = $self->{document}->create_processing_instruction
372     ($token->{target}, $token->{data});
373     $self->{document}->append_child ($pi);
374    
375     ## Stay in the mode.
376     !!!next-token;
377     next B;
378     } elsif ($token->{type} == CHARACTER_TOKEN) {
379     if (not $self->{tainted} and
380     $token->{data} =~ s/^([\x09\x0A\x0C\x20]+)//) {
381     #
382     }
383    
384     if (length $token->{data}) {
385     ## XML5: Ignore the token.
386    
387     unless ($self->{tainted}) {
388     !!!parse-error (type => 'text outside of root element',
389     token => $token);
390     $self->{tainted} = 1;
391     }
392    
393     $self->{document}->manakai_append_text ($token->{data});
394     }
395    
396     ## Stay in the mode.
397     !!!next-token;
398     next B;
399     } elsif ($token->{type} == END_OF_FILE_TOKEN) {
400     !!!parse-error (type => 'no root element',
401     token => $token);
402    
403     $self->{insertion_mode} = AFTER_ROOT_ELEMENT_IM;
404     ## Reprocess.
405     return;
406     } elsif ($token->{type} == END_TAG_TOKEN) {
407     !!!parse-error (type => 'unmatched end tag',
408     text => $token->{tag_name},
409     token => $token);
410     ## Ignore the token.
411    
412     ## Stay in the mode.
413     !!!next-token;
414     next B;
415     } elsif ($token->{type} == DOCTYPE_TOKEN) {
416     !!!parse-error (type => 'in html:#doctype',
417     token => $token);
418     ## Ignore the token.
419    
420     ## Stay in the mode.
421     !!!next-token;
422     next B;
423     } elsif ($token->{type} == ABORT_TOKEN) {
424     return;
425     } else {
426     die "$0: XML parser initial: Unknown token type $token->{type}";
427     }
428     } # B
429     } # _tree_before_root_element
430    
431     sub _tree_in_element ($) {
432     my $self = shift;
433    
434     B: while (1) {
435     if ($token->{type} == CHARACTER_TOKEN) {
436     $self->{open_elements}->[-1]->[0]->manakai_append_text ($token->{data});
437    
438     ## Stay in the mode.
439     !!!next-token;
440     next B;
441     } elsif ($token->{type} == START_TAG_TOKEN) {
442     my $ns; ## TODO:
443     my $el = $self->{document}->create_element_ns ($ns, $token->{tag_name});
444     $self->{open_elements}->[-1]->[0]->append_child ($el);
445    
446     if ($token->{self_closing}) {
447     !!!ack ('ack');
448     } else {
449     push @{$self->{open_elements}}, [$el, $token->{tag_name}];
450     }
451    
452     ## Stay in the mode.
453     !!!next-token;
454     next B;
455     } elsif ($token->{type} == END_TAG_TOKEN) {
456     if ($self->{tag_name} eq '') {
457     ## Short end tag token.
458     pop @{$self->{open_elements}};
459     } elsif ($self->{open_elements}->[-1]->[1] eq $token->{tag_name}) {
460     pop @{$self->{open_elements}};
461     } else {
462     !!!parse-error (type => 'unmatched end tag',
463     text => $token->{tag_name},
464     token => $token);
465    
466     ## Has an element in scope
467     INSCOPE: for my $i (reverse $#{$self->{open_elements}}..0) {
468     if ($self->{open_elements}->[$i]->[1] eq $token->{tag_name}) {
469     splice @{$self->{open_elements}}, $i;
470     last INSCOPE;
471     }
472     } # INSCOPE
473     }
474    
475     unless (@{$self->{open_elements}}) {
476     $self->{insertion_mode} = AFTER_ROOT_ELEMENT_IM;
477     !!!next-token;
478     return;
479     } else {
480     ## Stay in the state.
481     !!!next-token;
482     redo B;
483     }
484     } elsif ($token->{type} == COMMENT_TOKEN) {
485     my $comment = $self->{document}->create_comment ($token->{data});
486     $self->{open_elements}->[-1]->[0]->append_child ($comment);
487    
488     ## Stay in the mode.
489     !!!next-token;
490     next B;
491     } elsif ($token->{type} == PI_TOKEN) {
492     my $pi = $self->{document}->create_processing_instruction
493     ($token->{target}, $token->{data});
494     $self->{open_elements}->[-1]->[0]->append_child ($pi);
495    
496     ## Stay in the mode.
497     !!!next-token;
498     next B;
499     } elsif ($token->{type} == END_OF_FILE_TOKEN) {
500     !!!parse-error (type => 'in body:#eof',
501     token => $token);
502    
503     $self->{insertion_mode} = AFTER_ROOT_ELEMENT_IM;
504     !!!next-token;
505     return;
506     } elsif ($token->{type} == DOCTYPE_TOKEN) {
507     !!!parse-error (type => 'in html:#doctype',
508     token => $token);
509     ## Ignore the token.
510    
511     ## Stay in the mode.
512     !!!next-token;
513     next B;
514     } elsif ($token->{type} == ABORT_TOKEN) {
515     return;
516     } else {
517     die "$0: XML parser initial: Unknown token type $token->{type}";
518     }
519     } # B
520     } # _tree_in_element
521    
522     sub _tree_after_root_element ($) {
523     my $self = shift;
524    
525     B: while (1) {
526     if ($token->{type} == START_TAG_TOKEN) {
527     !!!parse-error (type => 'second root element',
528     token => $token);
529    
530     ## XML5: Ignore the token.
531    
532     my $ns; ## TODO:
533     my $el = $self->{document}->create_element_ns ($ns, $token->{tag_name});
534     $self->{document}->append_child ($el);
535    
536     if ($token->{self_closing}) {
537     !!!ack ('ack');
538     ## Stay in the mode.
539     } else {
540     push @{$self->{open_elements}}, [$el, $token->{tag_name}];
541     $self->{insertion_mode} = IN_ELEMENT_IM;
542     }
543    
544     #delete $self->{tainted};
545    
546     !!!next-token;
547     return;
548     } elsif ($token->{type} == COMMENT_TOKEN) {
549     my $comment = $self->{document}->create_comment ($token->{data});
550     $self->{document}->append_child ($comment);
551    
552     ## Stay in the mode.
553     !!!next-token;
554     next B;
555     } elsif ($token->{type} == PI_TOKEN) {
556     my $pi = $self->{document}->create_processing_instruction
557     ($token->{target}, $token->{data});
558     $self->{document}->append_child ($pi);
559    
560     ## Stay in the mode.
561     !!!next-token;
562     next B;
563     } elsif ($token->{type} == CHARACTER_TOKEN) {
564     if (not $self->{tainted} and
565     $token->{data} =~ s/^([\x09\x0A\x0C\x20]+)//) {
566     #
567     }
568    
569     if (length $token->{data}) {
570     ## XML5: Ignore the token.
571    
572     unless ($self->{tainted}) {
573     !!!parse-error (type => 'text outside of root element',
574     token => $token);
575     $self->{tainted} = 1;
576     }
577    
578     $self->{document}->manakai_append_text ($token->{data});
579     }
580    
581     ## Stay in the mode.
582     !!!next-token;
583     next B;
584     } elsif ($token->{type} == END_OF_FILE_TOKEN) {
585     ## Stop parsing.
586    
587     ## TODO: implement "stop parsing".
588    
589     $token = {type => ABORT_TOKEN};
590     return;
591     } elsif ($token->{type} == END_TAG_TOKEN) {
592     !!!parse-error (type => 'unmatched end tag',
593     text => $token->{tag_name},
594     token => $token);
595     ## Ignore the token.
596    
597     ## Stay in the mode.
598     !!!next-token;
599     next B;
600     } elsif ($token->{type} == DOCTYPE_TOKEN) {
601     !!!parse-error (type => 'in html:#doctype',
602     token => $token);
603     ## Ignore the token.
604    
605     ## Stay in the mode.
606     !!!next-token;
607     next B;
608     } elsif ($token->{type} == ABORT_TOKEN) {
609     return;
610     } else {
611     die "$0: XML parser initial: Unknown token type $token->{type}";
612     }
613     } # B
614     } # _tree_after_root_element
615    
616     }
617    
618     1;

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24