/[suikacvs]/markup/html/whatpm/Whatpm/XML/Parser.pm
Suika

Contents of /markup/html/whatpm/Whatpm/XML/Parser.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.1 - (show annotations) (download)
Tue Oct 14 04:32:50 2008 UTC (17 years, 8 months ago) by wakaba
Branch: MAIN
++ whatpm/Whatpm/HTML/ChangeLog	14 Oct 2008 04:28:43 -0000
	* Tokenizer.pm.src: Make *_TOKEN (token type constants)
	exportable.  New token types, PI_TOKEN for XML and ABORT_TOKEN for
	document.write() or incremental parsing, are added for future
	extensions.

2008-10-14  Wakaba  <wakaba@suika.fam.cx>

++ whatpm/Whatpm/XML/ChangeLog	14 Oct 2008 04:27:29 -0000
2008-10-14  Wakaba  <wakaba@suika.fam.cx>

	* Makefile, Parser.pm.src: New files.

1 package Whatpm::XML::Parser;
2 use strict;
3
4 push our @ISA, 'Whatpm::HTML';
5 use Whatpm::HTML::Tokenizer qw/:token/;
6
7 sub parse_char_string ($$$;$$) {
8 #my ($self, $s, $doc, $onerror, $get_wrapper) = @_;
9 my $self = shift;
10 my $s = ref $_[0] ? $_[0] : \($_[0]);
11 require Whatpm::Charset::DecodeHandle;
12 my $input = Whatpm::Charset::DecodeHandle::CharString->new ($s);
13 return $self->parse_char_stream ($input, @_[1..$#_]);
14 } # parse_char_string
15
16 sub parse_char_stream ($$$;$$) {
17 my $self = ref $_[0] ? shift : shift->new;
18 my $input = $_[0];
19 $self->{document} = $_[1];
20 @{$self->{document}->child_nodes} = ();
21
22 ## NOTE: |set_inner_html| copies most of this method's code
23
24 $self->{confident} = 1 unless exists $self->{confident};
25 $self->{document}->input_encoding ($self->{input_encoding})
26 if defined $self->{input_encoding};
27 ## TODO: |{input_encoding}| is needless?
28
29 $self->{line_prev} = $self->{line} = 1;
30 $self->{column_prev} = -1;
31 $self->{column} = 0;
32 $self->{set_nc} = sub {
33 my $self = shift;
34
35 my $char = '';
36 if (defined $self->{next_nc}) {
37 $char = $self->{next_nc};
38 delete $self->{next_nc};
39 $self->{nc} = ord $char;
40 } else {
41 $self->{char_buffer} = '';
42 $self->{char_buffer_pos} = 0;
43
44 my $count = $input->manakai_read_until
45 ($self->{char_buffer}, qr/[^\x00\x0A\x0D]/, $self->{char_buffer_pos});
46 if ($count) {
47 $self->{line_prev} = $self->{line};
48 $self->{column_prev} = $self->{column};
49 $self->{column}++;
50 $self->{nc}
51 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
52 return;
53 }
54
55 if ($input->read ($char, 1)) {
56 $self->{nc} = ord $char;
57 } else {
58 $self->{nc} = -1;
59 return;
60 }
61 }
62
63 ($self->{line_prev}, $self->{column_prev})
64 = ($self->{line}, $self->{column});
65 $self->{column}++;
66
67 if ($self->{nc} == 0x000A) { # LF
68
69 $self->{line}++;
70 $self->{column} = 0;
71 } elsif ($self->{nc} == 0x000D) { # CR
72
73 ## TODO: support for abort/streaming
74 my $next = '';
75 if ($input->read ($next, 1) and $next ne "\x0A") {
76 $self->{next_nc} = $next;
77 }
78 $self->{nc} = 0x000A; # LF # MUST
79 $self->{line}++;
80 $self->{column} = 0;
81 } elsif ($self->{nc} == 0x0000) { # NULL
82
83 $self->{parse_error}->(level => $self->{level}->{must}, type => 'NULL');
84 $self->{nc} = 0xFFFD; # REPLACEMENT CHARACTER # MUST
85 }
86 };
87
88 $self->{read_until} = sub {
89 #my ($scalar, $specials_range, $offset) = @_;
90 return 0 if defined $self->{next_nc};
91
92 my $pattern = qr/[^$_[1]\x00\x0A\x0D]/;
93 my $offset = $_[2] || 0;
94
95 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
96 pos ($self->{char_buffer}) = $self->{char_buffer_pos};
97 if ($self->{char_buffer} =~ /\G(?>$pattern)+/) {
98 substr ($_[0], $offset)
99 = substr ($self->{char_buffer}, $-[0], $+[0] - $-[0]);
100 my $count = $+[0] - $-[0];
101 if ($count) {
102 $self->{column} += $count;
103 $self->{char_buffer_pos} += $count;
104 $self->{line_prev} = $self->{line};
105 $self->{column_prev} = $self->{column} - 1;
106 $self->{nc} = -1;
107 }
108 return $count;
109 } else {
110 return 0;
111 }
112 } else {
113 my $count = $input->manakai_read_until ($_[0], $pattern, $_[2]);
114 if ($count) {
115 $self->{column} += $count;
116 $self->{line_prev} = $self->{line};
117 $self->{column_prev} = $self->{column} - 1;
118 $self->{nc} = -1;
119 }
120 return $count;
121 }
122 }; # $self->{read_until}
123
124 my $onerror = $_[2] || sub {
125 my (%opt) = @_;
126 my $line = $opt{token} ? $opt{token}->{line} : $opt{line};
127 my $column = $opt{token} ? $opt{token}->{column} : $opt{column};
128 warn "Parse error ($opt{type}) at line $line column $column\n";
129 };
130 $self->{parse_error} = sub {
131 $onerror->(line => $self->{line}, column => $self->{column}, @_);
132 };
133
134 my $char_onerror = sub {
135 my (undef, $type, %opt) = @_;
136 $self->{parse_error}->(level => $self->{level}->{must}, layer => 'encode',
137 line => $self->{line}, column => $self->{column} + 1,
138 %opt, type => $type);
139 }; # $char_onerror
140
141 if ($_[3]) {
142 $input = $_[3]->($input);
143 $input->onerror ($char_onerror);
144 } else {
145 $input->onerror ($char_onerror) unless defined $input->onerror;
146 }
147
148 $self->_initialize_tokenizer;
149 $self->_initialize_tree_constructor;
150 $self->_construct_tree;
151 $self->_terminate_tree_constructor;
152
153 delete $self->{parse_error}; # remove loop
154
155 return $self->{document};
156 } # parse_char_stream
157
158 sub new ($) {
159 my $class = shift;
160 my $self = bless {
161 level => {must => 'm',
162 should => 's',
163 warn => 'w',
164 info => 'i',
165 uncertain => 'u'},
166 }, $class;
167 $self->{set_nc} = sub {
168 $self->{nc} = -1;
169 };
170 $self->{parse_error} = sub {
171 #
172 };
173 $self->{change_encoding} = sub {
174 # if ($_[0] is a supported encoding) {
175 # run "change the encoding" algorithm;
176 # throw Whatpm::HTML::RestartParser (charset => $new_encoding);
177 # }
178 };
179 $self->{application_cache_selection} = sub {
180 #
181 };
182 return $self;
183 } # new
184
185 sub _initialize_tree_constructor ($) {
186 my $self = shift;
187 ## NOTE: $self->{document} MUST be specified before this method is called
188 $self->{document}->strict_error_checking (0);
189 ## TODO: Turn mutation events off # MUST
190 $self->{document}->dom_config
191 ->{'http://suika.fam.cx/www/2006/dom-config/strict-document-children'}
192 = 0;
193 $self->{document}->manakai_is_html (0);
194 $self->{document}->set_user_data (manakai_source_line => 1);
195 $self->{document}->set_user_data (manakai_source_column => 1);
196 } # _initialize_tree_constructor
197
198 sub _terminate_tree_constructor ($) {
199 my $self = shift;
200 $self->{document}->strict_error_checking (1);
201 $self->{document}->dom_config
202 ->{'http://suika.fam.cx/www/2006/dom-config/strict-document-children'}
203 = 1;
204 ## TODO: Turn mutation events on
205 } # _terminate_tree_constructor
206
207 ## Tree construction stage
208
209
210 ## NOTE: Differences from the XML5 draft are marked as "XML5:".
211
212 ## XML5: No namespace support.
213
214 ## XML5: XML5 has "empty tag token". In this implementation, it is
215 ## represented as a start tag token with $token->{self_closing} flag
216 ## set to true.
217
218 ## XML5: XML5 has "short end tag token". In this implementation, it
219 ## is represented as an end tag token with $token->{tag_name} flag set
220 ## to an empty string.
221
222 ## XML5: Start, main, end phases. In this implementation, they are
223 ## represented by insertion modes.
224
225 ## Insertion modes
226 sub INITIAL_IM () { 0 }
227 sub BEFORE_ROOT_ELEMENT_IM () { 1 }
228 sub IN_ELEMENT_IM () { 2 }
229 sub AFTER_ROOT_ELEMENT_IM () { 3 }
230
231 {
232 my $token; ## TODO: change to $self->{t}
233
234 sub _construct_tree ($) {
235 my ($self) = @_;
236
237 $token = $self->_get_next_token;
238
239 delete $self->{tainted};
240 $self->{open_elements} = [];
241 $self->{insertion_mode} = INITIAL_IM;
242
243 while (1) {
244 if ($self->{insertion_mode} == IN_ELEMENT_IM) {
245 $self->_tree_in_element;
246 } elsif ($self->{insertion_mode} == AFTER_ROOT_ELEMENT_IM) {
247 $self->_tree_after_root_element;
248 } elsif ($self->{insertion_mode} == BEFORE_ROOT_ELEMENT_IM) {
249 $self->_tree_before_root_element;
250 } elsif ($self->{insertion_mode} == INITIAL_IM) {
251 $self->_tree_initial;
252 } else {
253 die "$0: Unknown XML insertion mode: $self->{insertion_mode}";
254 }
255
256 last if $token->{type} == ABORT_TOKEN;
257 }
258 } # _construct_tree
259
260 sub _tree_initial ($) {
261 my $self = shift;
262
263 B: while (1) {
264 if ($token->{type} == DOCTYPE_TOKEN) {
265 ## XML5: No "DOCTYPE" token.
266
267 my $doctype = $self->{document}->create_document_type_definition
268 (defined $token->{name} ? $token->{name} : '');
269
270 ## NOTE: Default value for both |public_id| and |system_id| attributes
271 ## are empty strings, so that we don't set any value in missing cases.
272 $doctype->public_id ($token->{public_identifier})
273 if defined $token->{public_identifier};
274 $doctype->system_id ($token->{system_identifier})
275 if defined $token->{system_identifier};
276
277 ## TODO: internal_subset
278
279 $self->{document}->append_child ($doctype);
280
281 $self->{insertion_mode} = BEFORE_ROOT_ELEMENT_IM;
282 $token = $self->_get_next_token;
283 return;
284 } elsif ($token->{type} == START_TAG_TOKEN or
285 $token->{type} == END_OF_FILE_TOKEN) {
286 $self->{insertion_mode} = BEFORE_ROOT_ELEMENT_IM;
287 ## Reprocess.
288 return;
289 } elsif ($token->{type} == COMMENT_TOKEN) {
290 my $comment = $self->{document}->create_comment ($token->{data});
291 $self->{document}->append_child ($comment);
292
293 ## Stay in the mode.
294 $token = $self->_get_next_token;
295 next B;
296 } elsif ($token->{type} == PI_TOKEN) {
297 my $pi = $self->{document}->create_processing_instruction
298 ($token->{target}, $token->{data});
299 $self->{document}->append_child ($pi);
300
301 ## Stay in the mode.
302 $token = $self->_get_next_token;
303 next B;
304 } elsif ($token->{type} == CHARACTER_TOKEN) {
305 if (not $self->{tainted} and
306 $token->{data} =~ s/^([\x09\x0A\x0C\x20]+)//) {
307 #
308 }
309
310 if (length $token->{data}) {
311 ## XML5: Ignore the token.
312
313 unless ($self->{tainted}) {
314 $self->{parse_error}->(level => $self->{level}->{must}, type => 'text outside of root element',
315 token => $token);
316 $self->{tainted} = 1;
317 }
318
319 $self->{document}->manakai_append_text ($token->{data});
320 }
321
322 ## Stay in the mode.
323 $token = $self->_get_next_token;
324 next B;
325 } elsif ($token->{type} == END_TAG_TOKEN) {
326 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unmatched end tag',
327 text => $token->{tag_name},
328 token => $token);
329 ## Ignore the token.
330
331 ## Stay in the mode.
332 $token = $self->_get_next_token;
333 next B;
334 } elsif ($token->{type} == ABORT_TOKEN) {
335 return;
336 } else {
337 die "$0: XML parser initial: Unknown token type $token->{type}";
338 }
339 } # B
340 } # _tree_initial
341
342 sub _tree_before_root_element ($) {
343 my $self = shift;
344
345 B: while (1) {
346 if ($token->{type} == START_TAG_TOKEN) {
347 my $ns; ## TODO:
348 my $el = $self->{document}->create_element_ns ($ns, $token->{tag_name});
349 $self->{document}->append_child ($el);
350
351 if ($token->{self_closing}) {
352 delete $self->{self_closing};
353 $self->{insertion_mode} = AFTER_ROOT_ELEMENT_IM;
354 } else {
355 push @{$self->{open_elements}}, [$el, $token->{tag_name}];
356 $self->{insertion_mode} = IN_ELEMENT_IM;
357 }
358
359 #delete $self->{tainted};
360
361 $token = $self->_get_next_token;
362 return;
363 } elsif ($token->{type} == COMMENT_TOKEN) {
364 my $comment = $self->{document}->create_comment ($token->{data});
365 $self->{document}->append_child ($comment);
366
367 ## Stay in the mode.
368 $token = $self->_get_next_token;
369 next B;
370 } elsif ($token->{type} == PI_TOKEN) {
371 my $pi = $self->{document}->create_processing_instruction
372 ($token->{target}, $token->{data});
373 $self->{document}->append_child ($pi);
374
375 ## Stay in the mode.
376 $token = $self->_get_next_token;
377 next B;
378 } elsif ($token->{type} == CHARACTER_TOKEN) {
379 if (not $self->{tainted} and
380 $token->{data} =~ s/^([\x09\x0A\x0C\x20]+)//) {
381 #
382 }
383
384 if (length $token->{data}) {
385 ## XML5: Ignore the token.
386
387 unless ($self->{tainted}) {
388 $self->{parse_error}->(level => $self->{level}->{must}, type => 'text outside of root element',
389 token => $token);
390 $self->{tainted} = 1;
391 }
392
393 $self->{document}->manakai_append_text ($token->{data});
394 }
395
396 ## Stay in the mode.
397 $token = $self->_get_next_token;
398 next B;
399 } elsif ($token->{type} == END_OF_FILE_TOKEN) {
400 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no root element',
401 token => $token);
402
403 $self->{insertion_mode} = AFTER_ROOT_ELEMENT_IM;
404 ## Reprocess.
405 return;
406 } elsif ($token->{type} == END_TAG_TOKEN) {
407 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unmatched end tag',
408 text => $token->{tag_name},
409 token => $token);
410 ## Ignore the token.
411
412 ## Stay in the mode.
413 $token = $self->_get_next_token;
414 next B;
415 } elsif ($token->{type} == DOCTYPE_TOKEN) {
416 $self->{parse_error}->(level => $self->{level}->{must}, type => 'in html:#doctype',
417 token => $token);
418 ## Ignore the token.
419
420 ## Stay in the mode.
421 $token = $self->_get_next_token;
422 next B;
423 } elsif ($token->{type} == ABORT_TOKEN) {
424 return;
425 } else {
426 die "$0: XML parser initial: Unknown token type $token->{type}";
427 }
428 } # B
429 } # _tree_before_root_element
430
431 sub _tree_in_element ($) {
432 my $self = shift;
433
434 B: while (1) {
435 if ($token->{type} == CHARACTER_TOKEN) {
436 $self->{open_elements}->[-1]->[0]->manakai_append_text ($token->{data});
437
438 ## Stay in the mode.
439 $token = $self->_get_next_token;
440 next B;
441 } elsif ($token->{type} == START_TAG_TOKEN) {
442 my $ns; ## TODO:
443 my $el = $self->{document}->create_element_ns ($ns, $token->{tag_name});
444 $self->{open_elements}->[-1]->[0]->append_child ($el);
445
446 if ($token->{self_closing}) {
447 delete $self->{self_closing};
448 } else {
449 push @{$self->{open_elements}}, [$el, $token->{tag_name}];
450 }
451
452 ## Stay in the mode.
453 $token = $self->_get_next_token;
454 next B;
455 } elsif ($token->{type} == END_TAG_TOKEN) {
456 if ($self->{tag_name} eq '') {
457 ## Short end tag token.
458 pop @{$self->{open_elements}};
459 } elsif ($self->{open_elements}->[-1]->[1] eq $token->{tag_name}) {
460 pop @{$self->{open_elements}};
461 } else {
462 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unmatched end tag',
463 text => $token->{tag_name},
464 token => $token);
465
466 ## Has an element in scope
467 INSCOPE: for my $i (reverse $#{$self->{open_elements}}..0) {
468 if ($self->{open_elements}->[$i]->[1] eq $token->{tag_name}) {
469 splice @{$self->{open_elements}}, $i;
470 last INSCOPE;
471 }
472 } # INSCOPE
473 }
474
475 unless (@{$self->{open_elements}}) {
476 $self->{insertion_mode} = AFTER_ROOT_ELEMENT_IM;
477 $token = $self->_get_next_token;
478 return;
479 } else {
480 ## Stay in the state.
481 $token = $self->_get_next_token;
482 redo B;
483 }
484 } elsif ($token->{type} == COMMENT_TOKEN) {
485 my $comment = $self->{document}->create_comment ($token->{data});
486 $self->{open_elements}->[-1]->[0]->append_child ($comment);
487
488 ## Stay in the mode.
489 $token = $self->_get_next_token;
490 next B;
491 } elsif ($token->{type} == PI_TOKEN) {
492 my $pi = $self->{document}->create_processing_instruction
493 ($token->{target}, $token->{data});
494 $self->{open_elements}->[-1]->[0]->append_child ($pi);
495
496 ## Stay in the mode.
497 $token = $self->_get_next_token;
498 next B;
499 } elsif ($token->{type} == END_OF_FILE_TOKEN) {
500 $self->{parse_error}->(level => $self->{level}->{must}, type => 'in body:#eof',
501 token => $token);
502
503 $self->{insertion_mode} = AFTER_ROOT_ELEMENT_IM;
504 $token = $self->_get_next_token;
505 return;
506 } elsif ($token->{type} == DOCTYPE_TOKEN) {
507 $self->{parse_error}->(level => $self->{level}->{must}, type => 'in html:#doctype',
508 token => $token);
509 ## Ignore the token.
510
511 ## Stay in the mode.
512 $token = $self->_get_next_token;
513 next B;
514 } elsif ($token->{type} == ABORT_TOKEN) {
515 return;
516 } else {
517 die "$0: XML parser initial: Unknown token type $token->{type}";
518 }
519 } # B
520 } # _tree_in_element
521
522 sub _tree_after_root_element ($) {
523 my $self = shift;
524
525 B: while (1) {
526 if ($token->{type} == START_TAG_TOKEN) {
527 $self->{parse_error}->(level => $self->{level}->{must}, type => 'second root element',
528 token => $token);
529
530 ## XML5: Ignore the token.
531
532 my $ns; ## TODO:
533 my $el = $self->{document}->create_element_ns ($ns, $token->{tag_name});
534 $self->{document}->append_child ($el);
535
536 if ($token->{self_closing}) {
537 delete $self->{self_closing};
538 ## Stay in the mode.
539 } else {
540 push @{$self->{open_elements}}, [$el, $token->{tag_name}];
541 $self->{insertion_mode} = IN_ELEMENT_IM;
542 }
543
544 #delete $self->{tainted};
545
546 $token = $self->_get_next_token;
547 return;
548 } elsif ($token->{type} == COMMENT_TOKEN) {
549 my $comment = $self->{document}->create_comment ($token->{data});
550 $self->{document}->append_child ($comment);
551
552 ## Stay in the mode.
553 $token = $self->_get_next_token;
554 next B;
555 } elsif ($token->{type} == PI_TOKEN) {
556 my $pi = $self->{document}->create_processing_instruction
557 ($token->{target}, $token->{data});
558 $self->{document}->append_child ($pi);
559
560 ## Stay in the mode.
561 $token = $self->_get_next_token;
562 next B;
563 } elsif ($token->{type} == CHARACTER_TOKEN) {
564 if (not $self->{tainted} and
565 $token->{data} =~ s/^([\x09\x0A\x0C\x20]+)//) {
566 #
567 }
568
569 if (length $token->{data}) {
570 ## XML5: Ignore the token.
571
572 unless ($self->{tainted}) {
573 $self->{parse_error}->(level => $self->{level}->{must}, type => 'text outside of root element',
574 token => $token);
575 $self->{tainted} = 1;
576 }
577
578 $self->{document}->manakai_append_text ($token->{data});
579 }
580
581 ## Stay in the mode.
582 $token = $self->_get_next_token;
583 next B;
584 } elsif ($token->{type} == END_OF_FILE_TOKEN) {
585 ## Stop parsing.
586
587 ## TODO: implement "stop parsing".
588
589 $token = {type => ABORT_TOKEN};
590 return;
591 } elsif ($token->{type} == END_TAG_TOKEN) {
592 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unmatched end tag',
593 text => $token->{tag_name},
594 token => $token);
595 ## Ignore the token.
596
597 ## Stay in the mode.
598 $token = $self->_get_next_token;
599 next B;
600 } elsif ($token->{type} == DOCTYPE_TOKEN) {
601 $self->{parse_error}->(level => $self->{level}->{must}, type => 'in html:#doctype',
602 token => $token);
603 ## Ignore the token.
604
605 ## Stay in the mode.
606 $token = $self->_get_next_token;
607 next B;
608 } elsif ($token->{type} == ABORT_TOKEN) {
609 return;
610 } else {
611 die "$0: XML parser initial: Unknown token type $token->{type}";
612 }
613 } # B
614 } # _tree_after_root_element
615
616 }
617
618 1;

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24