/[suikacvs]/markup/html/whatpm/Whatpm/SWML/Parser.pm
Suika

Contents of /markup/html/whatpm/Whatpm/SWML/Parser.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.1 - (hide annotations) (download)
Thu Nov 6 06:57:00 2008 UTC (16 years ago) by wakaba
Branch: MAIN
++ whatpm/Whatpm/ChangeLog	6 Nov 2008 04:01:55 -0000
2008-11-06  Wakaba  <wakaba@suika.fam.cx>

	* SWML/: New directory.

++ whatpm/Whatpm/SWML/ChangeLog	6 Nov 2008 06:56:18 -0000
2008-11-06  Wakaba  <wakaba@suika.fam.cx>

	* Parser.pm: New file.  Tokenizer impelmented, but it does not
	work yet.

	* ChangeLog: New file.

1 wakaba 1.1 package Whatpm::SWML::Parser;
2     use strict;
3    
4    
5     sub new ($) {
6     my $self = bless {
7     }, $_[0];
8     return $self;
9     } # new
10    
11     sub parse_char_string ($$$;$) {
12     my $self = shift;
13     my @s = split /\x0D\x0A?|\x0A/, ref $_[0] ? ${$_[0]} : $_[0], -1;
14    
15     my $doc = $_[1];
16     $doc->inner_html
17     ('<html xmlns="http://www.w3.org/1999/xhtml"><head/><body/></html>');
18     for ($doc,
19     $doc->document_element,
20     $doc->document_element->first_child,
21     $doc->document_element->last_child) {
22     $_->set_user_data (manakai_source_line => 1);
23     $_->set_user_data (manakai_source_column => 1);
24     }
25     $self->{oe} = {node => $doc->document_element->last_child,
26     section_depth => 0,
27     quotation_depth => 0,
28     list_depth => 0};
29    
30     my $_onerror = $_[2] || sub {
31     my %opt = @_;
32     my $r = 'Line ' . $opt{line} . ' column ' . $opt{column} . ': ';
33    
34     if ($opt{token}) {
35     $r .= 'Token ' . (defined $opt{token}->{value}
36     ? $opt{token}->{value} : $opt{token}->{type}) . ': ';
37     }
38    
39     $r .= $opt{type} . ';' . $opt{level};
40    
41     warn $r . "\n";
42     }; # $_onerror
43    
44     my $line = 0;
45     my $column = 0;
46     my $token;
47     my @nt;
48    
49     my $onerror = sub {
50     $_onerror->(line => $line, column => $column, token => $token, @_);
51     }; # $onerror
52    
53     my $continuous_line;
54    
55     my $tokenize_text = sub {
56     my $s = shift; # ref
57    
58     if ($$s =~ s/^\[([0-9]+)\]//) {
59     push @nt, {type => ELEMENT_TOKEN,
60     local_name => 'anchor-end', namespace => $SW09_NS,
61     anchor => $1, content => '[' . $1 . ']'};
62     $column += $+[0] - $-[0];
63     }
64    
65     while (length $$s) {
66     if ($$s =~ s/^\[\[#([a-z-}+)//) {
67     $column = $+[0] - $-[0];
68     my $t = {type => FORM_TOKEN, name => $1,
69     line => $line, column => $column};
70     if ($$s =~ s/^\(([^()\\]*)\)//) {
71     $t->{id} = $1;
72     $column += $+[0] - $-[0];
73     }
74     my @param;
75     while ($$s =~ s/^:/) {
76     if ($$s =~ s/^'((?>[^'\\]|\\.)*)//) {
77     $column += 1 + $+[0] - $-[0];
78     my $n = $1;
79     $n =~ tr/\\//d;
80     push @param, $n;
81     $column++ if $$s =~ s/\A\\\z//;
82     $column++ if $$s =~ s/^'//;
83     } elsif ($$s =~ s/^([^':][^:]*)//) {
84     $column += 1 + $+[0] - $-[0];
85     push @param, $1;
86     }
87     }
88     $t->{parameters} = \@param;
89     $column += 2 if $$s =~ s/^\]\]//;
90     push @nt, $t;
91     } elsif ($$s =~ s/^\[\[//) {
92     push @nt, {type => INLINE_START_TAG_TOKEN};
93     $column += 2;
94     } elsif ($$s =~ s/^\[([A-Z]+)(?>\(([^()\\]*)\))?(?>\@[0-9A-Za-z-]*)?\[//) {
95     push @nt, {type => INLINE_START_TAG_TOKEN,
96     tag_name => $1, classes => $2, language => $3,
97     line => $line, column => $column};
98     $column += $+[0] - $-[0];
99     } elsif ($$s =~ s/^\]\]//) {
100     push @nt, {type => INLINE_END_TAG_TOKEN,
101     line => $line, column => $column};
102     $column += 2;
103     } elsif ($$s =~ s/^(\]?)<([0-9A-Za-z%+._-]+)://) {
104     my $t = {type => $1 ? INLINE_END_TAG_TOKEN : ELEMENT_TOKEN,
105     res_scheme => $2, res_parameter => '',
106     line => $line, column => $column};
107     $column += $+[0] - $-[0];
108    
109     while (length $$s) {
110     if ($$s =~ s/^([^>"]+)//) {
111     $t->{res_parameter} .= $1;
112     $column += $+[0] - $-[0];
113     } elsif ($$s =~ s/^("(?>[^"\\]|\\.)*)//) {
114     $t->{res_parameter} .= $1;
115     $column += $+[0] - $-[0];
116     $column++ if $$s =~ s/\A\\\z//;
117     $column++ if $$s =~ s/^"//;
118     } else {
119     last;
120     }
121     }
122    
123     $column++ if $$s =~ s/^>//;
124    
125     $t->{content} = $t->{res_scheme} . ':' . $t->{res_parameter};
126     if ($t->{res_scheme} !~ /[A-Z]/) {
127     $t->{res_parameter} = $t->{content};
128     $t->{res_scheme} = 'URI';
129     }
130    
131     if ($t->{type} == INLINE_END_TAG_TOKEN) {
132     $column++ if $$s =~ s/^\]//;
133     } else {
134     $t->{local_name} = 'anchor-external';
135     $t->{namespace} = $SW09_NS;
136     }
137     push @nt, $t;
138     } elsif ($$s =~ s/^\]>>([0-9]+)\]//) {
139     push @nt, {type => INLINE_END_TAG_TOKEN,
140     anchor => $1,
141     line => $line, column => $column};
142     $column += $+[0] - $-[0];
143     } elsif ($$s =~ s/^\][\x09\x20]*(?>\@([0-9a-zA-Z-]*))?\[//) {
144     push @nt, {type => INLINE_MIDDLE_TAG_TOKEN,
145     language => $1,
146     line => $line, column => $column};
147     $column += $+[0] - $-[0];
148     } elsif ($$s =~ s/\^''('?)//) {
149     push @nt, {type => $1 ? STRONG_TOKEN : EMPHASIS_TOKEN,
150     line => $line, column => $column};
151     $column += $+[0] - $-[0];
152     } elsif ($$s =~ s/^>>([0-9]+)//) {
153     push @nt, {type => ELEMENT_TOKEN,
154     local_name => 'anchor-internal', namespace => $SW09_NS,
155     anchor => $1,
156     line => $line, column => $column};
157     $column += $+[0] - $-[0];
158     } elsif ($$s =~ s/^__&&//) {
159     if ($$s =~ s/^(.+?)&&__//) {
160     push @nt, {type => ELEMENT_TOKEN,
161     local_name => 'replace', namespace => $SW09_NS,
162     by => $1,
163     line => $line, column => $column};
164     $column += 4 + $+[0] - $-[0];
165     } else {
166     push @nt, {type => CHARACTER_TOKEN,
167     data => '__&&',
168     line => $line, column => $column};
169     $column += 4;
170     }
171     } elsif ($$s =~ s/^([^<>\['_]+)//) {
172     push @nt, {type => CHARACTER_TOKEN, data => $1,
173     line => $line, column => $column};
174     $column += $+[0] - $-[0];
175     } else {
176     push @nt, {type => CHARACTER_TOKEN, data => substr ($$s, 0, 1),
177     line => $line, column => $column};
178     substr ($$s, 0, 1) = '';
179     $column++;
180     }
181     }
182     }; # $tokenize_text
183    
184     my $get_next_token = sub {
185     if (@nt) {
186     return shift @nt;
187     }
188    
189     if (not @s) {
190     return {type => END_OF_FILE_TOKEN, line => $line, column => $column};
191     }
192    
193     my $s = shift @s;
194     ($line, $column) = ($line + 1, 1);
195     if ($s eq '') {
196     undef $continuous_line;
197     return {type => EMPTY_LINE_TOKEN, line => $line, column => $column};
198     } elsif ($s =~ /^[\x09\x20]/) {
199     push @nt, {type => PREFORMATTED_START_TOKEN,
200     line => $line, column => $column};
201     $tokenize_text->(\$s);
202     while (@s) {
203     my $s = shift @s;
204     ($line, $column) = ($line + 1, 1);
205     if ($s eq '') {
206     push @nt, {type => PREFORMATTED_END_TOKEN,
207     line => $line, column => $column};
208     unshift @s, $s;
209     $line--;
210     last;
211     } elsif ($s =~ /\A\](INS|DEL)\][\x09\x20]*\z/) {
212     push @nt, {type => PREFORMATTED_END_TOKEN,
213     line => $line, column => $column};
214     push @nt, {type => BLOCK_END_TAG_TOKEN, tag_name => $1,
215     line => $line, column => $column};
216     last;
217     } else {
218     push @nt, {type => CHARACTER_TOKEN, data => "\x0A",
219     line => $line, column => $column};
220     $tokenize_text->(\$s);
221     }
222     }
223     return shift @nt;
224     } elsif ($s =~ s/^(\*+)\s*//) {
225     push @nt, {type => HEADING_START_TOKEN, depth => length $1,
226     line => $line, column => $column};
227     $column += $+[0] - $-[0];
228     $tokenize_text->(\$s);
229     push @nt, {type => HEADING_END_TOKEN,
230     line => $line, column => $column};
231     undef $continuous_line;
232     return shift @nt;
233     } elsif ($s =~ s/^([-=]+)\s*//) {
234     push @nt, {type => LIST_START_TOKEN, depth => $1,
235     line => $line, column => $column};
236     $column += $+[0] - $-[0];
237     $tokenize_text->(\$s);
238     $continuous_line = 1;
239     return shift @nt;
240     } elsif ($s =~ s/^:([^:]*)//) {
241     my $name = $1;
242     if ($s eq '') {
243     push @nt, {type => CHARACTER_TOKEN, data => ':',
244     line => $line, column => $column};
245     $column++;
246     $tokenize_text->(\$name);
247     } else {
248     my $real_column = $column + 1 + length $name;
249     push @nt, {type => LABELED_LIST_START_TOKEN,
250     line => $line, column => $column};
251     $name =~ s/\A[\x09\x20]*//;
252     $column += 1 + $+[0] - $-[0];
253     $name =~ s/[\x09\x20]+\z//;
254     $tokenize_text->(\$s);
255     $column = $real_column;
256     push @nt, {type => LABELED_LIST_MIDDLE_TOKEN,
257     line => $line, column => $column};
258     $column += $+[0] - $-[0] if $data =~ s/^:[\x09\x20]*//;
259     $tokenize_text->(\$s);
260     }
261     $continuous_line = 1;
262     return shift @nt;
263     } elsif ($s =~ s/^(>+)//) {
264     my $depth = length $1;
265     if ($depth == 2 and $s =~ /^[0-9]/) {
266     push @nt, {type => CHARACTER_TOKEN, data => "\x0A",
267     line => $line, column => $column}
268     if $continuous_line;
269     $s = '>>' . $s;
270     $tokenize_text->(\$s);
271     } else {
272     push @nt, {type => QUOTATION_START_TOKEN, depth => $depth,
273     line => $line, column => $column};
274     $column += $depth;
275     $column += $+[0] - $-[0] if $s =~ s/^[\x09\x20]+//;
276     if ($s =~ s/^\@\@[\x09\x20]*//) {
277     push @nt, {type => EDITORIAL_NOTE_START_TOKEN,
278     line => $line, column => $column};
279     $column += $+[0] - $-[0];
280     } elsif ($s =~ s/^;;[\x09\x20]*//) {
281     push @nt, {type => COMMENT_PARAGRAPH_START_TOKEN,
282     line => $line, column => $column};
283     $column += $+[0] - $-[0];
284     }
285     $tokenize_text->(\$s);
286     }
287     $continuous_line = 1;
288     return shift @nt;
289     } elsif ($s =~ /\A\[(INS|DEL)(?>\(([^()\\]*)\))?\[[\x09\x20]*\z/) {
290     undef $continuous_line;
291     return {type => BLOCK_START_TAG_TOKEN, tag_name => $1,
292     classes => $2,
293     line => $line, column => $column};
294     } elsif ($s =~ /\A\[PRE(?>\(([^()\\]*)\))?\[[\x09\x20]*\z/) {
295     undef $continuous_line;
296     push @nt, {type => BLOCK_START_TAG_TOKEN, tag_name => 'PRE',
297     classes => $1,
298     line => $line, column => $column};
299     while (@s) {
300     my $s = shift @s;
301     ($line, $column) = ($line + 1, 1);
302     if ($s =~ /\A\]PRE\][\x09\x20]*\z/) {
303     push @nt, {type => BLOCK_END_TAG_TOKEN, tag_name => 'PRE',
304     line => $line, column => $column};
305     undef $continuous_line;
306     break;
307     } else {
308     push @nt, {type => CHARACTER_TOKEN, data => "\x0A",
309     line => $line, column => $column}
310     if $continuous_line;
311     $tokenize_text->(\$s);
312     $continuous_line = 1;
313     }
314     }
315     return shift @nt;
316     } elsif ($s =~ s/^\@\@[\x09\x20]*//) {
317     push @nt, {type => EDITORIAL_NOTE_START_TOKEN,
318     line => $line, column => $column};
319     $column += $+[0] - $-[0];
320     $tokenize_text->(\$s);
321     $continuous_line = 1;
322     return shift @nt;
323     } elsif ($s =~ s/^;;[\x09\x20]*//) {
324     push @nt, {type => COMMENT_PARAGRAPH_START_TOKEN,
325     line => $line, column => $column};
326     $column += $+[0] - $-[0];
327     $tokenize_text->(\$s);
328     undef $continuous_line;
329     return shift @nt;
330     } elsif ($s =~ /\A\](INS|DEL)\][\x09\x20]*\z/) {
331     $continuous_line = 1;
332     return {type => BLOCK_END_TAG_TOKEN, tag_name => $1,
333     line => $line, column => $column};
334     } elsif ($s =~ /^,/) {
335     push @nt, {type => TABLE_ROW_START_TOKEN,
336     line => $line, column => $column};
337     while ($s =~ s/^,[\x09\x20]*//) {
338     $column += $+[0] - $-[0];
339     my $cell;
340     my $cell_quoted;
341     my $column_quoted = $column;
342     my $column_cell = $column;
343     if ($s =~ s/^"//) {
344     $s =~ s/^((?>[^"\\]|\\.)*)//;
345     $cell_quoted = $1;
346     $column += 1 + length $cell_quoted;
347     $cell_quoted =~ tr/\\//d;
348     $column++ if $s =~ s/\A\\\z//;
349     $column++ if $s =~ s/^"//;
350     }
351     if ($s =~ s/^([^,]+)//) {
352     $cell = $1;
353     $column += length $cell;
354     $cell =~ s/[\x09\x20]+\z//;
355     }
356     if (not defined $cell_quoted and defined $cell and
357     $cell eq '==') {
358     push @nt, {type => TABLE_COLSPAN_CELL_TOKEN,
359     line => $line, column => $column_cell};
360     } else {
361     push @nt, {type => TABLE_CELL_START_TOKEN,
362     line => $line,
363     column => defined $column_quoted ? $column_quoted: $column_cell};
364     my $real_column = $column;
365     $column = $column_quoted + 1;
366     $tokenize_text->(\$cell_quoted) if defined $cell_quoted;
367     ## NOTE: When a quoted-pair is used, column numbers
368     ## reported in this $tokenize_text call might be wrong.
369     $column = $column_cell;
370     $tokenize_text->(\$cell) if defined $cell;
371     $column = $column_quoted;
372     push @nt, {type => TABLE_CELL_END_TOKEN,
373     line => $line,
374     column => $column};
375     }
376     }
377     push @nt, {type => TABLE_ROW_END_TOKEN,
378     line => $line, column => $column};
379     undef $continuous_line;
380     return shift @nt;
381     } elsif ($s eq '__IMAGE__') {
382     my $image = $doc->create_element_ns ($NS_SW09, 'image');
383     $_->set_user_data (manakai_source_line => $line);
384     $_->set_user_data (manakai_source_column => 1);
385     $image->text_content (join "\x0A", $s, @s);
386     ($line, $column) = ($line + @s, 1);
387     @s = ();
388     $doc->document_element->append_child ($image);
389     return {type => END_OF_FILE_TOKEN,
390     line => $line, column => $column};
391     } else {
392     push @nt, {type => CHARACTER_TOKEN, data => "\x0A",
393     line => $line, column => $column} if $continuous_line;
394     $tokenize_text->(\$s);
395     $continuous_line = 1;
396     return shift @nt;
397     }
398     }; # $get_next_token_body
399    
400     ## NOTE: The "initial" mode.
401     if (@s and $s[0] =~ /^#\?/) {
402     ## NOTE: "Parse a magic line".
403    
404     my $s = shift @s;
405    
406     ## TODO:...
407    
408     $line = 2;
409     $column = 0;
410     }
411    
412     ## NOTE: Switched to the "body" mode.
413     $get_next_token->();
414    
415     } # parse_char_string
416    
417     1;

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24