1 |
wakaba |
1.1 |
package Whatpm::SWML::Parser; |
2 |
|
|
use strict; |
3 |
|
|
|
4 |
|
|
|
5 |
|
|
sub new ($) { |
6 |
|
|
my $self = bless { |
7 |
|
|
}, $_[0]; |
8 |
|
|
return $self; |
9 |
|
|
} # new |
10 |
|
|
|
11 |
|
|
sub parse_char_string ($$$;$) { |
12 |
|
|
my $self = shift; |
13 |
|
|
my @s = split /\x0D\x0A?|\x0A/, ref $_[0] ? ${$_[0]} : $_[0], -1; |
14 |
|
|
|
15 |
|
|
my $doc = $_[1]; |
16 |
|
|
$doc->inner_html |
17 |
|
|
('<html xmlns="http://www.w3.org/1999/xhtml"><head/><body/></html>'); |
18 |
|
|
for ($doc, |
19 |
|
|
$doc->document_element, |
20 |
|
|
$doc->document_element->first_child, |
21 |
|
|
$doc->document_element->last_child) { |
22 |
|
|
$_->set_user_data (manakai_source_line => 1); |
23 |
|
|
$_->set_user_data (manakai_source_column => 1); |
24 |
|
|
} |
25 |
|
|
$self->{oe} = {node => $doc->document_element->last_child, |
26 |
|
|
section_depth => 0, |
27 |
|
|
quotation_depth => 0, |
28 |
|
|
list_depth => 0}; |
29 |
|
|
|
30 |
|
|
my $_onerror = $_[2] || sub { |
31 |
|
|
my %opt = @_; |
32 |
|
|
my $r = 'Line ' . $opt{line} . ' column ' . $opt{column} . ': '; |
33 |
|
|
|
34 |
|
|
if ($opt{token}) { |
35 |
|
|
$r .= 'Token ' . (defined $opt{token}->{value} |
36 |
|
|
? $opt{token}->{value} : $opt{token}->{type}) . ': '; |
37 |
|
|
} |
38 |
|
|
|
39 |
|
|
$r .= $opt{type} . ';' . $opt{level}; |
40 |
|
|
|
41 |
|
|
warn $r . "\n"; |
42 |
|
|
}; # $_onerror |
43 |
|
|
|
44 |
|
|
my $line = 0; |
45 |
|
|
my $column = 0; |
46 |
|
|
my $token; |
47 |
|
|
my @nt; |
48 |
|
|
|
49 |
|
|
my $onerror = sub { |
50 |
|
|
$_onerror->(line => $line, column => $column, token => $token, @_); |
51 |
|
|
}; # $onerror |
52 |
|
|
|
53 |
|
|
my $continuous_line; |
54 |
|
|
|
55 |
|
|
my $tokenize_text = sub { |
56 |
|
|
my $s = shift; # ref |
57 |
|
|
|
58 |
|
|
if ($$s =~ s/^\[([0-9]+)\]//) { |
59 |
|
|
push @nt, {type => ELEMENT_TOKEN, |
60 |
|
|
local_name => 'anchor-end', namespace => $SW09_NS, |
61 |
|
|
anchor => $1, content => '[' . $1 . ']'}; |
62 |
|
|
$column += $+[0] - $-[0]; |
63 |
|
|
} |
64 |
|
|
|
65 |
|
|
while (length $$s) { |
66 |
|
|
if ($$s =~ s/^\[\[#([a-z-}+)//) { |
67 |
|
|
$column = $+[0] - $-[0]; |
68 |
|
|
my $t = {type => FORM_TOKEN, name => $1, |
69 |
|
|
line => $line, column => $column}; |
70 |
|
|
if ($$s =~ s/^\(([^()\\]*)\)//) { |
71 |
|
|
$t->{id} = $1; |
72 |
|
|
$column += $+[0] - $-[0]; |
73 |
|
|
} |
74 |
|
|
my @param; |
75 |
|
|
while ($$s =~ s/^:/) { |
76 |
|
|
if ($$s =~ s/^'((?>[^'\\]|\\.)*)//) { |
77 |
|
|
$column += 1 + $+[0] - $-[0]; |
78 |
|
|
my $n = $1; |
79 |
|
|
$n =~ tr/\\//d; |
80 |
|
|
push @param, $n; |
81 |
|
|
$column++ if $$s =~ s/\A\\\z//; |
82 |
|
|
$column++ if $$s =~ s/^'//; |
83 |
|
|
} elsif ($$s =~ s/^([^':][^:]*)//) { |
84 |
|
|
$column += 1 + $+[0] - $-[0]; |
85 |
|
|
push @param, $1; |
86 |
|
|
} |
87 |
|
|
} |
88 |
|
|
$t->{parameters} = \@param; |
89 |
|
|
$column += 2 if $$s =~ s/^\]\]//; |
90 |
|
|
push @nt, $t; |
91 |
|
|
} elsif ($$s =~ s/^\[\[//) { |
92 |
|
|
push @nt, {type => INLINE_START_TAG_TOKEN}; |
93 |
|
|
$column += 2; |
94 |
|
|
} elsif ($$s =~ s/^\[([A-Z]+)(?>\(([^()\\]*)\))?(?>\@[0-9A-Za-z-]*)?\[//) { |
95 |
|
|
push @nt, {type => INLINE_START_TAG_TOKEN, |
96 |
|
|
tag_name => $1, classes => $2, language => $3, |
97 |
|
|
line => $line, column => $column}; |
98 |
|
|
$column += $+[0] - $-[0]; |
99 |
|
|
} elsif ($$s =~ s/^\]\]//) { |
100 |
|
|
push @nt, {type => INLINE_END_TAG_TOKEN, |
101 |
|
|
line => $line, column => $column}; |
102 |
|
|
$column += 2; |
103 |
|
|
} elsif ($$s =~ s/^(\]?)<([0-9A-Za-z%+._-]+)://) { |
104 |
|
|
my $t = {type => $1 ? INLINE_END_TAG_TOKEN : ELEMENT_TOKEN, |
105 |
|
|
res_scheme => $2, res_parameter => '', |
106 |
|
|
line => $line, column => $column}; |
107 |
|
|
$column += $+[0] - $-[0]; |
108 |
|
|
|
109 |
|
|
while (length $$s) { |
110 |
|
|
if ($$s =~ s/^([^>"]+)//) { |
111 |
|
|
$t->{res_parameter} .= $1; |
112 |
|
|
$column += $+[0] - $-[0]; |
113 |
|
|
} elsif ($$s =~ s/^("(?>[^"\\]|\\.)*)//) { |
114 |
|
|
$t->{res_parameter} .= $1; |
115 |
|
|
$column += $+[0] - $-[0]; |
116 |
|
|
$column++ if $$s =~ s/\A\\\z//; |
117 |
|
|
$column++ if $$s =~ s/^"//; |
118 |
|
|
} else { |
119 |
|
|
last; |
120 |
|
|
} |
121 |
|
|
} |
122 |
|
|
|
123 |
|
|
$column++ if $$s =~ s/^>//; |
124 |
|
|
|
125 |
|
|
$t->{content} = $t->{res_scheme} . ':' . $t->{res_parameter}; |
126 |
|
|
if ($t->{res_scheme} !~ /[A-Z]/) { |
127 |
|
|
$t->{res_parameter} = $t->{content}; |
128 |
|
|
$t->{res_scheme} = 'URI'; |
129 |
|
|
} |
130 |
|
|
|
131 |
|
|
if ($t->{type} == INLINE_END_TAG_TOKEN) { |
132 |
|
|
$column++ if $$s =~ s/^\]//; |
133 |
|
|
} else { |
134 |
|
|
$t->{local_name} = 'anchor-external'; |
135 |
|
|
$t->{namespace} = $SW09_NS; |
136 |
|
|
} |
137 |
|
|
push @nt, $t; |
138 |
|
|
} elsif ($$s =~ s/^\]>>([0-9]+)\]//) { |
139 |
|
|
push @nt, {type => INLINE_END_TAG_TOKEN, |
140 |
|
|
anchor => $1, |
141 |
|
|
line => $line, column => $column}; |
142 |
|
|
$column += $+[0] - $-[0]; |
143 |
|
|
} elsif ($$s =~ s/^\][\x09\x20]*(?>\@([0-9a-zA-Z-]*))?\[//) { |
144 |
|
|
push @nt, {type => INLINE_MIDDLE_TAG_TOKEN, |
145 |
|
|
language => $1, |
146 |
|
|
line => $line, column => $column}; |
147 |
|
|
$column += $+[0] - $-[0]; |
148 |
|
|
} elsif ($$s =~ s/\^''('?)//) { |
149 |
|
|
push @nt, {type => $1 ? STRONG_TOKEN : EMPHASIS_TOKEN, |
150 |
|
|
line => $line, column => $column}; |
151 |
|
|
$column += $+[0] - $-[0]; |
152 |
|
|
} elsif ($$s =~ s/^>>([0-9]+)//) { |
153 |
|
|
push @nt, {type => ELEMENT_TOKEN, |
154 |
|
|
local_name => 'anchor-internal', namespace => $SW09_NS, |
155 |
|
|
anchor => $1, |
156 |
|
|
line => $line, column => $column}; |
157 |
|
|
$column += $+[0] - $-[0]; |
158 |
|
|
} elsif ($$s =~ s/^__&&//) { |
159 |
|
|
if ($$s =~ s/^(.+?)&&__//) { |
160 |
|
|
push @nt, {type => ELEMENT_TOKEN, |
161 |
|
|
local_name => 'replace', namespace => $SW09_NS, |
162 |
|
|
by => $1, |
163 |
|
|
line => $line, column => $column}; |
164 |
|
|
$column += 4 + $+[0] - $-[0]; |
165 |
|
|
} else { |
166 |
|
|
push @nt, {type => CHARACTER_TOKEN, |
167 |
|
|
data => '__&&', |
168 |
|
|
line => $line, column => $column}; |
169 |
|
|
$column += 4; |
170 |
|
|
} |
171 |
|
|
} elsif ($$s =~ s/^([^<>\['_]+)//) { |
172 |
|
|
push @nt, {type => CHARACTER_TOKEN, data => $1, |
173 |
|
|
line => $line, column => $column}; |
174 |
|
|
$column += $+[0] - $-[0]; |
175 |
|
|
} else { |
176 |
|
|
push @nt, {type => CHARACTER_TOKEN, data => substr ($$s, 0, 1), |
177 |
|
|
line => $line, column => $column}; |
178 |
|
|
substr ($$s, 0, 1) = ''; |
179 |
|
|
$column++; |
180 |
|
|
} |
181 |
|
|
} |
182 |
|
|
}; # $tokenize_text |
183 |
|
|
|
184 |
|
|
my $get_next_token = sub { |
185 |
|
|
if (@nt) { |
186 |
|
|
return shift @nt; |
187 |
|
|
} |
188 |
|
|
|
189 |
|
|
if (not @s) { |
190 |
|
|
return {type => END_OF_FILE_TOKEN, line => $line, column => $column}; |
191 |
|
|
} |
192 |
|
|
|
193 |
|
|
my $s = shift @s; |
194 |
|
|
($line, $column) = ($line + 1, 1); |
195 |
|
|
if ($s eq '') { |
196 |
|
|
undef $continuous_line; |
197 |
|
|
return {type => EMPTY_LINE_TOKEN, line => $line, column => $column}; |
198 |
|
|
} elsif ($s =~ /^[\x09\x20]/) { |
199 |
|
|
push @nt, {type => PREFORMATTED_START_TOKEN, |
200 |
|
|
line => $line, column => $column}; |
201 |
|
|
$tokenize_text->(\$s); |
202 |
|
|
while (@s) { |
203 |
|
|
my $s = shift @s; |
204 |
|
|
($line, $column) = ($line + 1, 1); |
205 |
|
|
if ($s eq '') { |
206 |
|
|
push @nt, {type => PREFORMATTED_END_TOKEN, |
207 |
|
|
line => $line, column => $column}; |
208 |
|
|
unshift @s, $s; |
209 |
|
|
$line--; |
210 |
|
|
last; |
211 |
|
|
} elsif ($s =~ /\A\](INS|DEL)\][\x09\x20]*\z/) { |
212 |
|
|
push @nt, {type => PREFORMATTED_END_TOKEN, |
213 |
|
|
line => $line, column => $column}; |
214 |
|
|
push @nt, {type => BLOCK_END_TAG_TOKEN, tag_name => $1, |
215 |
|
|
line => $line, column => $column}; |
216 |
|
|
last; |
217 |
|
|
} else { |
218 |
|
|
push @nt, {type => CHARACTER_TOKEN, data => "\x0A", |
219 |
|
|
line => $line, column => $column}; |
220 |
|
|
$tokenize_text->(\$s); |
221 |
|
|
} |
222 |
|
|
} |
223 |
|
|
return shift @nt; |
224 |
|
|
} elsif ($s =~ s/^(\*+)\s*//) { |
225 |
|
|
push @nt, {type => HEADING_START_TOKEN, depth => length $1, |
226 |
|
|
line => $line, column => $column}; |
227 |
|
|
$column += $+[0] - $-[0]; |
228 |
|
|
$tokenize_text->(\$s); |
229 |
|
|
push @nt, {type => HEADING_END_TOKEN, |
230 |
|
|
line => $line, column => $column}; |
231 |
|
|
undef $continuous_line; |
232 |
|
|
return shift @nt; |
233 |
|
|
} elsif ($s =~ s/^([-=]+)\s*//) { |
234 |
|
|
push @nt, {type => LIST_START_TOKEN, depth => $1, |
235 |
|
|
line => $line, column => $column}; |
236 |
|
|
$column += $+[0] - $-[0]; |
237 |
|
|
$tokenize_text->(\$s); |
238 |
|
|
$continuous_line = 1; |
239 |
|
|
return shift @nt; |
240 |
|
|
} elsif ($s =~ s/^:([^:]*)//) { |
241 |
|
|
my $name = $1; |
242 |
|
|
if ($s eq '') { |
243 |
|
|
push @nt, {type => CHARACTER_TOKEN, data => ':', |
244 |
|
|
line => $line, column => $column}; |
245 |
|
|
$column++; |
246 |
|
|
$tokenize_text->(\$name); |
247 |
|
|
} else { |
248 |
|
|
my $real_column = $column + 1 + length $name; |
249 |
|
|
push @nt, {type => LABELED_LIST_START_TOKEN, |
250 |
|
|
line => $line, column => $column}; |
251 |
|
|
$name =~ s/\A[\x09\x20]*//; |
252 |
|
|
$column += 1 + $+[0] - $-[0]; |
253 |
|
|
$name =~ s/[\x09\x20]+\z//; |
254 |
|
|
$tokenize_text->(\$s); |
255 |
|
|
$column = $real_column; |
256 |
|
|
push @nt, {type => LABELED_LIST_MIDDLE_TOKEN, |
257 |
|
|
line => $line, column => $column}; |
258 |
|
|
$column += $+[0] - $-[0] if $data =~ s/^:[\x09\x20]*//; |
259 |
|
|
$tokenize_text->(\$s); |
260 |
|
|
} |
261 |
|
|
$continuous_line = 1; |
262 |
|
|
return shift @nt; |
263 |
|
|
} elsif ($s =~ s/^(>+)//) { |
264 |
|
|
my $depth = length $1; |
265 |
|
|
if ($depth == 2 and $s =~ /^[0-9]/) { |
266 |
|
|
push @nt, {type => CHARACTER_TOKEN, data => "\x0A", |
267 |
|
|
line => $line, column => $column} |
268 |
|
|
if $continuous_line; |
269 |
|
|
$s = '>>' . $s; |
270 |
|
|
$tokenize_text->(\$s); |
271 |
|
|
} else { |
272 |
|
|
push @nt, {type => QUOTATION_START_TOKEN, depth => $depth, |
273 |
|
|
line => $line, column => $column}; |
274 |
|
|
$column += $depth; |
275 |
|
|
$column += $+[0] - $-[0] if $s =~ s/^[\x09\x20]+//; |
276 |
|
|
if ($s =~ s/^\@\@[\x09\x20]*//) { |
277 |
|
|
push @nt, {type => EDITORIAL_NOTE_START_TOKEN, |
278 |
|
|
line => $line, column => $column}; |
279 |
|
|
$column += $+[0] - $-[0]; |
280 |
|
|
} elsif ($s =~ s/^;;[\x09\x20]*//) { |
281 |
|
|
push @nt, {type => COMMENT_PARAGRAPH_START_TOKEN, |
282 |
|
|
line => $line, column => $column}; |
283 |
|
|
$column += $+[0] - $-[0]; |
284 |
|
|
} |
285 |
|
|
$tokenize_text->(\$s); |
286 |
|
|
} |
287 |
|
|
$continuous_line = 1; |
288 |
|
|
return shift @nt; |
289 |
|
|
} elsif ($s =~ /\A\[(INS|DEL)(?>\(([^()\\]*)\))?\[[\x09\x20]*\z/) { |
290 |
|
|
undef $continuous_line; |
291 |
|
|
return {type => BLOCK_START_TAG_TOKEN, tag_name => $1, |
292 |
|
|
classes => $2, |
293 |
|
|
line => $line, column => $column}; |
294 |
|
|
} elsif ($s =~ /\A\[PRE(?>\(([^()\\]*)\))?\[[\x09\x20]*\z/) { |
295 |
|
|
undef $continuous_line; |
296 |
|
|
push @nt, {type => BLOCK_START_TAG_TOKEN, tag_name => 'PRE', |
297 |
|
|
classes => $1, |
298 |
|
|
line => $line, column => $column}; |
299 |
|
|
while (@s) { |
300 |
|
|
my $s = shift @s; |
301 |
|
|
($line, $column) = ($line + 1, 1); |
302 |
|
|
if ($s =~ /\A\]PRE\][\x09\x20]*\z/) { |
303 |
|
|
push @nt, {type => BLOCK_END_TAG_TOKEN, tag_name => 'PRE', |
304 |
|
|
line => $line, column => $column}; |
305 |
|
|
undef $continuous_line; |
306 |
|
|
break; |
307 |
|
|
} else { |
308 |
|
|
push @nt, {type => CHARACTER_TOKEN, data => "\x0A", |
309 |
|
|
line => $line, column => $column} |
310 |
|
|
if $continuous_line; |
311 |
|
|
$tokenize_text->(\$s); |
312 |
|
|
$continuous_line = 1; |
313 |
|
|
} |
314 |
|
|
} |
315 |
|
|
return shift @nt; |
316 |
|
|
} elsif ($s =~ s/^\@\@[\x09\x20]*//) { |
317 |
|
|
push @nt, {type => EDITORIAL_NOTE_START_TOKEN, |
318 |
|
|
line => $line, column => $column}; |
319 |
|
|
$column += $+[0] - $-[0]; |
320 |
|
|
$tokenize_text->(\$s); |
321 |
|
|
$continuous_line = 1; |
322 |
|
|
return shift @nt; |
323 |
|
|
} elsif ($s =~ s/^;;[\x09\x20]*//) { |
324 |
|
|
push @nt, {type => COMMENT_PARAGRAPH_START_TOKEN, |
325 |
|
|
line => $line, column => $column}; |
326 |
|
|
$column += $+[0] - $-[0]; |
327 |
|
|
$tokenize_text->(\$s); |
328 |
|
|
undef $continuous_line; |
329 |
|
|
return shift @nt; |
330 |
|
|
} elsif ($s =~ /\A\](INS|DEL)\][\x09\x20]*\z/) { |
331 |
|
|
$continuous_line = 1; |
332 |
|
|
return {type => BLOCK_END_TAG_TOKEN, tag_name => $1, |
333 |
|
|
line => $line, column => $column}; |
334 |
|
|
} elsif ($s =~ /^,/) { |
335 |
|
|
push @nt, {type => TABLE_ROW_START_TOKEN, |
336 |
|
|
line => $line, column => $column}; |
337 |
|
|
while ($s =~ s/^,[\x09\x20]*//) { |
338 |
|
|
$column += $+[0] - $-[0]; |
339 |
|
|
my $cell; |
340 |
|
|
my $cell_quoted; |
341 |
|
|
my $column_quoted = $column; |
342 |
|
|
my $column_cell = $column; |
343 |
|
|
if ($s =~ s/^"//) { |
344 |
|
|
$s =~ s/^((?>[^"\\]|\\.)*)//; |
345 |
|
|
$cell_quoted = $1; |
346 |
|
|
$column += 1 + length $cell_quoted; |
347 |
|
|
$cell_quoted =~ tr/\\//d; |
348 |
|
|
$column++ if $s =~ s/\A\\\z//; |
349 |
|
|
$column++ if $s =~ s/^"//; |
350 |
|
|
} |
351 |
|
|
if ($s =~ s/^([^,]+)//) { |
352 |
|
|
$cell = $1; |
353 |
|
|
$column += length $cell; |
354 |
|
|
$cell =~ s/[\x09\x20]+\z//; |
355 |
|
|
} |
356 |
|
|
if (not defined $cell_quoted and defined $cell and |
357 |
|
|
$cell eq '==') { |
358 |
|
|
push @nt, {type => TABLE_COLSPAN_CELL_TOKEN, |
359 |
|
|
line => $line, column => $column_cell}; |
360 |
|
|
} else { |
361 |
|
|
push @nt, {type => TABLE_CELL_START_TOKEN, |
362 |
|
|
line => $line, |
363 |
|
|
column => defined $column_quoted ? $column_quoted: $column_cell}; |
364 |
|
|
my $real_column = $column; |
365 |
|
|
$column = $column_quoted + 1; |
366 |
|
|
$tokenize_text->(\$cell_quoted) if defined $cell_quoted; |
367 |
|
|
## NOTE: When a quoted-pair is used, column numbers |
368 |
|
|
## reported in this $tokenize_text call might be wrong. |
369 |
|
|
$column = $column_cell; |
370 |
|
|
$tokenize_text->(\$cell) if defined $cell; |
371 |
|
|
$column = $column_quoted; |
372 |
|
|
push @nt, {type => TABLE_CELL_END_TOKEN, |
373 |
|
|
line => $line, |
374 |
|
|
column => $column}; |
375 |
|
|
} |
376 |
|
|
} |
377 |
|
|
push @nt, {type => TABLE_ROW_END_TOKEN, |
378 |
|
|
line => $line, column => $column}; |
379 |
|
|
undef $continuous_line; |
380 |
|
|
return shift @nt; |
381 |
|
|
} elsif ($s eq '__IMAGE__') { |
382 |
|
|
my $image = $doc->create_element_ns ($NS_SW09, 'image'); |
383 |
|
|
$_->set_user_data (manakai_source_line => $line); |
384 |
|
|
$_->set_user_data (manakai_source_column => 1); |
385 |
|
|
$image->text_content (join "\x0A", $s, @s); |
386 |
|
|
($line, $column) = ($line + @s, 1); |
387 |
|
|
@s = (); |
388 |
|
|
$doc->document_element->append_child ($image); |
389 |
|
|
return {type => END_OF_FILE_TOKEN, |
390 |
|
|
line => $line, column => $column}; |
391 |
|
|
} else { |
392 |
|
|
push @nt, {type => CHARACTER_TOKEN, data => "\x0A", |
393 |
|
|
line => $line, column => $column} if $continuous_line; |
394 |
|
|
$tokenize_text->(\$s); |
395 |
|
|
$continuous_line = 1; |
396 |
|
|
return shift @nt; |
397 |
|
|
} |
398 |
|
|
}; # $get_next_token_body |
399 |
|
|
|
400 |
|
|
## NOTE: The "initial" mode. |
401 |
|
|
if (@s and $s[0] =~ /^#\?/) { |
402 |
|
|
## NOTE: "Parse a magic line". |
403 |
|
|
|
404 |
|
|
my $s = shift @s; |
405 |
|
|
|
406 |
|
|
## TODO:... |
407 |
|
|
|
408 |
|
|
$line = 2; |
409 |
|
|
$column = 0; |
410 |
|
|
} |
411 |
|
|
|
412 |
|
|
## NOTE: Switched to the "body" mode. |
413 |
|
|
$get_next_token->(); |
414 |
|
|
|
415 |
|
|
} # parse_char_string |
416 |
|
|
|
417 |
|
|
1; |