1 |
#!/usr/bin/perl |
2 |
use strict; |
3 |
|
4 |
|
5 |
# Lookup table of named character references: entity name (without the
# leading '&' and trailing ';') => the character it denotes.  Names are
# case-sensitive; a few upper-case aliases (AMP, COPY, GT, LT, QUOT, REG)
# are listed alongside their lower-case forms.  Keys are quoted so that
# names which are also Perl keywords (and, or, not, sub, int, lt, ...)
# read unambiguously.
my $entity_char = {
    'AElig'    => "\x{00C6}",
    'Aacute'   => "\x{00C1}",
    'Acirc'    => "\x{00C2}",
    'Agrave'   => "\x{00C0}",
    'Alpha'    => "\x{0391}",
    'Aring'    => "\x{00C5}",
    'Atilde'   => "\x{00C3}",
    'Auml'     => "\x{00C4}",
    'Beta'     => "\x{0392}",
    'Ccedil'   => "\x{00C7}",
    'Chi'      => "\x{03A7}",
    'Dagger'   => "\x{2021}",
    'Delta'    => "\x{0394}",
    'ETH'      => "\x{00D0}",
    'Eacute'   => "\x{00C9}",
    'Ecirc'    => "\x{00CA}",
    'Egrave'   => "\x{00C8}",
    'Epsilon'  => "\x{0395}",
    'Eta'      => "\x{0397}",
    'Euml'     => "\x{00CB}",
    'Gamma'    => "\x{0393}",
    'Iacute'   => "\x{00CD}",
    'Icirc'    => "\x{00CE}",
    'Igrave'   => "\x{00CC}",
    'Iota'     => "\x{0399}",
    'Iuml'     => "\x{00CF}",
    'Kappa'    => "\x{039A}",
    'Lambda'   => "\x{039B}",
    'Mu'       => "\x{039C}",
    'Ntilde'   => "\x{00D1}",
    'Nu'       => "\x{039D}",
    'OElig'    => "\x{0152}",
    'Oacute'   => "\x{00D3}",
    'Ocirc'    => "\x{00D4}",
    'Ograve'   => "\x{00D2}",
    'Omega'    => "\x{03A9}",
    'Omicron'  => "\x{039F}",
    'Oslash'   => "\x{00D8}",
    'Otilde'   => "\x{00D5}",
    'Ouml'     => "\x{00D6}",
    'Phi'      => "\x{03A6}",
    'Pi'       => "\x{03A0}",
    'Prime'    => "\x{2033}",
    'Psi'      => "\x{03A8}",
    'Rho'      => "\x{03A1}",
    'Scaron'   => "\x{0160}",
    'Sigma'    => "\x{03A3}",
    'THORN'    => "\x{00DE}",
    'Tau'      => "\x{03A4}",
    'Theta'    => "\x{0398}",
    'Uacute'   => "\x{00DA}",
    'Ucirc'    => "\x{00DB}",
    'Ugrave'   => "\x{00D9}",
    'Upsilon'  => "\x{03A5}",
    'Uuml'     => "\x{00DC}",
    'Xi'       => "\x{039E}",
    'Yacute'   => "\x{00DD}",
    'Yuml'     => "\x{0178}",
    'Zeta'     => "\x{0396}",
    'aacute'   => "\x{00E1}",
    'acirc'    => "\x{00E2}",
    'acute'    => "\x{00B4}",
    'aelig'    => "\x{00E6}",
    'agrave'   => "\x{00E0}",
    'alefsym'  => "\x{2135}",
    'alpha'    => "\x{03B1}",
    'amp'      => "\x{0026}",
    'AMP'      => "\x{0026}",
    'and'      => "\x{2227}",
    'ang'      => "\x{2220}",
    'apos'     => "\x{0027}",
    'aring'    => "\x{00E5}",
    'asymp'    => "\x{2248}",
    'atilde'   => "\x{00E3}",
    'auml'     => "\x{00E4}",
    'bdquo'    => "\x{201E}",
    'beta'     => "\x{03B2}",
    'brvbar'   => "\x{00A6}",
    'bull'     => "\x{2022}",
    'cap'      => "\x{2229}",
    'ccedil'   => "\x{00E7}",
    'cedil'    => "\x{00B8}",
    'cent'     => "\x{00A2}",
    'chi'      => "\x{03C7}",
    'circ'     => "\x{02C6}",
    'clubs'    => "\x{2663}",
    'cong'     => "\x{2245}",
    'copy'     => "\x{00A9}",
    'COPY'     => "\x{00A9}",
    'crarr'    => "\x{21B5}",
    'cup'      => "\x{222A}",
    'curren'   => "\x{00A4}",
    'dArr'     => "\x{21D3}",
    'dagger'   => "\x{2020}",
    'darr'     => "\x{2193}",
    'deg'      => "\x{00B0}",
    'delta'    => "\x{03B4}",
    'diams'    => "\x{2666}",
    'divide'   => "\x{00F7}",
    'eacute'   => "\x{00E9}",
    'ecirc'    => "\x{00EA}",
    'egrave'   => "\x{00E8}",
    'empty'    => "\x{2205}",
    'emsp'     => "\x{2003}",
    'ensp'     => "\x{2002}",
    'epsilon'  => "\x{03B5}",
    'equiv'    => "\x{2261}",
    'eta'      => "\x{03B7}",
    'eth'      => "\x{00F0}",
    'euml'     => "\x{00EB}",
    'euro'     => "\x{20AC}",
    'exist'    => "\x{2203}",
    'fnof'     => "\x{0192}",
    'forall'   => "\x{2200}",
    'frac12'   => "\x{00BD}",
    'frac14'   => "\x{00BC}",
    'frac34'   => "\x{00BE}",
    'frasl'    => "\x{2044}",
    'gamma'    => "\x{03B3}",
    'ge'       => "\x{2265}",
    'gt'       => "\x{003E}",
    'GT'       => "\x{003E}",
    'hArr'     => "\x{21D4}",
    'harr'     => "\x{2194}",
    'hearts'   => "\x{2665}",
    'hellip'   => "\x{2026}",
    'iacute'   => "\x{00ED}",
    'icirc'    => "\x{00EE}",
    'iexcl'    => "\x{00A1}",
    'igrave'   => "\x{00EC}",
    'image'    => "\x{2111}",
    'infin'    => "\x{221E}",
    'int'      => "\x{222B}",
    'iota'     => "\x{03B9}",
    'iquest'   => "\x{00BF}",
    'isin'     => "\x{2208}",
    'iuml'     => "\x{00EF}",
    'kappa'    => "\x{03BA}",
    'lArr'     => "\x{21D0}",
    'lambda'   => "\x{03BB}",
    'lang'     => "\x{2329}",
    'laquo'    => "\x{00AB}",
    'larr'     => "\x{2190}",
    'lceil'    => "\x{2308}",
    'ldquo'    => "\x{201C}",
    'le'       => "\x{2264}",
    'lfloor'   => "\x{230A}",
    'lowast'   => "\x{2217}",
    'loz'      => "\x{25CA}",
    'lrm'      => "\x{200E}",
    'lsaquo'   => "\x{2039}",
    'lsquo'    => "\x{2018}",
    'lt'       => "\x{003C}",
    'LT'       => "\x{003C}",
    'macr'     => "\x{00AF}",
    'mdash'    => "\x{2014}",
    'micro'    => "\x{00B5}",
    'middot'   => "\x{00B7}",
    'minus'    => "\x{2212}",
    'mu'       => "\x{03BC}",
    'nabla'    => "\x{2207}",
    'nbsp'     => "\x{00A0}",
    'ndash'    => "\x{2013}",
    'ne'       => "\x{2260}",
    'ni'       => "\x{220B}",
    'not'      => "\x{00AC}",
    'notin'    => "\x{2209}",
    'nsub'     => "\x{2284}",
    'ntilde'   => "\x{00F1}",
    'nu'       => "\x{03BD}",
    'oacute'   => "\x{00F3}",
    'ocirc'    => "\x{00F4}",
    'oelig'    => "\x{0153}",
    'ograve'   => "\x{00F2}",
    'oline'    => "\x{203E}",
    'omega'    => "\x{03C9}",
    'omicron'  => "\x{03BF}",
    'oplus'    => "\x{2295}",
    'or'       => "\x{2228}",
    'ordf'     => "\x{00AA}",
    'ordm'     => "\x{00BA}",
    'oslash'   => "\x{00F8}",
    'otilde'   => "\x{00F5}",
    'otimes'   => "\x{2297}",
    'ouml'     => "\x{00F6}",
    'para'     => "\x{00B6}",
    'part'     => "\x{2202}",
    'permil'   => "\x{2030}",
    'perp'     => "\x{22A5}",
    'phi'      => "\x{03C6}",
    'pi'       => "\x{03C0}",
    'piv'      => "\x{03D6}",
    'plusmn'   => "\x{00B1}",
    'pound'    => "\x{00A3}",
    'prime'    => "\x{2032}",
    'prod'     => "\x{220F}",
    'prop'     => "\x{221D}",
    'psi'      => "\x{03C8}",
    'quot'     => "\x{0022}",
    'QUOT'     => "\x{0022}",
    'rArr'     => "\x{21D2}",
    'radic'    => "\x{221A}",
    'rang'     => "\x{232A}",
    'raquo'    => "\x{00BB}",
    'rarr'     => "\x{2192}",
    'rceil'    => "\x{2309}",
    'rdquo'    => "\x{201D}",
    'real'     => "\x{211C}",
    'reg'      => "\x{00AE}",
    'REG'      => "\x{00AE}",
    'rfloor'   => "\x{230B}",
    'rho'      => "\x{03C1}",
    'rlm'      => "\x{200F}",
    'rsaquo'   => "\x{203A}",
    'rsquo'    => "\x{2019}",
    'sbquo'    => "\x{201A}",
    'scaron'   => "\x{0161}",
    'sdot'     => "\x{22C5}",
    'sect'     => "\x{00A7}",
    'shy'      => "\x{00AD}",
    'sigma'    => "\x{03C3}",
    'sigmaf'   => "\x{03C2}",
    'sim'      => "\x{223C}",
    'spades'   => "\x{2660}",
    'sub'      => "\x{2282}",
    'sube'     => "\x{2286}",
    'sum'      => "\x{2211}",
    'sup'      => "\x{2283}",
    'sup1'     => "\x{00B9}",
    'sup2'     => "\x{00B2}",
    'sup3'     => "\x{00B3}",
    'supe'     => "\x{2287}",
    'szlig'    => "\x{00DF}",
    'tau'      => "\x{03C4}",
    'there4'   => "\x{2234}",
    'theta'    => "\x{03B8}",
    'thetasym' => "\x{03D1}",
    'thinsp'   => "\x{2009}",
    'thorn'    => "\x{00FE}",
    'tilde'    => "\x{02DC}",
    'times'    => "\x{00D7}",
    'trade'    => "\x{2122}",
    'uArr'     => "\x{21D1}",
    'uacute'   => "\x{00FA}",
    'uarr'     => "\x{2191}",
    'ucirc'    => "\x{00FB}",
    'ugrave'   => "\x{00F9}",
    'uml'      => "\x{00A8}",
    'upsih'    => "\x{03D2}",
    'upsilon'  => "\x{03C5}",
    'uuml'     => "\x{00FC}",
    'weierp'   => "\x{2118}",
    'xi'       => "\x{03BE}",
    'yacute'   => "\x{00FD}",
    'yen'      => "\x{00A5}",
    'yuml'     => "\x{00FF}",
    'zeta'     => "\x{03B6}",
    'zwj'      => "\x{200D}",
    'zwnj'     => "\x{200C}",
}; # $entity_char
266 |
|
267 |
# Running tally across all processed documents.  Keys are either a category
# label ('#defined', '#undef', '#num', '#hex', each with an optional trailing
# ';' when the reference was semicolon-terminated) or the literal reference
# text as it appeared (e.g. '&amp;').  Each value is the number of documents
# in which that key occurred at least once.
my $Entity = {};

# First command-line argument.  It is not read anywhere in this file; it is
# a package variable, so presumably the 'foreach.pl' driver consumes it --
# confirm against that script.
our $target = shift;

# Per-document callback.
#
# Arguments:
#   $entity    - hash reference; only $entity->{body} (the text to scan) is
#                read here.
#   $file_name - accepted for interface compatibility, not used.
#
# Scans the body for anything shaped like a character reference
# ('&' or '&#', then name characters, then an optional ';'), classifies each
# occurrence, and bumps $Entity once per distinct key found in this document.
# The return value is not meaningful.
our $code = sub {
    my ($entity, $file_name) = @_;

    # Keys seen in this document; a hash so repeats count only once.
    my %seen;

    # Start scanning from the beginning regardless of any earlier match;
    # /c keeps pos() intact when the final match attempt fails.
    pos($entity->{body}) = 0;
    while ($entity->{body} =~ /(&#?[A-Za-z0-9_.:-]+;?)/gc) {
        my $ref = $1;

        # The terminated and unterminated forms are tallied separately.
        my $suffix = $ref =~ /;\z/ ? ';' : '';

        my $class;
        if ($ref =~ /\A&#/) {
            # Numeric reference.  Both '&#x' and '&#X' introduce the
            # hexadecimal form; matching only lower-case 'x' misfiled
            # '&#X41;' as decimal.
            $class = $ref =~ /\A&#[xX]/ ? '#hex' : '#num';
        }
        elsif ($ref =~ /\A&([A-Za-z0-9]+);?\z/ and exists $entity_char->{$1}) {
            # Purely alphanumeric name that is present in the lookup table.
            $class = '#defined';
        }
        else {
            # Unknown name, or a name containing '_', '.', ':' or '-'.
            $class = '#undef';
        }

        $seen{$class . $suffix} = 1;
        $seen{$ref}             = 1;
    }

    $Entity->{$_}++ for keys %seen;
};
299 |
|
300 |
# Hand control to the shared driver script.  It is loaded after $target and
# $code are set up and before the tally is printed, so it presumably invokes
# $code for each document under $target -- not visible here; confirm in
# foreach.pl.
require 'foreach.pl';

# Report: one "key<TAB>count" line per tally key, in plain string order.
foreach my $key (sort keys %$Entity) {
    my $count = $Entity->{$key};
    print $key, "\t", $count, "\n";
}
305 |
|
306 |
=head1 AUTHOR

Wakaba <w@suika.fam.cx>.

=head1 LICENSE

Copyright 2007 Wakaba <w@suika.fam.cx>

This library is free software; you can redistribute it
and/or modify it under the same terms as Perl itself.

=cut
318 |
|
319 |
1; |
320 |
## $Date: 2007/06/02 12:12:28 $ |
321 |
|