| 1 |
#!/usr/bin/perl |
| 2 |
use strict; |
| 3 |
|
| 4 |
|
| 5 |
# Lookup table: named character reference (without the leading "&" and
# trailing ";") => the character it stands for, written as a \x{...} escape.
#
# Keys are case-sensitive.  A handful of names are present in both lower
# and upper case (amp/AMP, copy/COPY, gt/GT, lt/LT, quot/QUOT, reg/REG) and
# map to the same character.
#
# NOTE(review): the names look like the HTML 4 entity set plus "apos" and
# the upper-case aliases above -- confirm against the intended specification
# before adding or removing entries.
my $entity_char = {
  AElig    => "\x{00C6}",
  Aacute   => "\x{00C1}",
  Acirc    => "\x{00C2}",
  Agrave   => "\x{00C0}",
  Alpha    => "\x{0391}",
  Aring    => "\x{00C5}",
  Atilde   => "\x{00C3}",
  Auml     => "\x{00C4}",
  Beta     => "\x{0392}",
  Ccedil   => "\x{00C7}",
  Chi      => "\x{03A7}",
  Dagger   => "\x{2021}",
  Delta    => "\x{0394}",
  ETH      => "\x{00D0}",
  Eacute   => "\x{00C9}",
  Ecirc    => "\x{00CA}",
  Egrave   => "\x{00C8}",
  Epsilon  => "\x{0395}",
  Eta      => "\x{0397}",
  Euml     => "\x{00CB}",
  Gamma    => "\x{0393}",
  Iacute   => "\x{00CD}",
  Icirc    => "\x{00CE}",
  Igrave   => "\x{00CC}",
  Iota     => "\x{0399}",
  Iuml     => "\x{00CF}",
  Kappa    => "\x{039A}",
  Lambda   => "\x{039B}",
  Mu       => "\x{039C}",
  Ntilde   => "\x{00D1}",
  Nu       => "\x{039D}",
  OElig    => "\x{0152}",
  Oacute   => "\x{00D3}",
  Ocirc    => "\x{00D4}",
  Ograve   => "\x{00D2}",
  Omega    => "\x{03A9}",
  Omicron  => "\x{039F}",
  Oslash   => "\x{00D8}",
  Otilde   => "\x{00D5}",
  Ouml     => "\x{00D6}",
  Phi      => "\x{03A6}",
  Pi       => "\x{03A0}",
  Prime    => "\x{2033}",
  Psi      => "\x{03A8}",
  Rho      => "\x{03A1}",
  Scaron   => "\x{0160}",
  Sigma    => "\x{03A3}",
  THORN    => "\x{00DE}",
  Tau      => "\x{03A4}",
  Theta    => "\x{0398}",
  Uacute   => "\x{00DA}",
  Ucirc    => "\x{00DB}",
  Ugrave   => "\x{00D9}",
  Upsilon  => "\x{03A5}",
  Uuml     => "\x{00DC}",
  Xi       => "\x{039E}",
  Yacute   => "\x{00DD}",
  Yuml     => "\x{0178}",
  Zeta     => "\x{0396}",
  aacute   => "\x{00E1}",
  acirc    => "\x{00E2}",
  acute    => "\x{00B4}",
  aelig    => "\x{00E6}",
  agrave   => "\x{00E0}",
  alefsym  => "\x{2135}",
  alpha    => "\x{03B1}",
  amp      => "\x{0026}",
  AMP      => "\x{0026}",    # upper-case alias of amp
  and      => "\x{2227}",
  ang      => "\x{2220}",
  apos     => "\x{0027}",
  aring    => "\x{00E5}",
  asymp    => "\x{2248}",
  atilde   => "\x{00E3}",
  auml     => "\x{00E4}",
  bdquo    => "\x{201E}",
  beta     => "\x{03B2}",
  brvbar   => "\x{00A6}",
  bull     => "\x{2022}",
  cap      => "\x{2229}",
  ccedil   => "\x{00E7}",
  cedil    => "\x{00B8}",
  cent     => "\x{00A2}",
  chi      => "\x{03C7}",
  circ     => "\x{02C6}",
  clubs    => "\x{2663}",
  cong     => "\x{2245}",
  copy     => "\x{00A9}",
  COPY     => "\x{00A9}",    # upper-case alias of copy
  crarr    => "\x{21B5}",
  cup      => "\x{222A}",
  curren   => "\x{00A4}",
  dArr     => "\x{21D3}",
  dagger   => "\x{2020}",
  darr     => "\x{2193}",
  deg      => "\x{00B0}",
  delta    => "\x{03B4}",
  diams    => "\x{2666}",
  divide   => "\x{00F7}",
  eacute   => "\x{00E9}",
  ecirc    => "\x{00EA}",
  egrave   => "\x{00E8}",
  empty    => "\x{2205}",
  emsp     => "\x{2003}",
  ensp     => "\x{2002}",
  epsilon  => "\x{03B5}",
  equiv    => "\x{2261}",
  eta      => "\x{03B7}",
  eth      => "\x{00F0}",
  euml     => "\x{00EB}",
  euro     => "\x{20AC}",
  exist    => "\x{2203}",
  fnof     => "\x{0192}",
  forall   => "\x{2200}",
  frac12   => "\x{00BD}",
  frac14   => "\x{00BC}",
  frac34   => "\x{00BE}",
  frasl    => "\x{2044}",
  gamma    => "\x{03B3}",
  ge       => "\x{2265}",
  gt       => "\x{003E}",
  GT       => "\x{003E}",    # upper-case alias of gt
  hArr     => "\x{21D4}",
  harr     => "\x{2194}",
  hearts   => "\x{2665}",
  hellip   => "\x{2026}",
  iacute   => "\x{00ED}",
  icirc    => "\x{00EE}",
  iexcl    => "\x{00A1}",
  igrave   => "\x{00EC}",
  image    => "\x{2111}",
  infin    => "\x{221E}",
  int      => "\x{222B}",
  iota     => "\x{03B9}",
  iquest   => "\x{00BF}",
  isin     => "\x{2208}",
  iuml     => "\x{00EF}",
  kappa    => "\x{03BA}",
  lArr     => "\x{21D0}",
  lambda   => "\x{03BB}",
  lang     => "\x{2329}",
  laquo    => "\x{00AB}",
  larr     => "\x{2190}",
  lceil    => "\x{2308}",
  ldquo    => "\x{201C}",
  le       => "\x{2264}",
  lfloor   => "\x{230A}",
  lowast   => "\x{2217}",
  loz      => "\x{25CA}",
  lrm      => "\x{200E}",
  lsaquo   => "\x{2039}",
  lsquo    => "\x{2018}",
  lt       => "\x{003C}",
  LT       => "\x{003C}",    # upper-case alias of lt
  macr     => "\x{00AF}",
  mdash    => "\x{2014}",
  micro    => "\x{00B5}",
  middot   => "\x{00B7}",
  minus    => "\x{2212}",
  mu       => "\x{03BC}",
  nabla    => "\x{2207}",
  nbsp     => "\x{00A0}",
  ndash    => "\x{2013}",
  ne       => "\x{2260}",
  ni       => "\x{220B}",
  not      => "\x{00AC}",
  notin    => "\x{2209}",
  nsub     => "\x{2284}",
  ntilde   => "\x{00F1}",
  nu       => "\x{03BD}",
  oacute   => "\x{00F3}",
  ocirc    => "\x{00F4}",
  oelig    => "\x{0153}",
  ograve   => "\x{00F2}",
  oline    => "\x{203E}",
  omega    => "\x{03C9}",
  omicron  => "\x{03BF}",
  oplus    => "\x{2295}",
  or       => "\x{2228}",
  ordf     => "\x{00AA}",
  ordm     => "\x{00BA}",
  oslash   => "\x{00F8}",
  otilde   => "\x{00F5}",
  otimes   => "\x{2297}",
  ouml     => "\x{00F6}",
  para     => "\x{00B6}",
  part     => "\x{2202}",
  permil   => "\x{2030}",
  perp     => "\x{22A5}",
  phi      => "\x{03C6}",
  pi       => "\x{03C0}",
  piv      => "\x{03D6}",
  plusmn   => "\x{00B1}",
  pound    => "\x{00A3}",
  prime    => "\x{2032}",
  prod     => "\x{220F}",
  prop     => "\x{221D}",
  psi      => "\x{03C8}",
  quot     => "\x{0022}",
  QUOT     => "\x{0022}",    # upper-case alias of quot
  rArr     => "\x{21D2}",
  radic    => "\x{221A}",
  rang     => "\x{232A}",
  raquo    => "\x{00BB}",
  rarr     => "\x{2192}",
  rceil    => "\x{2309}",
  rdquo    => "\x{201D}",
  real     => "\x{211C}",
  reg      => "\x{00AE}",
  REG      => "\x{00AE}",    # upper-case alias of reg
  rfloor   => "\x{230B}",
  rho      => "\x{03C1}",
  rlm      => "\x{200F}",
  rsaquo   => "\x{203A}",
  rsquo    => "\x{2019}",
  sbquo    => "\x{201A}",
  scaron   => "\x{0161}",
  sdot     => "\x{22C5}",
  sect     => "\x{00A7}",
  shy      => "\x{00AD}",
  sigma    => "\x{03C3}",
  sigmaf   => "\x{03C2}",
  sim      => "\x{223C}",
  spades   => "\x{2660}",
  sub      => "\x{2282}",
  sube     => "\x{2286}",
  sum      => "\x{2211}",
  sup      => "\x{2283}",
  sup1     => "\x{00B9}",
  sup2     => "\x{00B2}",
  sup3     => "\x{00B3}",
  supe     => "\x{2287}",
  szlig    => "\x{00DF}",
  tau      => "\x{03C4}",
  there4   => "\x{2234}",
  theta    => "\x{03B8}",
  thetasym => "\x{03D1}",
  thinsp   => "\x{2009}",
  thorn    => "\x{00FE}",
  tilde    => "\x{02DC}",
  times    => "\x{00D7}",
  trade    => "\x{2122}",
  uArr     => "\x{21D1}",
  uacute   => "\x{00FA}",
  uarr     => "\x{2191}",
  ucirc    => "\x{00FB}",
  ugrave   => "\x{00F9}",
  uml      => "\x{00A8}",
  upsih    => "\x{03D2}",
  upsilon  => "\x{03C5}",
  uuml     => "\x{00FC}",
  weierp   => "\x{2118}",
  xi       => "\x{03BE}",
  yacute   => "\x{00FD}",
  yen      => "\x{00A5}",
  yuml     => "\x{00FF}",
  zeta     => "\x{03B6}",
  zwj      => "\x{200D}",
  zwnj     => "\x{200C}",
}; # $entity_char
| 266 |
|
| 267 |
# Running tally filled in by the $code callback and printed at the end of
# the script: key => number of callback invocations in which it was seen.
my $Entity = {};

# First command-line argument, taken off @ARGV.  It is not used anywhere in
# this file; presumably foreach.pl reads it as a package variable -- confirm.
our $target = shift @ARGV;
| 270 |
# Callback that tallies the character references found in one entity.
# Presumably invoked by foreach.pl once per entity -- the caller is not
# visible in this file; confirm there.
#
# Arguments:
#   $entity - hash reference; only $entity->{body} (the text to scan) is
#             read.  Any further arguments are accepted and ignored.
#
# Effect: every distinct key seen in this body adds one to $Entity->{key},
# so the totals count entities containing a reference, not occurrences.
# Keys recorded are:
#   - the literal reference text as matched (e.g. "&amp;", "&#160", "&foo;")
#   - one class key per reference, with a trailing ";" when the reference
#     itself ended in ";":
#       #hex / #hex;          starts with "&#x" or "&#X"
#       #num / #num;          any other reference starting with "&#"
#       #defined / #defined;  purely alphanumeric name found in $entity_char
#       #undef / #undef;      everything else
#     Class keys start with "#" and literal references with "&", so the two
#     kinds of key cannot collide.
our $code = sub {
  my ($entity) = @_;

  my %seen;

  # The /gc loop continues from the string's current pos(), so rewind first
  # to make sure the whole body is scanned.
  pos($entity->{body}) = 0;
  while ($entity->{body} =~ /(&#?[A-Za-z0-9_.:-]+;?)/gc) {
    my $ref = $1;
    my $terminator = $ref =~ /;\z/ ? ';' : '';

    my $class;
    if ($ref =~ /\A&#/) {
      # The hexadecimal marker may be written in either case ("&#x41;" or
      # "&#X41;"); a case-sensitive test would file the latter under #num.
      $class = $ref =~ /\A&#[xX]/ ? '#hex' : '#num';
    } elsif ($ref =~ /\A&([A-Za-z0-9]+);?\z/ and exists $entity_char->{$1}) {
      $class = '#defined';
    } else {
      # Unknown name, or a name containing "_", ".", ":" or "-".
      $class = '#undef';
    }

    $seen{$class . $terminator} = 1;
    $seen{$ref} = 1;
  }

  # Count each key at most once per call.
  $Entity->{$_}++ for keys %seen;
};
| 299 |
|
| 300 |
# Load and run the external driver, located through @INC.
# NOTE(review): presumably it walks the input named by $target and calls
# $code for each entity -- foreach.pl is not visible here; confirm.
# NOTE(review): Perl 5.26 and later no longer search the current directory
# by default, so foreach.pl must be reachable through @INC (for example via
# -I or PERL5LIB) -- verify how this script is launched.
require 'foreach.pl';

# Report the tally, one "key<TAB>count" line per key, in string order.
foreach my $key (sort keys %$Entity) {
  print $key, "\t", $Entity->{$key}, "\n";
}
| 305 |
|
| 306 |
=head1 AUTHOR |
| 307 |
|
| 308 |
Wakaba <w@suika.fam.cx>. |
| 309 |
|
| 310 |
=head1 LICENSE |
| 311 |
|
| 312 |
Copyright 2007 Wakaba <w@suika.fam.cx> |
| 313 |
|
| 314 |
This library is free software; you can redistribute it |
| 315 |
and/or modify it under the same terms as Perl itself. |
| 316 |
|
| 317 |
=cut |
| 318 |
|
| 319 |
1; |
| 320 |
## $Date: 2007/06/02 12:12:28 $ |
| 321 |
|