#!/usr/bin/perl
use strict;

## Crawl state files.  Each one is written with Data::Dumper and read back
## with |do FILE| by the code below.
my $waiting_file_name = 'data/waiting.dat';
my $done_file_name = 'data/done.dat';
my $error_file_name = 'data/error.dat';
## File that is opened and |flock|ed around every read/write of the state
## files.
my $lock_file_name = 'data/lock';
## Where wget stores the fetched entity before it is inspected.
my $temp_file_name = 'data/temp';
## Root of the archive: interesting entities are moved to
## |<data_dir>/<int (num / 1000)>/<num % 1000>|.
my $data_dir_name = 'data/';
## File holding the most recently allocated archive number.
my $data_num_file_name = 'data/next';

## Command line used to fetch a URI; |-O FILE URI| is appended at the call
## site.  NOTE(review): per the wget manual -U is the User-Agent, -t the
## retry count and -Q a download quota; -s is presumably the old short form
## of --save-headers (the parser below expects an HTTP head at the top of
## the saved file) -- confirm against the installed wget.
my $UA = q[get.pl];
my @WGET = (qw/wget/, '-s', '-U' => $UA, '-t' => 5, '-Q' => 10_000_000);

## Machine-specific library locations for Whatpm and manakai.
use lib qw[/home/httpd/html/www/markup/html/whatpm/
  /home/wakaba/work/manakai/lib
];

## The import list names the encodings Encode::Guess may suspect when an
## entity does not declare its charset.
use Encode::Guess qw/euc-jp shiftjis 7bit-jis utf8/;
use Whatpm::HTML;
use Message::URI::URIReference;

## In-memory crawl state, used as hash references keyed by URI:
## $waiting - URIs still to be fetched;
## $done    - URIs fetched successfully;
## $error   - URIs that are in progress or whose fetch failed.
my $waiting;
my $done;
my $error;

## The URI chosen for this run.
my $uri;
use Fcntl ':flock';
use Data::Dumper;
## Phase 1: while holding an exclusive lock on the lock file, load the
## crawl state, choose the URI to fetch in this run and save the state
## again with that URI provisionally filed under "error" (it is moved to
## "done" at the end of the script when the fetch succeeds).  The lock is
## released when $lock_file goes out of scope at the end of this block.
{
  open my $lock_file, '>', $lock_file_name or die "$0: $lock_file_name: $!";
  flock $lock_file, LOCK_EX or die "$0: $lock_file_name: $!";

  ## |do FILE| looks a path up in @INC unless it starts with "/", "./" or
  ## "../", and "." is not in @INC since Perl 5.26.  Anchor relative
  ## names to the current directory so that the state files are found on
  ## every Perl version.  A missing or unreadable file yields undef.
  my $load = sub {
    my $name = shift;
    $name = './' . $name unless $name =~ m{^\.{0,2}/};
    return do $name;
  };

  my $waiting_list = $load->($waiting_file_name) || [];
  $waiting = {map { $_ => 1 } @$waiting_list};
  $done = $load->($done_file_name) || {};
  $error = $load->($error_file_name) || {};

  U: {
    ## An empty list yields undef here (index 0 of an empty array).
    my $next_uri = $waiting_list->[rand @$waiting_list];
    ## About one run in 200, or whenever nothing is waiting, a fresh seed
    ## URI is generated instead of using the queue.
    if (defined $next_uri and rand (1) >= 0.005) {
      if ($done->{$next_uri}) {
        ## Already fetched: it has no business in the waiting set.
        delete $waiting->{$next_uri};
        redo U;
      }
      ## Mark as in progress; a later successful fetch clears this.
      $error->{$next_uri} = 1;
      delete $waiting->{$next_uri};
    } else {
      #$next_uri = q<http://rd.yahoo.co.jp/blog/random01/?http://blogs.yahoo.co.jp/FRONT/randomblog.html>;
      ## Seed: a Google search for three random kana.
      my @kana = qw(あ い う え お か き く け こ さ し す せ そ た ち つ て と な に ぬ ね の は ひ ふ へ ほ ま み む め も や ゆ よ ら り る れ ろ わ を ん が ぎ ぐ げ ご ざ じ ず ぜ ぞ だ ぢ づ で ど ば び ぶ べ ぼ ぱ ぴ ぷ ぺ ぽ ゃ ゅ ょ っ ぁ ぃ ぅ ぇ ぉ ア イ ウ エ オ カ キ ク ケ コ サ シ ス セ ソ タ チ ツ テ ト ナ ニ ヌ ネ ノ ハ ヒ フ ヘ ホ マ ミ ム メ モ ヤ ユ ヨ ラ リ ル レ ロ ワ ヲ ン ガ ギ グ ゲ ゴ ザ ジ ズ ゼ ゾ ダ ヂ ヅ デ ド バ ビ ブ ベ ボ パ ピ プ ペ ポ ャ ュ ョ ッ ァ ィ ゥ ェ ォ);
      my $word = $kana[rand @kana].$kana[rand @kana].$kana[rand @kana];
      $next_uri = qq<http://www.google.co.jp/search?q=$word&btnG=%E6%A4%9C%E7%B4%A2&hl=ja>;
      ## The script does not |use utf8|, so the kana above are byte
      ## strings and every non-ASCII byte is percent-encoded here.
      $next_uri =~ s/([\x80-\xFF])/sprintf '%%%02X', ord $1/ge;
    }
    $uri = $next_uri;
  } # U

  ## Save the updated state.  |close| is checked because buffered write
  ## errors (e.g. a full disk) only surface there.
  for my $state ([$waiting_file_name => [keys %$waiting]],
                 [$done_file_name => $done],
                 [$error_file_name => $error]) {
    my ($name, $data) = @$state;
    open my $file, '>', $name or die "$0: $name: $!";
    print $file Dumper ($data);
    close $file or die "$0: $name: $!";
  }
}

## Phase 2: fetch <$uri> into the temporary file with wget.
##
## A temporary file left behind by an earlier run is removed first, so that
## it cannot be mistaken for this URI's entity when wget cannot be started.
## A failing unlink (typically "no such file") is deliberately ignored.
unlink $temp_file_name;

print STDERR qq[<$uri> -> "$temp_file_name"\n];

## List-form |system|: no shell is involved, so $uri needs no quoting.
## A failed fetch is only reported; the code that follows treats a missing
## or empty temporary file as an error for this URI.
my $wget_status = system @WGET, -O => $temp_file_name, $uri;
if ($wget_status == -1) {
  print STDERR qq[$0: $WGET[0]: $!\n];
} elsif ($wget_status != 0) {
  print STDERR qq[$0: $WGET[0]: wait status $wget_status\n];
}

## Phase 3: inspect the fetched entity.
##
## The temporary file is expected to hold the HTTP response head followed
## by the body.  Unusual Content-Type values and charset labels are
## recorded in %tag; HTML(-looking) bodies are decoded and passed to
## |tokenize|, which adds markup-related entries to %tag and newly found
## links to $waiting.  If anything was recorded in %tag the entity is moved
## into the numbered archive and a line is appended to the index file of
## its directory.  $no_error is set when a non-empty entity was processed.
my %tag;
my $no_error;
if (-s $temp_file_name) {
  my $entity = {};

  ## Very simplified version of HTTP header processing
  open my $file, '<', $temp_file_name or die "$0: $temp_file_name: $!";
  my $line = <$file>;
  if (defined $line and $line =~ m!^(\S+)\s+(\S+)\s+(.*)!) {
    $entity->{protocol_version} = $1;
    $entity->{status_code} = $2;
    $entity->{status_phrase} = $3;
    $line = <$file>;
  }

  ## Header fields run up to the first blank line.  The head may be
  ## CRLF-terminated, so a line consisting of a lone CR is blank as well;
  ## otherwise the body would be consumed as if it were header fields.
  while (defined $line and $line !~ /^\x0D?$/) {
    $line =~ tr/\x0D\x0A//d;
    if ($line =~ s/^([^:]+)://) {
      ## Values keep any whitespace that followed the colon.
      push @{$entity->{field}->{lc $1} ||= []}, $line;
    }
    $line = <$file>;
  }

  ## Skip the blank separator line itself.
  $line = <$file> if defined $line and $line =~ /^\x0D?$/;

  $entity->{body} = '';
  while (defined $line) {
    $entity->{body} .= $line;
    $line = <$file>;
  }

  my $ct = 'text/html';
  if ($entity->{field}->{'content-type'}) {
    $ct = $entity->{field}->{'content-type'}->[0];
  } else {
    $tag{content_type_none} = 1;
  }

  ## Record the Content-Type unless it matches one of the well-known
  ## forms below.
  CT: {
    for my $ct_pattern (
      qr[application/(?>atom\+xml|download|mathml\+xml|ms\.(?>excel|powerpoint)|msword|
                      octet-stream|pdf|rdf\+xml|x-download|xml|zip)]ix,
      qr[application/(?>atom\+xml|rdf\+xml|xml)\s*;\s*charset="?[0-9A-Za-z_.+-]+"?]ix,
      qr[audio/(?>mpeg)]i,
      qr[image/(?>bmp|gif|jpeg|png|svg\+xml|x-icon)]i,
      qr[text/(?>css|html|javascript|plain|x[ms]l)]i,
      qr[text/(?>css|html|javascript|plain|x[ms]l)\s*;\s*charset="?[0-9A-Za-z_.+-]+"?]i,
      qr[video/(?>mpeg|quicktime)]i,
    ) {
      if ($ct =~ /^\s*$ct_pattern\s*$/) {
        last CT;
      }
    }
    $tag{'content_type:' . $ct} = 1;
  } # CT

  ## Tokenize anything labelled as HTML, and XML that mentions "html" and
  ## does not look like a feed.
  if ($ct =~ /html/i or
      ($ct =~ /xml/i and $entity->{body} =~ /html/ and
       $entity->{body} !~ /<(?>rss|RDF|rdf|feed)/)) {
    ## Charset label, in order of preference: Content-Type parameter,
    ## something that looks like an XML declaration's |encoding|, then the
    ## first |charset=| found anywhere in the body.
    my $charset;
    if ($ct =~ /\bcharset\s*=\s*(\S+)/i) {
      $charset = $1;
      $charset =~ tr/"//d;
    } elsif ($entity->{body} =~ /^[^>e]+encoding\s*=\s*"([^\s"]+)"/) {
      $charset = $1;
    } elsif ($entity->{body} =~ /\bcharset\s*=\s*([^\s"]+)/i) {
      $charset = $1;
    }

    ## Labels that are too common to be worth recording.
    my %known_charset = map { $_ => 1 } (
      'euc-jp',
      'iso-2022-jp',
      'iso-8859-1',
      'none',
      'shift_jis',
      'us-ascii',
      'utf-8',
      'utf-16',
      'utf-16be',
      'utf-16le',
      'windows-1252',
      'windows-31j',
      'x-euc-jp',
    );
    if (defined $charset and not $known_charset{lc $charset}) {
      $tag{'charset_unknown:' . lc $charset} = 1;
    }

    ## Normalize common aliases; "0", "none" and "unknown" mean that the
    ## encoding has to be guessed.
    if (defined $charset) {
      if ($charset =~ /shift|sjis|ms932|cp9[34]2|windows-?31j/i) {
        $charset = 'shift_jis';
      } elsif ($charset =~ /euc|ujis/i) {
        $charset = 'euc-jp';
      } elsif ($charset eq '0' or $charset =~ /none|unknown/i) {
        undef $charset;
      }
    }

    ## |guess| returns an error string rather than an object when it
    ## cannot decide, and |find_encoding| returns undef for an unknown
    ## label; both cases fall back to Shift_JIS.
    my $decoder = $charset
        ? Encode::find_encoding ($charset)
        : Encode::Guess->guess ($entity->{body});
    unless (ref $decoder) {
      $decoder = Encode::find_encoding ('shiftjis');
    }
    ## Do not believe UTF-16 unless the body starts with a BOM or with a
    ## UTF-16 encoded "<".
    if (ref $decoder and
        $decoder->name =~ /utf-16/i and
        $entity->{body} !~ /^(?>\xFE\xFF|\xFF\xFE|\x00<|<\x00)/) {
      $decoder = Encode::find_encoding ('shiftjis');
    }
    my $data = $decoder->decode ($entity->{body}, 0);
    tokenize ($data);
  }

  if (keys %tag) {
    ## Allocate the next free archive number, skipping numbers whose data
    ## file already exists.
    my $index_file_name;
    my $data_file_name;
    open my $file, '<', $data_num_file_name or die "$0: $data_num_file_name: $!";
    my $num = <$file> + 1;
    N: {
      unless (-d $data_dir_name . int ($num / 1000)) {
        mkdir $data_dir_name . int ($num / 1000);
      }
      $data_file_name = $data_dir_name . int ($num / 1000) . '/' . ($num % 1000);
      $index_file_name = $data_dir_name . int ($num / 1000) . '/index.dat';
      if (-f $data_file_name) {
        $num++;
        redo N;
      }
    } # N
    open $file, '>', $data_num_file_name or die "$0: $data_num_file_name: $!";
    print $file $num, "\n";

    print STDERR qq["$temp_file_name" -> "$data_file_name"\n];
    rename $temp_file_name => $data_file_name
        or die "$0: $temp_file_name -> $data_file_name: $!";
    ## Index line: number, URI, fetch time, ";"-separated tags.
    open my $index_file, '>>', $index_file_name
        or die "$0: $index_file_name: $!";
    print $index_file $num, "\t", $uri, "\t", scalar time, "\t",
        join (";", keys %tag), "\n";
  }

  $no_error = 1;
}

## Phase 4: file the URI under its final status and write the crawl state
## back while holding the exclusive lock.  The state written is the copy
## held in memory by this process, including the links that |tokenize| has
## added to $waiting.
if ($no_error) {
  ## Fetched and processed: no longer in progress, no longer waiting.
  delete $_->{$uri} for $error, $waiting;
  $done->{$uri} = 1;
} else {
  ## Nothing usable was fetched.
  delete $done->{$uri};
  $error->{$uri} = 1;
}
{
  open my $lock_file, '>', $lock_file_name or die "$0: $lock_file_name: $!";
  flock $lock_file, LOCK_EX;

  ## Each handle is closed, and thereby flushed, at the end of its
  ## iteration.
  for my $state (
    [$waiting_file_name => [keys %$waiting]],
    [$done_file_name => $done],
    [$error_file_name => $error],
  ) {
    my ($name, $data) = @$state;
    open my $out, '>', $name or die "$0: $name: $!";
    print $out Dumper ($data);
  }
}

## Table of elements and attributes that are too common to be worth
## logging, consulted by |tokenize|.
##
## The table is filled in a BEGIN block because |tokenize| is called by
## code that runs before execution reaches this point of the file; an
## ordinary assignment here would come too late.
##
## $tag_info->{element}->{attribute} is interpreted as follows:
##   false or missing - log the attribute name ("attr:element@attribute");
##   'tag'            - log every value ("value:element@attribute@value");
##   a qr// object    - log the values that match the pattern (most
##                      patterns are negative look-aheads, i.e. they match
##                      everything but the listed well-known values);
##   any other true value (1 or 2) - log nothing; |tokenize| does not
##                      distinguish 1 from 2.
## The pseudo-attribute '>' applies to the element itself: 'tag' logs
## every occurrence of the element ("tag_name:element").  Elements that
## are missing from the table are always logged.
my $tag_info;
## Class on which |create_uri_reference| is invoked as a class method.
## NOTE(review): the package is not loaded explicitly in this file; it is
## presumably provided by Message::URI::URIReference -- confirm.
my $dom;
BEGIN {
  my $target = qr/(?>content|main|replace)/i;

  $tag_info
    = {
       a => {
             accesskey => 1,
             charset => 0, class => 1,
             directkey => 0,
             href => 1, hreflang => 1,
             id => 1, ijam => 0, media => 0, name => 1,
             onclick => 1, onfocus => 1, onkeypress => 1,
             onmouseout => 1, onmouseover => 1,
             rel => 'tag', rev => 'tag',
             shape => 'tag', style => 1,
             tabindex => 'tag', target => $target, title => 1, type => 1},
       abbr => {'>' => 'tag', title => 1},
       acronym => {'>' => 'tag', title => 1},
       address => {class => 1, id => 1, style => 1},
       applet => {align => 1, name => 1},
       area => {alt => 1, charset => 0, coords => 1,
                href => 1, hreflang => 1, media => 0, name => 1,
                rel => 'tag', rev => 'tag', shape => qr/^(?!rect|circle)/i,
                target => $target, title => 1, type => 0},
       b => {class => 1, id => 1, style => 1, title => 1},
       base => {href => 1, target => $target},
       basefont => {color => 1, face => 1, size => 1},
       bgsound => {loop => 1},
       big => {},
       blockquote => {cite => 1, title => 1},
       body => {alink => 1,
                background => 1, bgcolor => 1, bgproperties => 0,
                bottommargin => 1, class => 1,
                id => 1,
                leftmargin => 1, link => 1,
                marginheight => 1, marginwidth => 1,
                onload => 1, onresize => 1, onunload => 1,
                rightmargin => 1, scroll => 0, style => 1,
                text => 1, topmargin => 1, vlink => 1},
       br => {class => 1, clear => 1, id => 1, onclick => 1, style => 1},
       button => {class => 1, id => 1, name => 2, title => 1, type => 2},
       caption => {align => 2, class => 1},
       center => {style => 1},
       code => {},
       col => {align => 2, span => 1, valign => 2},
       colgroup => {align => 2, span => 1, valign => 2},
       dd => {class => 1, id => 1, style => 1},
       del => {datetime => 0},
       div => {align => 2, class => 1, id => 1, lang => 1, mode => 0,
               style => 1, title => 1,
              },
       dl => {class => 1, id => 1},
       dt => {class => 1, id => 1, style => 1},
       em => {class => 1},
       embed => {align => 2, allowscriptaccess => 1, autostart => 2,
                 bgcolor => 1, height => 1, loop => 2, name => 2,
                 pluginspage => 1, quality => 2,
                 showcontrols => 2, src => 1, type => 2,
                 width => 1, wmode => 1,
                },
       fieldset => {class => 1},
       font => {class => 1, color => 1, face => 2,
                lang => 1, size => 1, style => 1,
                title => 1,
               },
       form => {accept => 'tag', 'accept-charset' => 'tag', action => 1,
                autocomplete => 'tag', class => 1,
                enctype => qr[^(?!application/x-www-form-urlencoded|multipart/form-data)]i,
                id => 1,
                method => qr[^(?!get|post)]i, name => 2,
                onsubmit => 1,
                style => 1, target => $target,
               },
       frame => {name => 2, noresize => 1, scrolling => 2, src => 1},
       frameset => {
                    border => 1, cols => 1, framespacing => 1, rows => 1},
       h1 => {align => 2, class => 1, id => 1, style => 1},
       h2 => {align => 2, class => 1, id => 1, style => 1},
       h3 => {align => 2, class => 1, id => 1, style => 1},
       h4 => {align => 2, class => 1, id => 1, style => 1},
       h5 => {'>' => 'tag', align => 2, class => 1,},
       h6 => {'>' => 'tag', align => 2, class => 1},
       head => {profile => 'tag'},
       hr => {align => 2, class => 1, color => 1,
              noshade => 1, size => 1, style => 1, width => 1},
       html => {dir => 1, lang => 1, id => 1, version => 'tag',
                'xml:lang' => 1, xmlns => qr[^(?!http://www.w3.org/1999/xhtml)]},
       i => {class => 1, style => 1},
       iframe => {align => 2, allowtransparency => 1, border => 1,
                  frameborder => 1, height => 1, hspace => 1,
                  id => 1,
                  marginheight => 1, marginwidth => 1, name => 2,
                  onload => 1,
                  scrolling => 2, src => 1, style => 1,
                  title => 1, vspace => 1, width => 1},
       ilayer => {name => 2, visibility => 2},
       img => {align => 2, alt => 1, border => 1,
               class => 1, copyright => 'tag',
               galleryimg => 0,
               height => 1, hspace => 1, id => 1, ismap => 1,
               localsrc => 0, name => 2,
               naturalsizeflag => 1, nosave => 0,
               onclick => 1, onmousedown => 1, onmouseout => 1, onmouseover => 1,
               onmouseup => 1,
               src => 1, style => 1, title => 1,
               usemap => qr/^(?!#)/, vspace => 1, width => 1,
              },
       input => {accept => 'tag', 'accept-charset' => 'tag', accesskey => 1,
                 align => 1, alt => 1, autocomplete => 'tag',
                 autosave => 0, border => 1,
                 checked => 1, class => 1, disabled => 1,
                 emptyok => 0, enctype => 0, format => 0,
                 height => 1, hspace => 1,
                 id => 1, inputmode => 0, istyle => 0, localsrc => 0,
                 maxlength => 1, method => 0,
                 mode => 0, name => 2,
                 onblur => 1,
                 onchange => 1, onclick => 1, onfocus => 1, onkeydown => 1,
                 onkeypress => 1, onkeyup => 1,
                 onmouseout => 1, onmouseover => 1,
                 placeholder => 0,
                 readonly => 1, results => 0, size => 1, src => 1, style => 1,
                 tabindex => qr/^(?!1$)/, target => 0,
                 title => 1,
                 type => qr/^(?!text|password|image|submit|button|checkbox|radio|file|hidden|reset)/i,
                 value => 1, vspace => 1, width => 1,
                },
       ins => {datetime => 0},
       label => {
                 class => 1, for => 1, style => 1,
                },
       layer => {name => 2, visibility => 2},
       legend => {},
       li => {class => 1, id => 1, style => 1, type => 1, value => 1},
       link => {charset => 2,
                href => 1, hreflang => 2, media => 2,
                rel => qr/^(?!chapter|contents|stylesheet|alternate|help|prev|next|search|shortcut icon|icon|top|up)/i,
                rev => qr/^(?!made)/i,
                target => 0, title => 1, type => 2,
                xmlns => 1},
       marquee => {align => 2, behavior => 2, direction => 2, loop => 1},
       map => {id => 1, name => 1},
       meta => {
                charset => 0, content => 1,
                'http-equiv' => qr/^(?!content-(?>type|language|style-type|script-type)|cache-control|expires|imagetoolbar|pragma|pics-label|refresh)/i,
                lang => 1,
                name => qr/^(?!author|keywords|robots|description|generator)/i,
                scheme => 0,
                url => 0, 'xml:lang' => 1},
       nobr => {},
       noframes => {},
       noscript => {},
       object => {align => 2, classid => 2,
                  codebase => 1, codetype => 2, copyright => 0,
                  height => 1, id => 1, name => 2, standby => 0, type => 2,
                  width => 1,
                 },
       ol => {class => 1, id => 1, start => 1, style => 1, type => 1},
       option => {label => 1, mode => 0, selected => 1, value => 1},
       optgroup => {'>' => 'tag', label => 1},
       p => {align => 2, class => 1, id => 1, mode => 0, style => 1, wrap => 0},
       param => {name => 2, value => 1, valuetype => 0},
       pre => {class => 1, style => 1, wrap => 0},
       q => {cite => 0},
       rt => {'>' => 'tag', rbspan => 0},
       s => {},
       script => {charset => 2, event => 0, for => 0,
                  language => qr[^(?!javascript$)]i, src => 1,
                  type => qr[^(?!text/javascript)]i},
       select => {class => 1,
                  disabled => 1, id => 1, name => 2, onchange => 1,
                  size => 1, style => 1,
                  title => 1,
                 },
       small => {class => 1, style => 1},
       spacer => {type => 0},
       span => {class => 1, id => 1, lang => 1,
                onclick => 1, style =>1, 'xml:lang' => 1},
       strike => {},
       strong => {class => 1, style => 1},
       style => {media => 2, type => qr[^(?!text/css)]i},
       sub => {},
       sup => {},
       table => {align => 2, background => 1, bgcolor => 1,
                 border => 1, bordercolor => 1,
                 cellpadding => 1,
                 cellspacing => 1, class => 1,
                 frame => 0, height => 1, id => 1, noborder => 0,
                 style => 1, summary => 0, width => 1},
       tbody => {align => 2, class => 1, valign => 2},
       title => {lang => 1, 'xml:lang' => 1},
       td => {abbr => 0, align => 2, axis => 0,
              background => 1, bgcolor => 1,
              class => 1, colspan => 1,
              headers => 0, height => 1, id => 1, nowrap => 1,
              onclick => 1, onmouseout => 1, onmouseover => 1,
              rowspan => 1, scope => 'tag', style => 1,
              valign => 2, width => 1},
       textarea => {
                    accesskey => 1, autocomplete => 'tag',
                    class => 1, cols => 1, id => 1, istyle => 0, mode => 0,
                    name => 2,
                    onchange => 1, onfocus => 1,
                    readonly => 1, rows => 1, style => 1, tabindex => 'tag',
                    title => 1, value => 1, wrap => 'tag',
                   },
       ## NOTE(review): this table used to contain a second, earlier
       ## |tfoot => {'>' => 'tag'}| entry.  A repeated hash key keeps
       ## only its last value, so that entry never took effect and has
       ## been removed.  Add |'>' => 'tag'| here if every |tfoot| is
       ## meant to be logged.
       tfoot => {align => 2, valign => 2},
       th => {abbr => 0, align => 2, axis => 0,
              bgcolor => 1, class => 1, colspan => 1, headers => 0,
              height => 1, id => 1, nowrap => 1,
              rowspan => 1, scope => qr/^(?!row$|col$)/i, style => 1,
              title => 1, valign => 2, width => 1},
       thead => {align => 2, class => 1, valign => 2},
       tr => {align => 1, bgcolor => 1, class => 1, height => 1,
              id => 1, style => 1, valign => 1},
       tt => {class => 1},
       u => {},
       ul => {class => 1, id => 1, style => 1, title => 1, type => 2},
       xml => {'>' => 'tag', charset => 0},
      };

  $dom = 'Message::DOM::DOMImplementation';
}

## tokenize ($text)
##
## Run the Whatpm HTML tokenizer over the decoded document text $text and
## record what it finds in the file-level variables:
##
##   %tag     - "tag_name:...", "attr:...", "value:...", "scheme:..." and
##              "meta_...:..." entries for markup that the $tag_info table
##              does not list as ordinary;
##   $waiting - absolute http/https URIs taken from |href|, |cite|,
##              |longdesc| and |data| attributes and from URI attributes
##              of |frame| and |iframe|, unless the URI is already in
##              $done or $error.
##
## URIs are resolved against the file-level $uri.  No value is returned.
sub tokenize ($) {
  my $s = \($_[0]);
  my $p = Whatpm::HTML->new;
  my $i = 0;

  ## Input callback: hands the tokenizer one code point at a time, with
  ## CR and CRLF turned into LF and U+0000 and out-of-range values turned
  ## into U+FFFD.  -1 signals the end of the input.
  $p->{set_next_char} = sub {
    my $self = shift;
    $self->{next_char} = -1 and return if $i >= length $$s;
    $self->{next_char} = ord substr $$s, $i++, 1;

    if ($self->{next_char} == 0x000D) { # CR
      if ($i >= length $$s) {
        #
      } else {
        my $next_char = ord substr $$s, $i++, 1;
        if ($next_char == 0x000A) { # LF
          #
        } else {
          ## Not part of a CRLF pair: hand the character over via
          ## $self->{char}.  NOTE(review): presumably a pending-character
          ## queue read by Whatpm -- confirm.
          push @{$self->{char}}, $next_char;
        }
      }
      $self->{next_char} = 0x000A; # LF # MUST
    } elsif ($self->{next_char} > 0x10FFFF) {
      $self->{next_char} = 0xFFFD; # REPLACEMENT CHARACTER # MUST
    } elsif ($self->{next_char} == 0x0000) { # NULL
      $self->{next_char} = 0xFFFD; # REPLACEMENT CHARACTER # MUST
    }
  };

  ## Parse errors are merely counted; nothing reads the count.
  my $parse_error_count = 0;
  $p->{parse_error} = sub {
    return ++$parse_error_count;
  };

  $p->_initialize_tokenizer;

  ## Elements whose content is RCDATA / CDATA; there is no tree
  ## constructor here, so the content model is switched by hand.
  my %is_rcdata = (title => 1, textarea => 1);
  my %is_cdata = (style => 1, script => 1, xmp => 1, noframes => 1,
                  noembed => 1, noscript => 1, iframe => 1);
  ## Attributes whose values are URI references.
  my %is_uri_attr = (cite => 1, href => 1, src => 1, code => 1, classid => 1,
                     data => 1, action => 1, longdesc => 1);
  ## URI attributes that are followed regardless of the element, and
  ## elements all of whose URI attributes are followed.
  my %is_link_attr = (href => 1, cite => 1, longdesc => 1, data => 1);
  my %is_frame = (frame => 1, iframe => 1);
  ## Non-HTTP schemes that are too common to be worth recording.
  my %is_boring_scheme = (clsid => 1, ftp => 1, mailto => 1, javascript => 1);

  while (1) {
    my $token = $p->_get_next_token;
    last if $token->{type} == Whatpm::HTML::END_OF_FILE_TOKEN ();

    if ($token->{type} == Whatpm::HTML::START_TAG_TOKEN ()) {
      if ($is_rcdata{$token->{tag_name}}) {
        $p->{content_model} = Whatpm::HTML::RCDATA_CONTENT_MODEL ();
      } elsif ($is_cdata{$token->{tag_name}}) {
        $p->{content_model} = Whatpm::HTML::CDATA_CONTENT_MODEL ();
      } elsif ($token->{tag_name} eq 'plaintext') {
        $p->{content_model} = Whatpm::HTML::PLAINTEXT_CONTENT_MODEL ();
      }
      $p->{last_emitted_start_tag_name} = $token->{tag_name};
    }

    next unless $token->{type} == Whatpm::HTML::START_TAG_TOKEN () or
                $token->{type} == Whatpm::HTML::END_TAG_TOKEN ();

    my $tag_name = $token->{tag_name};
    ## Table entry of the element; undef for an element not in the table.
    my $el_info = $tag_info->{$tag_name};

    ## Log elements that are unknown or explicitly marked with '>'.
    if (not $el_info or ($el_info->{'>'} || '') eq 'tag') {
      $tag{'tag_name:'.$tag_name} = 1;
    }

    ## Look attributes up in a local fallback for unknown elements, so
    ## that the lookup does not create entries in the shared table.
    my $attr_info = $el_info || {};

    for my $attr_name (keys %{$token->{attributes}}) {
      my $attr_v = $attr_info->{$attr_name};
      if ($attr_v) {
        my $attr_value = $token->{attributes}->{$attr_name}->{value};
        if (ref $attr_v) {
          if ($attr_value =~ /$attr_v/) {
            $tag{'value:'.$tag_name.'@'.$attr_name.'@'.lc $attr_value} = 1;
          }
        } elsif ($attr_v eq 'tag') {
          $tag{'value:'.$tag_name.'@'.$attr_name.'@'.lc $attr_value} = 1;
        }
      } else {
        $tag{'attr:'.$tag_name.'@'.$attr_name} = 1;
      }

      if ($is_uri_attr{$attr_name}) {
        my $ref_text = $token->{attributes}->{$attr_name}->{value};
        $ref_text =~ s/\s+/ /g;
        my $abs_uri = $dom->create_uri_reference ($ref_text)
            ->get_absolute_reference ($uri);
        $abs_uri->uri_fragment (undef);
        my $scheme = lc $abs_uri->uri_scheme;
        if ($scheme eq 'http' or $scheme eq 'https') {
          if ($is_link_attr{$attr_name} or $is_frame{$tag_name}) {
            $abs_uri = $abs_uri->uri_reference;
            unless ($done->{$abs_uri} or $error->{$abs_uri}) {
              $waiting->{$abs_uri} = 1;
            }
          }
        } elsif ($is_boring_scheme{$scheme}) {
          #
        } else {
          $tag{'scheme:'.$scheme} = 1;
        }
      }

      ## <meta http-equiv content>: log unusual pragma values.
      if ($tag_name eq 'meta' and
          $attr_name eq 'content') {
        my $av = $token->{attributes}->{content}->{value};
        if ($token->{attributes}->{'http-equiv'}) {
          my $hv = lc $token->{attributes}->{'http-equiv'}->{value};
          if ($hv eq 'content-type') {
            if ($av !~ m[^\s*text/html\s*;\s*charset\s*=\s*[0-9A-Za-z+._-]+\s*$]i) {
              $tag{'meta_content-type:'.$av} = 1;
            }
          } elsif ($hv eq 'content-style-type') {
            if ($av !~ m[^(?>text/css)$]i) {
              $tag{'meta_content-style-type:'.$av} = 1;
            }
          } elsif ($hv eq 'content-script-type') {
            if ($av !~ m[^(?>text/javascript)$]i) {
              $tag{'meta_content-script-type:'.$av} = 1;
            }
          } elsif ($hv eq 'pics-label') {
#              if ($av !~ m[^(?>\(PICS-1.1 "http://www.icra.org/ratingsv02.html" l r \(cz 1 lz 1 nz 1 oz 1 vz 1\) gen true for "[^"]+" r \(cz 1 lz 1 nz 1 oz 1 vz 1\) "http://www.rsac.org/ratingsv01.html" l r \(n 0 s 0 v 0 l 0\) gen true for "[^"]+" r \(n 0 s 0 v 0 l 0\)\))$]) {
#                $tag{'meta_pics-label:'.$av} = 1;
#              }
          }
        }
      }
    }
  }
} # tokenize
