#!/usr/bin/perl
#
# Simple same-site link crawler.
#
# Usage: <script> <base-url>
#
# Starting from the base URL given as the first command-line argument, fetch
# each queued page, extract its links with HTML::LinkExtor and record every
# link that starts with the base URL in the SQLite database "url.db"
# (table "links": id, referer, url).  The "links" table is dropped and
# recreated on every run.  Links found in <a> tags that have not been
# recorded before are queued for crawling in turn.
#
use strict;
use warnings;

require HTML::LinkExtor;
require HTML::Form;
use LWP::UserAgent;
use URI::URL;
use DBD::SQLite;
use DBI;

# Validate the argument before touching the database so a bad invocation
# does not create or wipe url.db.
my $url_base = $ARGV[0];
die "usage: $0 <base-url>\n" unless defined $url_base && length $url_base;

my $dbh = DBI->connect("dbi:SQLite:dbname=url.db", "", "");
if (!$dbh) {
    die "can't connect to database!\n";
}

# URLs still to be fetched; used as a stack (push/pop).
my @queue = ();

# URL of the page currently being fetched; gotlink() stores it as the
# referer of every link it records.
my $referer = undef;

push @queue, $url_base;
warn "dig_url_start:$url_base\n";

my $ua = LWP::UserAgent->new;
$ua->agent("Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 2.0.50727)");

# Passing a base URL makes the extractor hand absolutized links to gotlink().
my $link_parser = HTML::LinkExtor->new(\&gotlink, $url_base);
#my $form_parser = HTML::Form->new();

# Start every run from an empty table.  "if exists" keeps the very first run
# (when the table is not there yet) from reporting an error.
$dbh->do("drop table if exists links");
$dbh->do("create table links(id integer not null primary key autoincrement,referer text, url text)")
    or die "can't create links table: " . $dbh->errstr . "\n";

while (@queue) {
    $referer = pop @queue;

    # Stream the response body into the link parser chunk by chunk.
    my $response = $ua->request(
        HTTP::Request->new(GET => $referer),
        sub { $link_parser->parse($_[0]); },
    );

    # Flush buffered text and reset the parser so that an incomplete tag at
    # the end of this page cannot bleed into the next document.
    $link_parser->eof;

    warn "fetch failed for $referer: " . $response->status_line . "\n"
        unless $response->is_success;
}

$dbh->disconnect;

# gotlink($tag, %attr)
#
# HTML::LinkExtor callback, invoked once per link-carrying element.
#
#   $tag   - element name as reported by the parser
#   %attr  - link attributes of that element (attribute name => URL)
#
# Every attribute value is echoed to STDOUT.  Values that are "javascript:"
# links or do not start with $url_base (case-insensitively) are skipped with
# a warning.  For the rest, the fragment and the base prefix are removed and
# "/" is prepended, so a base ending in "/" yields a single leading slash.
# The result is inserted into "links" together with the current $referer
# unless that url is already stored.  A newly stored link coming from an
# <a> tag also gets its original href pushed onto @queue.
sub gotlink {
    my ($tag, %attr) = @_;
    #$_=($attr{href}||$attr{src}||$attr{link}||$attr{background}||$attr{action});
    print "$tag,";

    foreach my $key (keys %attr) {
        # Stringify: with a base URL the extractor may hand us URI objects.
        my $link = "$attr{$key}";
        print $key . "=$attr{$key}\n";

        # \Q...\E: the base URL is literal text, not a pattern ('.', '?', ...).
        if ($link =~ /^javascript:/i or $link !~ /^\Q$url_base\E/i) {
            warn "skip link $link\n";
            next;
        }

        $link =~ s/\#.*$//;               # drop the fragment
        $link =~ s/^\Q$url_base\E//i;     # same case rule as the test above
        $link = "/" . $link;

        # prepare_cached avoids re-preparing the same SQL for every link.
        my $sth_check = $dbh->prepare_cached("select url from links where url=?");
        $sth_check->execute($link);
        my @rows = $sth_check->fetchrow_array;
        $sth_check->finish;

        # An empty row list means the url has not been recorded yet.
        next if @rows;

        my $sth_insert = $dbh->prepare_cached("insert into links(referer,url) values(?,?)");
        $sth_insert->execute($referer, $link);
        $sth_insert->finish;

        # Only anchors are followed; other link types are recorded only.
        if (lc $tag eq "a" and defined $attr{href}) {
            push @queue, $attr{href};
        }
    }

    return;
}