#!/usr/bin/perl
require HTML::LinkExtor;
require HTML::Form;
use LWP::UserAgent;
use URI::URL;
use DBD::SQLite;
use DBI;
use strict;
use warnings;

# Crawl driver.
#
# Takes the base URL to crawl as the first command-line argument, (re)creates
# the `links` table in the SQLite file url.db, then repeatedly pops a URL off
# the work queue, fetches it and streams the body through HTML::LinkExtor.
# Every link found is reported to gotlink(), which records it and pushes new
# same-site anchors back onto the queue.
#
# State shared with gotlink():
#   $dbh      - open database handle
#   $url_base - URL prefix a link must start with to be followed
#   $referer  - URL of the page currently being fetched
#   @queue    - URLs still waiting to be fetched (used as a stack)

my $url_base = $ARGV[0];
die "usage: $0 <base-url>\n"
    unless defined $url_base and length $url_base;

my $dbh = DBI->connect("dbi:SQLite:dbname=url.db", "", "");
if (!$dbh) {
    die "can't connect to database!\n";
}

my @queue   = ($url_base);
my $referer = undef;
warn "dig_url_start:$url_base\n";

my $ua = LWP::UserAgent->new;
$ua->agent("Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 2.0.50727)");

my $link_parser = HTML::LinkExtor->new(\&gotlink, $url_base);
#my $form_parser = HTML::Form->new();

# Start every run from an empty table. IF EXISTS keeps the very first run
# (when there is no table yet) from failing on the drop.
defined $dbh->do("drop table if exists links")
    or die "can't drop links table: " . $dbh->errstr . "\n";
defined $dbh->do("create table links(id integer not null primary key autoincrement,referer text, url text)")
    or die "can't create links table: " . $dbh->errstr . "\n";

while (@queue) {
    $referer = pop @queue;

    # The content callback feeds each received chunk to the link parser, so
    # gotlink() runs while the page is still downloading.
    my $response = $ua->request(
        HTTP::Request->new(GET => $referer),
        sub { $link_parser->parse($_[0]); }
    );

    # Flush buffered text and reset the parser so a truncated tag at the end
    # of this page cannot merge with the start of the next one.
    $link_parser->eof;

    warn "fetch failed for $referer: " . $response->status_line . "\n"
        unless $response->is_success;
}

$dbh->disconnect;
# gotlink($tag, %attr)
#
# HTML::LinkExtor callback: invoked once per link-bearing element with the
# tag name and its link attributes (attribute name => URL).
#
# For every attribute value:
#   * javascript: links and links that do not start with $url_base
#     (compared case-insensitively) are reported on STDERR and skipped;
#   * otherwise the fragment and the $url_base prefix are removed, "/" is
#     prepended, and the resulting path is looked up in the `links` table;
#   * a path not seen before is inserted together with the current $referer,
#     and, when the tag is <a>, the element's href is pushed onto @queue so
#     the main loop fetches it later.
#
# Reads the file-level $dbh, $url_base and $referer; appends to @queue.
# Prints the tag and each attribute to STDOUT as a progress trace.
sub gotlink
{
    my ($tag, %attr) = @_;
    #$_=($attr{href}||$attr{src}||$attr{link}||$attr{background}||$attr{action});
    print "$tag,";
    my $is_anchor = lc($tag) eq "a";

    foreach my $key (keys %attr) {
        # Work on a private string copy: the value may be a URI object, and
        # assigning to the global $_ would clobber the caller's $_.
        my $link = "$attr{$key}";
        print $key."=$attr{$key}\n";

        # \Q..\E makes the base URL match literally; unquoted, its "." and
        # "?" would act as regex metacharacters.
        if ( $link =~ /^javascript:/i or $link !~ /^\Q$url_base\E/i ) {
            warn "skip link $link\n";
            next;
        }

        # Reduce the absolute URL to a site-relative path. The prefix is
        # stripped with the same case-insensitive match used by the test
        # above, so a link that passed the test always loses its prefix.
        my $path = $link;
        $path =~ s/\#.*$//;
        $path =~ s/^\Q$url_base\E//i;
        $path = "/" . $path;

        # Already recorded? Then there is nothing to store or to crawl.
        my $sth_check = $dbh->prepare("select url from links where url=?");
        $sth_check->execute($path);
        my @rows = $sth_check->fetchrow_array;
        $sth_check->finish;
        next if @rows;

        my $sth_update = $dbh->prepare("insert into links(referer,url) values(?,?)");
        $sth_update->execute($referer, $path);
        $sth_update->finish;

        # Only anchors are followed; other link kinds are just recorded.
        push @queue, $attr{href} if $is_anchor;
    }
}