检查网页中断的链接

#!/usr/local/bin/perl

use 5.005;
use strict;
use Getopt::Long;
use LWP::UserAgent;
use HTML::Parser;
use Pod::Usage;
use Term::ReadKey;
use URI;

$main::VERSION = "1.04";


package HTML::Parser::Links;

use base qw(HTML::Parser);

sub new
{
  my($class, $base) = @_;

  my $parser = new HTML::Parser;
  $parser->{base }  = $base;
  $parser->{links}  = [];
  $parser->{fragment} = {};

  bless $parser, $class
}


sub start
{
  my($parser, $tag, $attr, $attrseq, $origtext) = @_;

  $tag eq 'base' and defined $attr->{href} and
$parser->{base} = $attr->{href};

  $tag eq 'a' and $attr->{href} and do
  {
my $base = $parser->{base};
my $href = $attr->{href};
my $uri = new_abs URI $href, $base;
push @{$parser->{links}}, $uri;
  };

  $tag eq 'a' and $attr->{name} and do
  {
my $name = $attr->{name};
$parser->{fragment}{$name} = 1;
  };

  $tag eq 'img' and $attr->{src} and do
  {
my $base = $parser->{base};
my $src = $attr->{src};
my $uri = new_abs URI $src, $base;
push @{$parser->{links}}, $uri;
  };

  $tag eq 'frame' and $attr->{src} and do
  {
my $base = $parser->{base};
my $src = $attr->{src};
my $uri = new_abs URI $src, $base;
push @{$parser->{links}}, $uri;
  };
}


sub links
{
  my $parser = shift;
  $parser->{links}
}


sub check_fragment
{
  my($parser, $fragment) = @_;
  $parser->{fragment}{$fragment}
}


package HTTP::A11N;

# We hoist these into a base class,
# because we need them in both Page and Link

sub get_authorized
{
  my($self, $ua, $request, $response) = @_;

  my $challenge = $response->www_authenticate;
  my($scheme, $realm) = $self->parse_challenge($challenge);
  $scheme eq 'basic' or return $response;

  my $a11n = $self->{a11n};
  my $credentials = $a11n->credentials($request->uri, $realm);
  $credentials or return $response;

  $request->authorization_basic(@$credentials);
  $ua->request($request)
}


sub parse_challenge
{
  my($self, $challenge) = @_;

  my($scheme, $realm) =
$challenge =~ m[    (\w +)  # scheme
\s+
realm="([^"]+)" # realm
            ]ix;

  $scheme = lc $scheme;

  ($scheme, $realm)
}


package Page;

use base qw(HTTP::A11N);

sub new
{
  my($package, $uri, $a11n) = @_;

  $Page::Cache{$uri} and
return $Page::Cache{$uri};

  my $page = { uri => $uri,
     base => $uri,
     a11n => $a11n};

  bless $page, $package;

  $Page::Cache{$uri} = $page;
  $page
}


sub uri { shift->{'uri' } }
sub base { shift->{'base'} }


sub get
{
  my $page = shift;

  defined $page->{content} and
return $page->{content};

  my $uri   = $page->{uri};
  my $ua    = new LWP::UserAgent;
  my $request = new HTTP::Request GET => $uri;
  my $response = $ua->request($request);

  $response->code == 401 and
$response = $page->get_authorized($ua, $request, $response);

  $response->is_success or
return undef;

  $page->{base}     = $response->request->uri;
  $page->{content}   = $response->content;
  $page->{content_type} = $response->content_type;

  $response->content
}


sub parse
{
  my $page = shift;

  $page->{parser} and
return $page->{parser};

  my $content = $page->get;
  defined $content or
return undef;

  my $parser = new HTML::Parser::Links $page->base;
    $parser->parse($content);
    $parser->eof;

  $page->{parser} = $parser;
  $parser
}


sub links
{
  my $page  = shift;
  my $parser = $page->parse;
  defined $parser or
return undef;

  $parser->links
}

sub content_type { shift->{content_type} }


package Link;

use base qw(HTTP::A11N);

sub new
{
  my($package, $uri, $a11n, %options) = @_;
  
  $Link::Cache{$uri} and
return $Link::Cache{$uri};

  my $base   = $uri ->clone;
  my $fragment = $base->fragment(undef);
  
  my $link = { uri   => $uri,
a11n   => $a11n,
options => \%options,
     base   => $base,
     fragment => $fragment };

  bless $link, $package;

  $Link::Cache{$uri} = $link;
  $link
}


sub check
{
  my $link = shift;

  defined $link->{ok} and
return $link->{ok};

  my $fragment = $link->{fragment};
  my $no_nulls = not $link->{options}{'null-frags'};
  my $check  = (length $fragment or
          defined $fragment and $no_nulls) ? 'check_fragment' :
                    'check_base';

  my $ok = $link->$check();
  $link->{ok} = $ok;

  $ok
}


sub check_fragment
{
  my $link   = shift;
  my $base   = $link->{base};
  my $fragment = $link->{fragment};

  my $page   = new Page $base;
  my $parser  = $page->parse;
  defined $parser or return '';

  $link->{content_type} = $page->content_type;

  $parser->check_fragment($fragment)
}


sub check_base
{
  my $link = shift;
  my $base = $link->{base};

  my $ua    = new LWP::UserAgent;
  my $request = new HTTP::Request HEAD => $base;
  my $response = $ua->request($request);

  $response->code == 401 and
$response = $link->get_authorized($ua, $request, $response);

  # Some servers don't like HEAD requests
  $response->is_success or do
  {
$request = new HTTP::Request GET => $base;
$response = $ua->request($request);

$response->code == 401 and
  $response = $link->get_authorized($ua, $request, $response);
  };

  $link->{content_type} = $response->content_type;
  $response->is_success;
}

sub content_type { return shift->{content_type} }


sub below_or_equal
{
  my($link, $page) = @_;
  my $checked = $link->{uri}->path;
  my $orig  = $page->{uri}->path;

  $checked  =~ s|/[^/]*$||;  # remove last component
  $orig    =~ s|/[^/]*$||;

  substr($checked, 0, length $orig) eq $orig
}


package A11N; # A-uthorizatio-N

sub new
{
  my($package, $spaces) = @_;
  my $a11n = bless { }, $package;
  $a11n->spaces($spaces);
  $a11n
}


sub spaces
{
  my($a11n, $spaces) = @_;

  for my $space (@$spaces)
  {
$space eq '-' and $a11n->{deferred} = 1      , next;
$space eq '*' and $a11n->{global } = $a11n->prompt, next;
             $a11n->space($space);
  }
}


sub space
{
  my($a11n, $space) = @_;

  my($scheme, $authority, $realm) =
$space =~ m[^
  (?: (\w +):// )? #scheme
     ([^:]+)    #authority
  (?: :(.  *)  )? #realm
  $
  ]x;

  $authority or return;
  $scheme  or $scheme = 'http';

  $a11n->{credentials}{$scheme}{$authority}{$realm} =
$a11n->prompt($scheme, $authority, $realm);
}


sub credentials
{
  my($a11n, $url, $realm) = @_;

  my($scheme, $authority) =
$url =~ m[^
  (\w +):// #scheme
  ([^/]+)   #authority
  ]x;

  $a11n->{credentials}{$scheme}{$authority}{$realm} ||
  $a11n->{credentials}{$scheme}{$authority}{''  } ||
  $a11n->{global}                  ||
  $a11n->deferred($scheme, $authority, $realm)
}


sub deferred
{
  my($a11n, $scheme, $authority, $realm) = @_;

  $a11n->{deferred} or return undef;

  my $credentials = $a11n->prompt($scheme, $authority, $realm);
  $a11n->{credentials}{$scheme}{$authority}{$realm} = $credentials;

  $credentials
}


sub prompt
{
  my($a11n, $scheme, $authority, $realm) = @_;

  print "Enter credentials ";
  print "for $scheme://$authority:$realm" if $authority;
  print "\n";
  print "user ID: ";
  my $userID = <STDIN>;
  chomp $userID;

  Term::ReadKey::ReadMode('noecho');
  print "password: ";
  my $password = Term::ReadKey::ReadLine(0);
  print "\n";
  Term::ReadKey::ReadMode('normal');

  [ $userID, $password ]
}


package Spinner;

use vars qw($N @Spin);

@Spin = ('|', '/', '-', '\\');

sub Spin
{
  print STDERR $Spin[$N++], "\r";
  $N==4 and $N=0;
}


package main;

my %Checked;
my($Scheme, $Authority, $Path);
my($Pages, $Links, $Broken) = (0, 0, 0);

my %Options = (parent => 1,
    scheme => 1);

my $ok = GetOptions(\%Options, qw(Help
 Man
 authorization=s@
 null-frags
 offsite
                 parent!
 recurse
 scheme!
 twiddle=i
 verbosity=i));
Help($ok);
my $A11N = new A11N $Options{authorization};
CheckPages(@ARGV);
Summary();


sub Help
{
  my $ok = shift;
  $ok      or pod2usage();
  $Options{Help} and pod2usage(VERBOSE=>1);
  $Options{Man} and pod2usage(VERBOSE=>2);
  @ARGV     or pod2usage();
}


sub CheckPages
{
  my @pages = @_;
  my @URIs = map { new URI $_ } @pages;

  for my $uri (@URIs)
  {
$Scheme  = $uri->scheme;
$Authority = $uri->authority;
$Path   = $uri->path;
$Path   =~ s(\w+\.html$)()i;
CheckPage($uri);
  }
}


sub CheckPage
{
  my $uri = shift;

  $Checked{$uri} and return;
  $Checked{$uri} = 1;
  $Pages++;
  Twiddle();
  print "PAGE $uri\n" if $Options{verbosity} > 1;

  my $page = new Page $uri, $A11N;
  my $links = $page->links;
  defined $links or
die "Can't get $uri\n";

  CheckLinks($page, $links);
}


sub CheckLinks
{
  my($page, $uris) = @_;
  my @uris;

  for my $uri (@$uris)
  {
$uri->scheme eq 'http' or next;
my $on_site = $uri->authority eq $Authority;
$on_site or $Options{offsite} or next;

$Links++;
Twiddle();
print "LINK $uri\n" if $Options{verbosity} > 2;

my $link = new Link $uri, $A11N, %Options;
$link->check or do
{
  Report($page, $uri);
  next;
};

$on_site or next;
    $Options{parent} or $link->below_or_equal($page) or next;
    
$link->{content_type} eq 'text/html' or next;
$uri->fragment(undef);
push @uris, $uri;
  }

  $Options{recurse} or return;

  for my $uri (@uris)
  {
CheckPage($uri);
  }
}


sub Report
{
  my($page, $link) = @_;

  my $uri = $page->uri->as_string;
    $link = $link   ->as_string;

  $Options{scheme} or do
  {
$uri =~ s($Scheme://$Authority)();
$link =~ s($Scheme://$Authority)();
  };

  $Broken++;
  print "BROKEN $uri -> $link\n" if $Options{verbosity} > 0;
}


sub Twiddle
{
  $Options{twiddle}==1 and Spinner::Spin();
  $Options{twiddle}==2 and Progress();
}

sub Progress
{
  print STDERR "$Pages pages, $Links links, $Broken broken\r";
}

sub Summary
{
  print STDERR "Checked $Pages pages, $Links links     \n";
  print STDERR "Found $Broken broken links\n";
}

__END__

=head1 NAME

B<linkcheck> - check the links on an HTML page

=head1 SYNOPSIS

B<linkcheck>
[B<--Help>]
[B<--Man>]
[B<--authorization> B<-> | B<*> | [I<scheme>]://I<authority>[:I<realm>] ]...
[B<--null-frags>]
[B<--offsite>]
[B<-->[B<no>]B<parent>]
[B<--recurse>]
[B<-->[B<no>]B<scheme>]
[B<--twiddle> I<level>]
[B<--verbosity> I<level>]
I<URI> ...

=head1 DESCRIPTION

B<linkcheck> reads the web pages at I<URI> ...,
and checks the existence of any links that it finds there.

=head1 OPTIONS

=over 4

=item B<--Help>

Print command line options and exit.

=item B<--Man>

Print man page and exit.

=item B<--authorization> B<-> | B<*> | [I<scheme>://]I<authority>[:I<realm>]

Prompt for user ID and password.

Without B<--authorization>,
links to pages that require authorization are reported as broken.

If B<--authorization -> is specified,
then B<linkcheck> prompts for user ID and password after receiving
a 401 (Unauthorized) response from a web server.

If B<--authorization> [I<scheme>://]I<authority>[:I<realm>] is specified,
then B<linkcheck> prompts immediately for user ID and password.
If the I<scheme> part is omitted, C<http> is assumed.
If the I<realm> part is omitted,
the user ID and password will be used for all realms on that authority.

If B<--authorization *> is specified,
then B<linkcheck> prompts immediately
for a single user ID and password that will be
used for all realms on all authorities.

Multiple B<--authorization> options may be specified;
B<linkcheck> prompts for a separate user ID and password for each.

=item B<--null-frags>

Allow empty fragments in URLs, e.g. C<http://foo.com/bar/baz#>

=item B<--offsite>

Check off-site links.

=item B<-->[B<no>]B<parent>

Follow links upward in the directory tree.
Enabled by default.
Without this option,
recursion is restricted to a directory tree within a web site.

=item B<--recurse>

Recursively check pages that I<URI> links to.
Doesn't recurse to off-site pages.

=item B<-->[B<no>]B<scheme>

Include the scheme://authority part when reporting broken links.
Enabled by default.

=item B<--twiddle> I<level>

Indicate activity with a twiddle

=over 4

=item Z<>0

None (default)

=item Z<>1

Spinner

=item Z<>2

Running count of pages/links checked and broken links found

=back

=item B<--verbosity> I<level>

Verbosity level: 0, 1, 2, 3

=over 4

=item Z<>0

Print final count of pages/links checked and broken links (default)

=item Z<>1

Also list broken links

=item Z<>2

Also list checked pages

=item Z<>3

Also list checked links

=back

=back

=head1 NOTES

=head2 B<--authorization>

Arguments to the B<--authorization> option may need quotes to protect
them from the shell

  --authorization \*
  --authorization 'http://www.mozilla.com:System Administrator'

=head1 CHANGES

=head2 1.04

=over 4

=item *

Added B<--authorization> option

=back

=head2 1.03

=over 4

=item *

Handle BASE elements with no href attribute, e.g.

  <base target="PerlDoc">

=back

=head2 1.02

=over 4

=item *

Added B<-->[B<no>]B<parent> option

=back

=head2 1.01

=over 4

=item *

Fixed the B<--null-frags> option

=back

=head2 1.00

=over 4

=item *

Changed from C<Getopt::Std> to C<Getopt::Long>

=item *

Added B<--null-frags> option

=item *

Checks embedded images

=item *

Checks frames

=back


=head1 SEE ALSO

Checking your links with C<linkcheck> at
http://world.std.com/~swmcd/steven/perl/pm/lc/linkcheck.html

=head1 ACKNOWLEDGMENTS

=over 4

=item *

Vlado Bahyl, <[email protected]>

=item *

Marcus Freeman, <[email protected]>

=item *

Edward J. Huff, <[email protected]>

=item *

Philippe Queinnec, <[email protected]>

=item *

Geoffrey Young, <[email protected]>

=back

=head1 AUTHOR

Steven McDougall, <[email protected]>

=head1 COPYRIGHT

Copyright 2000 by Steven McDougall. This program is free (libre)
software; you can redistribute it and/or modify it under the same
terms as Perl.

=head1 SCRIPT CATEGORIES

Web

=head1 PREREQUISITES

Getopt::Long
LWP::UserAgent
HTML::Parser
Pod::Usage
URI

=head1 README

Find broken links in a web site.