#! /usr/bin/perl ############################################################################### # # Squid-Prefetch (v0.5) # # Written by Brian White . # This program has been placed in the public domain (the only true "free"). # ############################################################################### use URI; use Net::HTTP; @ConfFiles = qw(./squid-prefetch.conf $ENV{HOME}/.squid-prefetch /etc/squid-prefetch.conf); $FetchPattern = 'http://.*(\.(html?|te?xt)|/[^\.]*)'; $AccessFile = ""; $LastAccess = 0; %Config; %Squid; %DoneTime; %DoneCount; %DoneFetch; %DonePrefetch; @DoneList; @LinkList; ############################################################################### sub uniq { return () unless @_; my($last) = ""; my(@new) = (); foreach (@_) { next unless $_; if ($_ ne $last) { $last = $_; push(@new,$_); } } return @new; } sub RandomizeArray { my($array,$count) = @_; my $i; $count = @$array unless ($count > 0 && $count <= @$array); for ($i=$[; $i < $count; $i++) { my $random = int(rand($count)); my $temp; $temp = $$array[$random]; $$array[$random]= $$array[$i]; $$array[$i] = $temp; } return scalar @$array; } ############################################################################### sub ReadConfig { my($file,$conf) = @_; my($parm,$valu); open(CONF,"<$file") || die "Error: could not read config file '$file' ($!)\n"; while () { chomp; s/\#.*$//; next if m/^\s*$/; if (($parm,$valu) = m/^(\w+)\s+(.*?)\s*$/) { $conf->{$parm} = $valu; } } close(CONF); } sub ConfigValue { my($parm,$default) = @_; return $Config{$parm} if (exists $Config{$parm}); return $Squid{$parm} if (exists $Squid{$parm}); return $default; } ############################################################################### sub ReadAccessLog { my(@pages); if (time() - $LastAccess > 900) { open(ACCESS,"<$AccessFile") || die "Error: could not read access log $AccessFile ($!)\n"; seek(ACCESS,0,2); # go to end of file $LastAccess = time(); } while () { $LastAccess = time(); @_ = split; next unless ($_[3] =~ m!/2! && $_[5] eq "GET" && $_[6] =~ m!http://! && $_[9] =~ m!^text/!); push @pages,$_[6]; } return @pages; } sub FetchUrl { my($url) = @_; my($data,@links); my($host,$path) = ($url =~ m!http://(?:[^/]+@)?([^/]+)(/.*?)(\#.*)?$!); unless ($host && $path) { print STDERR "Warning: could not parse URL $url\n"; return; } # limit how long we will spend doing the fetch local $SIG{ALRM} = sub { die "Error: Timeout fetching $url\n" }; alarm(3); my $http = Net::HTTP->new(PeerHost => $ProxyHost, PeerPort => $ProxyPort, Host => $host, SendTE => 0, KeepAlive => 0); $http->write_request("GET" => "http://$host$path", "Accept" => "text/html", "Cache-Control" => "only-if-cached", "User-Agent" => "Squid-Prefetch"); my ($code,$mesg,%hdrs) = $http->read_response_headers(); print "\nfetch: $url: $code ($mesg)\n"; print " $k -> $v\n" while (($k,$v) = each %hdrs); alarm(0); if ($code != 200) { print STDERR "Warning: fetch returned code $code ($mesg) for $url\n"; return; } if ($hdrs{"Content-Type"} !~ m!^text/html($|;)!) { print STDERR "Warning: fetch returned non-html content-type \"",$hdrs{"Content-Type"},"\" for $url\n"; return; } if ($hdrs{"Cache-Control"} =~ m/\bno-cache\b/) { print STDERR "Warning: no-cache directive for $url\n"; return; } if ($hdrs{"X-Cache"} =~ m/^MISS\b/ || $hdrs{"X-Cache-Lookup"} =~ m/^MISS\b/) { print STDERR "Warning: squid didn't cache $url\n"; return; } alarm(5); while (1) { my $bufr; my $size = $http->read_entity_body($bufr,4096); # print "\n$size : $bufr"; last unless ($size > 0); $data .= $bufr; while ($data =~ m!]+href\s*=\s*(\"|\'|)([^\"\'>\s]+)\1[^>]*>!gis) { my $uri = URI->new($2); my $lnk = $uri->abs($url); my $frg = ($lnk =~ s/(\#.*)$//); next if ($frg && !$FetchFragments); my $opt = ($lnk =~ s/(\?.*)$//); next if ($opt && !$FetchOptions); my($lnkh,$lnkp) = ($lnk =~ m!http://(?:[^/]+@)?([^/]+)(/.*)$!); next if (exists $DoneTime{$lnk}); next if ($lnk !~ m!^$FetchPattern$!oi); next if ($lnkh ne $host && !$FetchCrossSite); print "found $lnk\n"; unshift @links,$lnk; } $data =~ s!^.*($|<)!!; } alarm(0); @links = uniq(sort(@links)); RandomizeArray(\@links); push @LinkList,@links; } sub PrefetchUrl { my($url) = @_; my($total); my($host,$path) = ($url =~ m!http://(?:[^/]+@)?([^/]+)(/.*)$!); unless ($host && $path) { print STDERR "Warning: could not parse URL $url\n"; return; } # limit how long we will spend doing the fetch local $SIG{ALRM} = sub { die "Error: Timeout fetching $url\n" }; alarm(3); my $http = Net::HTTP->new(PeerHost => $ProxyHost, PeerPort => $ProxyPort, Host => $host, SendTE => 1, KeepAlive => 0); $http->write_request("GET" => "http://$host$path", "Accept" => "text/*", "User-Agent" => "Squid-Prefetch"); my ($code,$mesg,%hdrs) = $http->read_response_headers(); print "\nprefetch: $url: $code ($mesg)\n"; # print " $k -> $v\n" while (($k,$v) = each %hdrs); alarm(0); if ($code != 200) { print STDERR "Warning: fetch returned code $code ($mesg) for $url\n"; return; } if ($hdrs{"Content-Type"} !~ m!^text/!) { print STDERR "Warning: fetch returned non-text content-type \"$hdrs{Content-Type}\" for $url\n"; return; } if (exists $hdrs{"Content-Length"} && $hdrs{"Content-Length"} > $FetchMaxSize) { print STDERR "Warning: fetch returned oversize content-length ",$hdrs{"Content-Length"}," for $url\n"; return; } alarm(5); while (1) { my $bufr; my $size = $http->read_entity_body($bufr,4096); last unless ($size > 0); $total += $size; last if ($total > $FetchMaxSize); } alarm(0); } ############################################################################### # read our config file foreach $file (@ConfFiles) { if (-r $file) { ReadConfig($file,\%Config); } } # read Squid config file ReadConfig(ConfigValue("squid_config_file","/etc/squid/squid.conf"),\%Squid); # determine config information $AccessFile = ConfigValue("cache_access_log","/var/log/squid/access.log"); $ProxyHost = ConfigValue("http_proxy","127.0.0.1"); $ProxyPort = ConfigValue("http_port",3128); $HistorySize = ConfigValue("max_history_size",5000); $HistoryAge = ConfigValue("max_history_age",24*60*60); $FetchPattern = ConfigValue("prefetch_regex",$FetchPattern); $FetchOptions = ConfigValue("prefetch_options",0); $FetchFragments = ConfigValue("prefetch_fragments",1); $FetchMaxSize = ConfigValue("prefetch_maxsize",65536); $FetchCrossSite = ConfigValue("prefetch_cross",0); # prefetch pages while (1) { # read access log my @urls = ReadAccessLog(); my $time = time(); my @todo = (); # determine candidate pages that have recently been fetched while (@urls) { my $url = shift @urls; my $frg = ($url =~ s/(\#.*)$//); next if ($frg); my $opt = ($url =~ s/(\?.*)$//); next if ($opt); next if ($url !~ m!^$FetchPattern$!oi); # print STDERR "Note: user fetch of seen page $url\n" if ($DoneTime{$url} && !$DonePrefetch{$url}); # remember this URL $DoneTime{$url} = $time; $DoneCount{$url}++; push @DoneList,$url; # ignore those pages that appear because we prefetched them if (exists $DonePrefetch{$url}) { delete $DonePrefetch{$url}; next; } # determine if it's age makes it a candidate my $age = $DoneFetch{$url}; next if ($time - $age < $HistoryAge); # add it to the todo list delete $DoneFetch{$url}; push @todo,$url.$opt; } # remember any prefetched pages not found in log (because fetch failed) foreach (keys %DonePrefetch) { $DoneTime{$_} = $time; $DoneCount{$_}++; push @DoneList,$_; delete $DonePrefetch{$_}; print STDERR "Warning: no log info for prefetch of $_ (donetime=$DoneTime{$_})\n"; } # keep the todo list down to a reasonable size shift @todo while (scalar @todo > 1000); # fetch and analyze page from todo list while (@todo) { my $url = pop @todo; # ignore those pages we've already done prefetch for next if (exists $DoneFetch{$url}); # fetch one page and analyze for links (saved to @LinkList) $DoneFetch{$url} = $time; eval { FetchUrl($url); }; last; } # Keep list of links to a reasonable size shift @LinkList while (scalar @LinkList > 100); # prefetch one link from list while (@LinkList) { my $url = pop @LinkList; next if (exists $DoneTime{$url}); $DonePrefetch{$url} = $time; eval { PrefetchUrl($url); }; last; } # reduce the history size to be within limits while (scalar @DoneList > $HistorySize) { my $url = shift @DoneList; if (--$DoneCount{$url} <= 0) { print STDERR "Note: removing $url from history...\n"; delete $DoneCount{$url}; delete $DoneTime{$url}; delete $DoneFetch{$url}; delete $DonePrefetch{$url}; } } # wait a moment before starting all over again sleep(1); }