#!/usr/bin/perl
#
#----------------------------------------------------------------------------#
#  PROGRAM:  most-hits.pl
#
#  PURPOSE:  This program serves one purpose.  It reads an Apache
#            access_log file in standard ECLF format and prints out the
#            number of hits that have been recorded for each file and/or
#            directory.
#
#            The user can control the number of files that are displayed
#            in the output by changing NUM_RECS_TO_PRINT.
#
#            This tool lets you analyze your web site so you can
#            understand what content your viewers are interested in.
#
#  USAGE:
#
#            most-hits.pl access_log > results
#            perl most-hits.pl access_log > results
#
#----------------------------------------------------------------------------#

#----------------------------------------------------------------------------#
#  COPYRIGHT:                                                                #
#                                                                            #
#  This sample program is provided free of charge under the terms of the    #
#  GNU GPL.                                                                  #
#----------------------------------------------------------------------------#

use File::Basename;

#------------------------------------------------------------------------------#
#  Global variables that control the program action and output.                #
#  They are package globals because the rest of the program reads them by     #
#  name.                                                                       #
#------------------------------------------------------------------------------#

our $NUM_RECS_TO_PRINT = 100;     # num of output recs to print per section

#---------------------------------------------------------------------#
#  Change this array to include index filenames used on your system.  #
#---------------------------------------------------------------------#

our @indexFilenames = ('index.htm', 'index.html', 'index.shtml');

#----------------------------------------------------------------------#
# don't change anything below here unless you're comfortable with Perl #
#----------------------------------------------------------------------#

#----------------------------------------------------------------------#
#  usage: print a one-line usage summary on STDERR.                     #
#  Takes no arguments.  It does not exit; the caller decides that.      #
#----------------------------------------------------------------------#

sub usage {
   print STDERR "\n\tUsage: most-hits.pl access_log_file > output_file\n";
}

#----------------------------------------------------------#
#  These are two helper routines for the 'sort' function.  #
#  Both compare keys by the hit count stored for them in   #
#  %numFileRequests.  Keys with equal counts are ordered   #
#  by name so the output is the same from run to run.      #
#----------------------------------------------------------#

sub fileNumericAscending {
   $numFileRequests{$a} <=> $numFileRequests{$b}
      or $a cmp $b;
}

sub fileNumericDescending {
   $numFileRequests{$b} <=> $numFileRequests{$a}
      or $a cmp $b;
}

#----------------------------------------------------------#
#  trim: return a copy of the argument with leading and    #
#  trailing whitespace removed.  The caller's variable is  #
#  not modified.                                           #
#----------------------------------------------------------#

sub trim {
   my ($string) = @_;
   $string =~ s/^\s+//;
   $string =~ s/\s+$//;
   return $string;
}

#----------------------------<< main >>-----------------------------#

#--------------------------------------------------------------------#
#  Start by making sure the user is invoking this program properly.  #
#  Exactly one argument, the log file name, is required.             #
#--------------------------------------------------------------------#

$numArgs = scalar(@ARGV);

if ($numArgs != 1) {
   usage();
   exit 1;
}

$logFile = $ARGV[0];

# Three-argument open: the name is always treated as a plain file to read,
# so characters such as '>' or '|' in the argument cannot change the mode.
# One consequence is that '-' is no longer shorthand for standard input.
open(LOGFILE, '<', $logFile)
   or die "Error opening log file $logFile: $!\n";

#------------------------------------------------------------------#
#  Start reading and processing the access_log file in this loop.  #
#------------------------------------------------------------------#

print "\n\n";
#------------------------------------------------------------------#
#  Hits to files whose names end in one of these patterns are not  #
#  counted.  Each entry is a regular expression fragment that is   #
#  matched, case-insensitively, against the end of the requested   #
#  file name.  To ignore a different set, assign the array         #
#  @ignoredFilePatterns before this point (for example next to     #
#  the other settings at the top of the program).                  #
#------------------------------------------------------------------#

@ignoredFilePatterns = ('\.gif', '\.jpg', '\.css', '\.png', '\.java',
                        'favicon\.ico', 'robots\.txt')
   unless @ignoredFilePatterns;

$ignoredFilesRegex = join('|', @ignoredFilePatterns);
$ignoredFilesRegex = qr/(?:$ignoredFilesRegex)$/i;

while (<LOGFILE>)
{

   chomp;

   #----------------------------------------------#
   #  condense one or more whitespace character   #
   #  to one single space                         #
   #----------------------------------------------#

   s/\s+/ /g;

   #----------------------------------------------------------#
   #  the next statement breaks each line of the access_log   #
   #  into nine variables.  A line that does not have that    #
   #  shape is skipped instead of being counted as a hit on   #
   #  an empty file name.                                     #
   #----------------------------------------------------------#

   my @fields =
      /^(\S+) (\S+) (\S+) \[(.+)\] \"(.+)\" (\S+) (\S+) \"(.*)\" \"(.*)\"/;
   next unless @fields;

   ($clientAddress,     $rfc1413,      $username,
    $localTime,         $httpRequest,  $statusCode,
    $bytesSentToClient, $referer,      $clientSoftware) = @fields;

   #--------------------------------------------------------------------#
   # take care of problem where the $httpRequest may simply be a hyphen #
   #--------------------------------------------------------------------#

   next if ($httpRequest eq '-');

   #-----------------------------------------------------------------#
   #  Determine the value of $fileRequested: the second word of the  #
   #  request line ("METHOD file PROTOCOL").  Splitting on ' ' also  #
   #  discards surrounding whitespace, so no further trimming is     #
   #  needed.  A request with no second word is skipped.             #
   #-----------------------------------------------------------------#

   ($getPost, $fileRequested, $junk) = split(' ', $httpRequest, 3);

   next unless (defined($fileRequested) && length($fileRequested));

   #--------------------------------------------------------#
   # ignore hits to the file types listed above the loop    #
   #--------------------------------------------------------#

   next if ($fileRequested =~ $ignoredFilesRegex);

   #-----------------------------------------------------------------#
   #  if the base filename is something like index.htm, index.html,  #
   #  or index.shtml, interpret this to be the same as the path by   #
   #  itself.  This way, '/java/' is the same as '/java/index.html'. #
   #                                                                 #
   #  The whole base name must be the index name (optionally with a  #
   #  query string after it), so 'myindex.html' or 'index.html.bak'  #
   #  are still counted as files of their own.                       #
   #-----------------------------------------------------------------#

   my $baseName = basename($fileRequested);

   foreach $indexFile (@indexFilenames) {
      if ($baseName =~ /\A\Q$indexFile\E(?:\?.*)?\z/i) {
         $fileRequested = dirname($fileRequested);
         last;
      }
   }

   #----------------------------------------------------------------#
   #  If the last character in $fileRequested is a '/', remove it.  #
   #  This makes /perl/ equal to /perl.  A lone '/' is left alone.  #
   #----------------------------------------------------------------#

   if (length($fileRequested) > 1)
   {
      $fileRequested =~ s{/\z}{};
   }

   #-----------------------------------------------------#
   #  here's where we count the number of hits per file  #
   #-----------------------------------------------------#

   $numFileRequests{$fileRequested}++;

}

close(LOGFILE);

#------------------------------------------------------------#
#  Report: one line per file, highest hit count first, and   #
#  never more than $NUM_RECS_TO_PRINT lines.                 #
#------------------------------------------------------------#

print "TOP $NUM_RECS_TO_PRINT MOST-REQUESTED FILES:\n";
print "-----------------------------\n\n";

# Order every counted file once, then walk the front of that list.
my @rankedFiles = sort fileNumericDescending keys %numFileRequests;

$count = 0;
while ($count < $NUM_RECS_TO_PRINT && $count < scalar(@rankedFiles)) {
   my $file = $rankedFiles[$count];
   print "$numFileRequests{$file} \t\t $file\n";
   $count++;
}

print "\n\n";

print "\n\n";   # the end