#!/usr/bin/perl
#
#----------------------------------------------------------------------------#
#   PROGRAM: most-hits.pl
#
#   PURPOSE: This program serves one purpose.  It reads an Apache
#   access_log file in standard ECLF format,
#   and prints out the number of hits that have been
#   recorded for each file and/or directory.
#
#   The user can control the number of files that are
#   displayed in the output by changing NUM_RECS_TO_PRINT.
#
#   This tool lets you analyze your web site so you can
#   understand what content your viewers are interested in.
#
#   USAGE:
#
#   most-hits.pl access_log > results
#   perl most-hits.pl access_log > results
#
#----------------------------------------------------------------------------#

#----------------------------------------------------------------------------#
# COPYRIGHT:                                                                 #
#                                                                            #
# This sample program is provided free of charge under the terms of the      #
# GNU GPL.                                                                   #
#----------------------------------------------------------------------------#

use File::Basename;

#------------------------------------------------------------------------------#
#  Global variables that control the program action and output.                #
#------------------------------------------------------------------------------#

$NUM_RECS_TO_PRINT = 100;   # upper limit on the number of rows in the report

#---------------------------------------------------------------------#
#  Names that your web server serves as a directory's index page.     #
#  Edit this list to match the index filenames used on your system.   #
#---------------------------------------------------------------------#

@indexFilenames = qw(index.htm index.html index.shtml);


#----------------------------------------------------------------------#
# don't change anything below here unless you're comfortable with Perl #
#----------------------------------------------------------------------#

#------------------------------------------------------------------#
#  Print a short usage summary to STDERR.  Takes no arguments.     #
#                                                                  #
#  The program name is taken from $0 (directory part removed) so   #
#  the message always names the script as it was really invoked,   #
#  rather than a hard-coded name that goes stale after a rename.   #
#------------------------------------------------------------------#

sub usage {
   my $programName = basename($0);
   print STDERR "\n\tUsage:  $programName access_log_file > output_file\n";
}


#----------------------------------------------------------#
#  These are two helper routines for the 'sort' function.  #
#----------------------------------------------------------#

# Sort comparator: orders file names by hit count, fewest hits first.
# Reads the counts from the global %numFileRequests and compares the
# sort variables $a and $b.  Names with equal counts are ordered
# alphabetically so the result does not depend on hash key order.
sub fileNumericAscending {
   return $numFileRequests{$a} <=> $numFileRequests{$b}
       || $a cmp $b;
}

# Sort comparator: orders file names by hit count, most hits first.
# Reads the counts from the global %numFileRequests and compares the
# sort variables $a and $b.  Names with equal counts are ordered
# alphabetically so repeated runs over the same log produce the same
# report instead of following the (randomized) hash key order.
sub fileNumericDescending {
   return $numFileRequests{$b} <=> $numFileRequests{$a}
       || $a cmp $b;
}

# Return a copy of the given string with leading and trailing
# whitespace removed.  The caller's variable is left untouched.
sub trim($)
{
    my ($text) = @_;

    # Alias $_ to the private copy so both substitutions edit it in place.
    for ($text) {
        s/^\s+//;
        s/\s+$//;
    }

    return $text;
}


#----------------------------<<   main   >>-----------------------------#

#--------------------------------------------------------------------#
#  Start by making sure the user is invoking this program properly.  #
#--------------------------------------------------------------------#

# Exactly one argument is expected: the path of the access_log to read.
$numArgs = scalar(@ARGV);

if ($numArgs != 1) {
   usage();
   exit 1;
}

$logFile = $ARGV[0];

# Three-argument open with an explicit read mode: the file name is taken
# literally, so a name beginning with '>' or ending with '|' cannot turn
# the open into a write or a command pipe.  $! gives the system's reason
# for a failure (missing file, permissions, ...).
open (LOGFILE, '<', $logFile)
   or die "  Error opening log file $logFile: $!\n";

#------------------------------------------------------------------#
#  Start reading and processing the access_log file in this loop.  #
#------------------------------------------------------------------#

print "<pre>\n";
LINE: while(<LOGFILE>)
{

   chomp;

   #----------------------------------------------#
   #  condense one or more whitespace characters  #
   #  to one single space                         #
   #----------------------------------------------#

   s/\s+/ /g;

   #----------------------------------------------------------#
   #  the next statement breaks each line of the access_log   #
   #  into nine fields.  A line that is not in the expected   #
   #  format yields no fields at all, so it is skipped here   #
   #  instead of being counted as a hit on an empty name.     #
   #----------------------------------------------------------#

   my @fields =
   /^(\S+) (\S+) (\S+) \[(.+)\] \"(.+)\" (\S+) (\S+) \"(.*)\" \"(.*)\"/;
   next LINE unless @fields;

   ($clientAddress,    $rfc1413,      $username,
   $localTime,         $httpRequest,  $statusCode,
   $bytesSentToClient, $referer,      $clientSoftware) = @fields;

   #--------------------------------------------------------------------#
   # take care of problem where the $httpRequest may simply be a hyphen #
   #--------------------------------------------------------------------#

   next LINE if ($httpRequest =~ /^-$/);

   #-----------------------------------------#
   #  Determine the value of $fileRequested  #
   #-----------------------------------------#

   ($getPost, $fileRequested, $junk) = split(' ', $httpRequest, 3);

   #-----------------------------------------------------------------#
   #  A request line made of a single word has no file part, so      #
   #  there is nothing to count.  Note that split(' ') never leaves  #
   #  whitespace inside a field, so no further trimming is needed.   #
   #-----------------------------------------------------------------#

   next LINE unless (defined($fileRequested) && length($fileRequested));

   #--------------------------------------------------------#
   # ignore hits to the following file types.
   # this section of code needs to be fixed so the user can
   # declare extensions to ignore at the top of the program
   #--------------------------------------------------------#

   next LINE if ($fileRequested =~
      /(?:\.(?:gif|jpg|css|png|java)|favicon\.ico|robots\.txt)$/i);

   #-----------------------------------------------------------------#
   #  if the base filename is something like index.htm, index.html,  #
   #  or index.shtml, interpret this to be the same as the path by   #
   #  itself.  This way, '/java/' is the same as '/java/index.html'. #
   #                                                                 #
   #  The index name must match the whole base filename (ignoring    #
   #  case and an optional '?query' suffix), and it is quoted with   #
   #  \Q..\E so its '.' is literal.  Names that merely contain an    #
   #  index name, such as 'myindex.html' or 'index.html.bak', are    #
   #  therefore counted as files of their own.                       #
   #-----------------------------------------------------------------#

   my $baseName = basename($fileRequested);
   foreach my $indexFile (@indexFilenames) {
     if ($baseName =~ /\A\Q$indexFile\E(?:\?.*)?\z/i) {
        $fileRequested = dirname($fileRequested);
        last;
     }
   }

   #----------------------------------------------------------------#
   #  If the last character in $fileRequested is a '/', remove it.  #
   #  This makes /perl/ equal to /perl.  A lone '/' is kept as is.  #
   #----------------------------------------------------------------#

   if (length($fileRequested) > 1)
   {
     $fileRequested =~ s{/\z}{};
   }

   #-----------------------------------------------------#
   #  here's where we count the number of hits per file  #
   #-----------------------------------------------------#

   $numFileRequests{$fileRequested}++;

}

close (LOGFILE);

#--------------------------------------#
#  Output the number of hits per file  #
#--------------------------------------#

# The report is wrapped in <pre> tags, i.e. it is meant to be viewed as
# HTML, and the file names come straight from client requests recorded in
# the log.  Replace the HTML metacharacters with entities so that a crafted
# request cannot inject markup or script into the report.
my %htmlEntityFor = (
   '&' => '&amp;',
   '<' => '&lt;',
   '>' => '&gt;',
   '"' => '&quot;',
);

print "TOP $NUM_RECS_TO_PRINT MOST-REQUESTED FILES:\n";
print "-----------------------------\n\n";
$count=0;
foreach $key (sort fileNumericDescending (keys(%numFileRequests))) {
   # stop once the configured number of rows has been printed
   last if ($count >= $NUM_RECS_TO_PRINT);

   # escape a copy; $key itself is still needed for the hash lookup
   (my $displayName = $key) =~ s/([&<>"])/$htmlEntityFor{$1}/g;

   print "$numFileRequests{$key} \t\t $displayName\n";
   $count++;
}
print "\n\n";

print "</pre>\n";

# the end


