#!/usr/bin/perl
#
#----------------------------------------------------------------------------#
#  PROGRAM:  most-hits.pl
#
#  PURPOSE:  This program serves one purpose.  It reads an Apache
#            access_log file in standard ECLF format and prints out the
#            number of hits that have been recorded for each file
#            and/or directory.
#
#            The user can control the number of files that are
#            displayed in the output by changing NUM_RECS_TO_PRINT.
#
#            This tool lets you analyze your web site so you can
#            understand what content your viewers are interested in.
#
#  USAGE:
#
#            most-hits.pl access_log > results
#            perl most-hits.pl access_log > results
#
#----------------------------------------------------------------------------#
#----------------------------------------------------------------------------#
#  COPYRIGHT:                                                                #
#                                                                            #
#  This sample program is provided free of charge under the terms of the    #
#  GNU GPL.                                                                  #
#----------------------------------------------------------------------------#

use strict;
use warnings;

use File::Basename;

#------------------------------------------------------------------------------#
#  Global variables that control the program action and output.               #
#------------------------------------------------------------------------------#
our $NUM_RECS_TO_PRINT = 100;      # num of output recs to print per section

#---------------------------------------------------------------------#
#  Change this array to include index filenames used on your system.  #
#---------------------------------------------------------------------#
our @indexFilenames = ('index.htm', 'index.html', 'index.shtml');

#---------------------------------------------------------------------#
#  Requests whose path ends with one of these strings are not         #
#  counted (matched case-insensitively, as literal text).             #
#---------------------------------------------------------------------#
our @ignoredSuffixes = (
    '.gif', '.jpg', '.css', '.png', '.java',
    'favicon.ico', 'robots.txt',
);

#----------------------------------------------------------------------#
# don't change anything below here unless you're comfortable with Perl #
#----------------------------------------------------------------------#

# Hit counter: normalized request path => number of requests seen.
# Package-scoped because the sort helpers below read it via $a / $b.
our %numFileRequests;

# Print a one-line usage reminder on STDERR.
sub usage {
    print STDERR "\n\tUsage: most-hits.pl access_log_file > output_file\n";
    return;
}

#----------------------------------------------------------#
#  These are two helper routines for the 'sort' function.  #
#  Both order paths by hit count; equal counts fall back   #
#  to the path name so the report order is repeatable.     #
#----------------------------------------------------------#
sub fileNumericAscending {
    return ($numFileRequests{$a} <=> $numFileRequests{$b}) || ($a cmp $b);
}

sub fileNumericDescending {
    return ($numFileRequests{$b} <=> $numFileRequests{$a}) || ($a cmp $b);
}

# Return a copy of the argument with leading and trailing whitespace
# removed.  An undefined argument yields the empty string.
sub trim {
    my ($string) = @_;
    $string = '' unless defined $string;
    $string =~ s/^\s+//;
    $string =~ s/\s+$//;
    return $string;
}

# Build one case-insensitive regex that matches any path ending with an
# entry of @ignoredSuffixes.  With an empty list the returned pattern
# never matches, so nothing is ignored.
sub buildIgnorePattern {
    return qr/(?!)/ unless @ignoredSuffixes;
    my $alternation = join '|', map { quotemeta } @ignoredSuffixes;
    return qr/(?:$alternation)\z/i;
}

# Pull the requested path out of one access_log line.
#   $line - one log line, already chomped
# Returns the second token of the request field, or undef when the line
# is not in ECLF format, the request is a lone hyphen, or the request
# carries no path token.
sub extractRequestedFile {
    my ($line) = @_;

    # condense one or more whitespace characters to one single space
    $line =~ s/\s+/ /g;

    # The nine ECLF fields, in order: client address, rfc1413 identity,
    # username, local time, http request, status code, bytes sent,
    # referer, client software.
    my @fields = $line =~
        /^(\S+) (\S+) (\S+) \[(.+)\] \"(.+)\" (\S+) (\S+) \"(.*)\" \"(.*)\"/;
    return unless @fields;

    my $httpRequest = $fields[4];

    # take care of problem where the request may simply be a hyphen
    return if $httpRequest eq '-';

    my (undef, $fileRequested) = split ' ', $httpRequest, 3;
    return $fileRequested;
}

# Map a requested path to the key it is counted under.
#   - a path whose base name is one of @indexFilenames (optionally
#     followed by a query string) becomes its directory, so that
#     '/java/index.html' is the same as '/java/'
#   - one trailing '/' is removed (except from '/' itself), so that
#     '/perl/' is the same as '/perl'
# Returns the normalized path; the empty string means "nothing to count".
sub normalizeRequestedFile {
    my ($fileRequested) = @_;

    $fileRequested = trim($fileRequested);
    return '' if $fileRequested eq '';

    my $baseName = basename($fileRequested);
    for my $indexFile (@indexFilenames) {
        # \Q..\E keeps the '.' literal; the anchors stop 'myindex.html'
        # or 'index.htmlx' from being mistaken for an index file.
        if ($baseName =~ /\A\Q$indexFile\E(?:\?|\z)/i) {
            $fileRequested = dirname($fileRequested);
            last;
        }
    }

    $fileRequested =~ s{/\z}{} if length($fileRequested) > 1;
    return $fileRequested;
}

#----------------------------<< main >>-----------------------------#

# Read the access_log named by the single argument, count hits per
# normalized path and print the top $NUM_RECS_TO_PRINT to STDOUT.
# Returns the process exit status (0 on success, 1 on a usage error);
# dies if the log file cannot be opened.
sub main {
    my @args = @_;

    # Start by making sure the user is invoking this program properly.
    if (@args != 1) {
        usage();
        return 1;
    }
    my ($logFile) = @args;

    open(my $logHandle, '<', $logFile)
        or die " Error opening log file $logFile: $!\n";

    %numFileRequests = ();
    my $ignorePattern = buildIgnorePattern();

    # NOTE(review): the markup inside this string and the matching one at
    # the end of the report was stripped from the copy under review;
    # <pre> ... </pre> is a reconstruction -- confirm against the
    # original script.
    print "<pre>\n";

    # Start reading and processing the access_log file in this loop.
    while (my $line = <$logHandle>) {
        chomp $line;

        my $fileRequested = extractRequestedFile($line);
        next unless defined $fileRequested;      # unparseable line
        next if $fileRequested =~ $ignorePattern;

        $fileRequested = normalizeRequestedFile($fileRequested);
        next if $fileRequested eq '';

        # here's where we count the number of hits per file
        $numFileRequests{$fileRequested}++;
    }
    close($logHandle);

    # Output the number of hits per file
    print "TOP $NUM_RECS_TO_PRINT MOST-REQUESTED FILES:\n";
    print "-----------------------------\n\n";

    my $count = 0;
    for my $key (sort fileNumericDescending keys %numFileRequests) {
        last if $count >= $NUM_RECS_TO_PRINT;
        print "$numFileRequests{$key} \t\t $key\n";
        $count++;
    }
    print "\n\n";
    print "</pre>\n";    # the end

    return 0;
}

# Run only when executed as a script, not when loaded by other code.
exit main(@ARGV) unless caller;

1;