#!/usr/bin/perl -w
#
### Introduction ###
# sa-stats   : SpamAssassin logfile analyser
# Version    : 1.1.7
# Date       : 30th March, 2006
# Author     : David Ramsden, david@hexstream.co.uk
# Credits    : - Andrew Berry for asking about such a
#	         script and giving me the idea.
#	       - D. Scott Barninger for producing a patch
#	         to fix the strftime() problem in <= 1.1.4
#	       - Mike Jackson for his patch to add SA 3.1.0
#	         compatibility and the limit option.
#	       - Jack Scagnetti for two fixes.
#
# Usage:
#   ./sa-stats.pl -h
#
#   sa-stats.pl reads its input from STDIN:
#	./sa-stats.pl </var/log/mail.info
#	cat /var/log/mail.info | ./sa-stats.pl -t
#
#   Note: Not every Linux/UNIX distribution uses /var/log/mail.info
#         It may use /var/log/maillog
#         You will need to work this out for yourself.
#
### License (distributed under the zlib license) ###
# Copyright (c) 2006 David Ramsden
#
# This software is provided 'as-is', without any express or implied
# warranty. In no event will the authors be held liable for any damages
# arising from the use of this software.
#
# Permission is granted to anyone to use this software for any purpose,
# including commercial applications, and to alter it and redistribute it
# freely, subject to the following restrictions:
#
#	1. The origin of this software must not be misrepresented; you must
#	not claim that you wrote the original software. If you use this
#	software in a product, an acknowledgment in the product documentation
#	would be appreciated but is not required.
#
#	2. Altered source versions must be plainly marked as such, and must
#	not be misrepresented as being the original software.
#
#	3. This notice may not be removed or altered from any source
#	distribution.
#
###
use strict;
use Getopt::Long;
use POSIX qw(strftime);

### Global variable definitions ###
# Running totals keyed by "ham" / "spam"; each entry holds the summed
# 'count', 'score' and 'threshold' of the messages seen so far.
my %global_stats = ();
# The same totals per lower-cased username:
# $user_stats{$username}{'ham'|'spam'}{'count'|'score'|'threshold'}.
my %user_stats   = ();
# Date prefix (strftime "%b %e") a log line must start with, or undef to
# process every line.
my $filter       = undef;
# Number of users listed in the per-user tables when -l is given.
my $userlimit    = 65534;

### Command line arguments ###
my ($opt_help, $opt_today, $opt_yesterday, $opt_limit);
my $options_ok = GetOptions("help|h"      => \$opt_help,
			    "today|t"     => \$opt_today,
			    "yesterday|y" => \$opt_yesterday,
			    "limit|l=i"   => \$opt_limit);

if (!$options_ok)
{
	# GetOptions has already warned about the offending option; stop here
	# instead of silently producing a report the user did not ask for.
	print STDERR "Try '$0 -h' for a list of options.\n";
	exit 1;
}

if ($opt_help)
{
	print "Usage: $0 [options] </var/log/mail.info\n";
	print "Without any options the entire logfile is processed\n";
	print "Options can be any of the following:\n";
	printf("\t%-5s: Stats for todays date only.\n", "t");
	printf("\t%-5s: Stats for yesterdays date only.\n", "y");
	printf("\t%-5s: Stats for top n mail recipients.\n", "l n");
	exit 0;
}

if ($opt_today && $opt_yesterday)
{
	print "You can't use the -t option and -y option together.\n";
	exit 1;
}

if (defined($opt_limit) && $opt_limit < 0)
{
	print "The -l option can't be negative.\n";
	exit 1;
}

if ($opt_today)
{
	$filter = strftime("%b %e", localtime);
}
elsif ($opt_yesterday)
{
	# Step back 24 hours from roughly noon today rather than from the
	# current instant: "now - 24h" can still be today's date during the
	# last hour of a 25-hour (DST change) day.
	my $now = time();
	my ($sec, $min, $hour) = localtime($now);
	my $around_noon = $now - ($hour * 3600 + $min * 60 + $sec) + (12 * 60 * 60);
	$filter = strftime("%b %e", localtime($around_noon - (24 * 60 * 60)));
}

if ($opt_limit) {
	$userlimit = $opt_limit;
}

### Main code ###
# \Q...\E: the date is literal text, not a pattern (a month abbreviation
# containing e.g. "." must not act as a wildcard).
my $filter_re = defined($filter) ? qr/^\Q$filter\E/ : undef;

# Log line formats, tried in this order; the first one that matches wins.
# Every pattern captures (score, threshold, username) in that order.
my @message_patterns = (
	[ "ham",  qr/spamd\[(?:[0-9].*?)\]: clean message \((.*?)\/(.*?)\) for (.*?):/ ],
	[ "ham",  qr/spamd: clean message \((.*?)\/(.*?)\) for (.*?):/ ],
	[ "spam", qr/spamd\[(?:[0-9].*?)\]: identified spam \((.*?)\/(.*?)\) for (.*?):/ ],
	[ "spam", qr/spamd: identified spam \((.*?)\/(.*?)\) for (.*?):/ ],
);

while (defined(my $line = <STDIN>))
{
	next if defined($filter_re) && $line !~ $filter_re;

	foreach my $pattern (@message_patterns)
	{
		my ($stat, $regex) = @{$pattern};
		if (my ($score, $threshold, $username) = $line =~ $regex)
		{
			&update_stats($stat, $username, $score, $threshold);
			last;
		}
	}
}

&show_stats();

exit 0;

### Sub-routines ###
# Print the complete report to STDOUT: a heading, the global totals and
# average scores, then two per-user tables (message counts and average
# scores).
#
# Takes no arguments. Reads the file-level %global_stats and %user_stats
# as well as $opt_today, $opt_yesterday, $opt_limit, $userlimit and $filter.
sub show_stats
{
	if ($opt_today)
	{
		print "SpamAssassin statistics for today ($filter)\n";
	}
	elsif ($opt_yesterday)
	{
		print "SpamAssassin statistics for yesterday ($filter)\n";
	}
	else
	{
		print "SpamAssassin statistics for entire logfile\n";
	}
	draw_line(70);
	print "\n";

	if (exists($global_stats{'spam'}) && exists($global_stats{'ham'}))
	{
		my $ham_count  = $global_stats{'ham'}{'count'};
		my $spam_count = $global_stats{'spam'}{'count'};
		my $total      = $ham_count + $spam_count;

		printf("%-30s %-10s %-10s %-10s\n", "Total messages:", "Ham:", "Spam:", "% Spam:");
		draw_line(70);
		printf("%-30d %-10d %-10d %1.2f%%\n", $total, $ham_count, $spam_count, 100 * ($spam_count / $total));

		print "\n";
		printf("%-30s: %s\n", "Average spam score", _average_score($global_stats{'spam'}));
		printf("%-30s: %s\n", "Average ham score", _average_score($global_stats{'ham'}));
	}
	else
	{
		if (!exists($global_stats{'ham'}))
		{
			print "No ham (clean) messages found in logfile.\n";
		}
		if (!exists($global_stats{'spam'}))
		{
			print "No spam (identified) messages found in logfile.\n";
		}
		print "Due to the above, not enough information is available to calculate\nglobal statistics.\n";
	}
	print "\n";

	# Work out the user list once so that both tables below always show
	# the same users in the same order.
	my @usernames = _report_usernames();

	printf("%-30s %-7s %-7s %-7s %-7s\n", "Username:", "Total:", "Ham:", "Spam:", "% Spam:");
	draw_line(70);
	foreach my $username (@usernames)
	{
		my $ham_count  = $user_stats{$username}{'ham'}{'count'};
		my $spam_count = $user_stats{$username}{'spam'}{'count'};
		my $total      = $ham_count + $spam_count;

		printf("%-30s %-7d %-7d %-7d %1.2f%%\n", $username, $total, $ham_count, $spam_count, 100 * ($spam_count / $total));
	}
	print "\n";

	printf("%-30s %-20s %-20s\n", "Username:", "Avg. ham score:", "Avg. spam score:");
	draw_line(70);
	foreach my $username (@usernames)
	{
		printf("%-30s %-20s %-20s\n", $username,
					      _average_score($user_stats{$username}{'ham'}),
					      _average_score($user_stats{$username}{'spam'}));
	}
}

# Return the usernames to list, in display order: alphabetical when no
# limit was requested, otherwise the $userlimit busiest users ranked by
# total message count (ties broken alphabetically so the order is stable).
sub _report_usernames
{
	my @by_name = sort keys %user_stats;
	return @by_name unless $opt_limit;

	my %total_for;
	foreach my $username (@by_name)
	{
		$total_for{$username} = $user_stats{$username}{'ham'}{'count'} + $user_stats{$username}{'spam'}{'count'};
	}
	my @ranked = sort { $total_for{$b} <=> $total_for{$a} || $a cmp $b } @by_name;

	# A limit below 1 still lists one user, as the report always has.
	my $keep = $userlimit < 1 ? 1 : $userlimit;
	splice(@ranked, $keep) if @ranked > $keep;
	return @ranked;
}

# Format one stats entry (hash ref with 'count', 'score' and 'threshold')
# as "average score/average threshold", or "None" when it holds no
# messages. The test is on the count, not the score sum: scores that add
# up to 0 are still a valid average.
sub _average_score
{
	my ($stats) = @_;

	return "None" unless $stats->{'count'};
	return sprintf("%1.2f/%1.2f", $stats->{'score'} / $stats->{'count'},
				      $stats->{'threshold'} / $stats->{'count'});
}

# Record one classified message in the running totals.
#
# Arguments, in order:
#   $stat      - which bucket to update; callers pass "ham" or "spam"
#   $username  - recipient as found in the log line (lower-cased here, so
#                names differing only in case share one entry)
#   $score     - the message's score
#   $threshold - the threshold the score was compared against
#
# Updates the file-level %global_stats and %user_stats. Returns nothing.
#
# Declared without a prototype: the previous empty prototype "()" claimed
# the sub takes no arguments, which only worked with the &-call form.
sub update_stats
{
	my ($stat, $username, $score, $threshold) = @_;

	$username = lc($username);

	if (!exists($global_stats{$stat}))
	{
		$global_stats{$stat} = { 'count' => 0, 'score' => 0, 'threshold' => 0 };
	}

	my $global = $global_stats{$stat};
	$global->{'count'}++;
	$global->{'score'} += $score;
	$global->{'threshold'} += $threshold;

	if (!exists($user_stats{$username}))
	{
		# Create both buckets up front so the report can read a user's
		# ham and spam figures even if only one kind was ever seen.
		$user_stats{$username} = {
			'ham'  => { 'count' => 0, 'score' => 0, 'threshold' => 0 },
			'spam' => { 'count' => 0, 'score' => 0, 'threshold' => 0 },
		};
	}

	my $per_user = $user_stats{$username}{$stat};
	$per_user->{'count'}++;
	$per_user->{'score'} += $score;
	$per_user->{'threshold'} += $threshold;

	return;
}

# Print a horizontal rule of $length dashes followed by a newline to STDOUT.
#
# Declared without a prototype: the previous empty prototype "()" said the
# sub takes no arguments although every caller passes the length, which
# caused "called too early to check prototype" warnings under -w and would
# be a compile error for any call placed after this definition.
sub draw_line
{
	my ($length) = @_;

	print "-" x $length, "\n";
}
