#!/usr/bin/perl
# ---------------------------------------------------------------------------------------------
#
# outclassify.pl --- A mail classification engine for Outclass to speak to POPFile
#
# Copyright (c) 2003 Ashit Gandhi
# Portions Copyright (c) 2001-2003 John Graham-Cumming
#
# ---------------------------------------------------------------------------------------------

use strict;
use Classifier::Bayes;
use Classifier::MailParse;

my %components;
my %words;
my $engine;


# ---------------------------------------------------------------------------------------------
#
# load_word_table
#
# $bucket    The name of the bucket we are loading words for
#
# Fills $words{$bucket} with the word counts read from corpus/$bucket/table, creating the
# corpus and bucket directories first if they do not exist yet.  A bucket that has no table
# file is left without words.  A table whose corpus version is not 1 is reported and the
# load is abandoned at that point.
#
# ---------------------------------------------------------------------------------------------
sub load_word_table
{
    my ($bucket) = @_;

    # Make sure that the bucket mentioned exists; if it doesn't then create an empty
    # directory for it.  The results are deliberately ignored: mkdir failing because
    # the directory is already there is the normal case.

    mkdir("corpus");
    mkdir("corpus/$bucket");

    print "Loading word table for bucket '$bucket'...\n";

    # Three-argument open with a lexical handle: the bucket name can never be read as
    # part of the open mode, and the handle is closed automatically on every return
    # path, including the early return on a version mismatch.

    my $table_path = "corpus/$bucket/table";
    my $table;

    unless (open($table, '<', $table_path)) {
        # A missing table just means an empty bucket; anything else is worth reporting
        print "Cannot read word table '$table_path': $!\n" if -e $table_path;
        return;
    }

    # Each line in the word table is a word and a count.  A lexical line variable is
    # used so that the caller's $_ is left alone.

    while (my $line = <$table>) {
        if ( $line =~ /__CORPUS__ __VERSION__ (\d+)/ ) {
            if ( $1 != 1 ) {
                print "Incompatible corpus version in $bucket\n";
                return;
            }

            next;
        }

        # Greedy match: everything before the last space is the word, the rest is the count
        if ( $line =~ /(.+) (.+)/ ) {
            $words{$bucket}{$1} = $2;
        }
    }

    close($table);
}

# ---------------------------------------------------------------------------------------------
#
# save_word_table
#
# $bucket    The name of the bucket we are saving words for
#
# Writes $words{$bucket} out to corpus/$bucket/table: a corpus version line followed by one
# "word count" line per word.  Returns true on success.  If the table cannot be opened or
# completely written an ERROR line is printed and a false value is returned.
#
# ---------------------------------------------------------------------------------------------

sub save_word_table
{
    my ($bucket) = @_;

    print "Saving word table for bucket '$bucket'...\n";

    # Three-argument open with a lexical handle, and a checked result: an unwritable
    # table must not be reported as saved.

    my $table_path = "corpus/$bucket/table";
    my $table;

    unless (open($table, '>', $table_path)) {
        print "ERROR: Cannot write word table '$table_path': $!\n";
        return;
    }

    print $table "__CORPUS__ __VERSION__ 1\n";

    # Each line in the word table is a word and a count

    foreach my $word (keys %{$words{$bucket}}) {
        print $table "$word $words{$bucket}{$word}\n";
    }

    # Buffered write errors (a full disk, for instance) only surface at close time
    unless (close($table)) {
        print "ERROR: Cannot finish writing word table '$table_path': $!\n";
        return;
    }

    return 1;
}

# ---------------------------------------------------------------------------------------------
#
# split_mail_message
#
# $message    The name of the file containing the mail message
# $bucket     The name of the bucket whose word counts are to be updated
#
# Runs the message through a Classifier::MailParse parser and adds the count of every word
# the parser reports to $words{$bucket}
#
# ---------------------------------------------------------------------------------------------

sub split_mail_message
{
    my ($message, $bucket) = @_;

    my $parser = Classifier::MailParse->new;

    print "Parsing message '$message'...\n";

    $parser->parse_stream($message);

    # Fold the parser's per-word counts into the running totals kept for this bucket
    for my $token (keys %{ $parser->{words} }) {
        $words{$bucket}{$token} += $parser->{words}{$token};
    }
}

# ---------------------------------------------------------------------------------------------
#
# init
#
# Creates the Classifier::Bayes engine used for classification, initializes it, sets the
# debug flags of the engine and of its parser to 0 and loads the engine's word matrix
#
# ---------------------------------------------------------------------------------------------

sub init
{
    $engine = Classifier::Bayes->new;
    $engine->initialize();

    # Debug flags off, both on the engine itself and on the parser it holds
    $engine->{debug}         = 0;
    $engine->{parser}{debug} = 0;

    $engine->load_word_matrix();
}

# ---------------------------------------------------------------------------------------------
#
# classify_file
#
# $filename    The name of the file containing the mail message to classify
#
# Returns whatever the Bayes engine's classify_file method returns for the file
#
# ---------------------------------------------------------------------------------------------

sub classify_file
{
    my ($filename) = @_;

    # Declared once only: the original declared $msgtype twice in this scope, the
    # second declaration masking the first.  The scalar assignment keeps the engine
    # call in scalar context, as before.
    my $msgtype = $engine->classify_file($filename);

    return $msgtype;
}

# main
#
# Initializes the Bayes engine, loads the word table of every bucket directory found under
# corpus/ and then serves commands, one per input line:
#
#   classify=<file>           print the result of classifying <file>
#   insert=<bucket>=<file>    add the words of <file> to <bucket> (in memory only)
#   save=<bucket>             write <bucket>'s word table and reload the word matrix
#   quit                      exit
#
# "\n+OK\n" is printed before each line is read (presumably the prompt the Outclass side
# waits for).  Empty lines are ignored, anything else is answered with an ERROR line.  The
# engine also exits when its input reaches end of file.

my $inp = "";
my $cmd = "";
my $arg1 = "";
my $arg2 = "";

# Autoflush STDOUT so every reply is written out as soon as it is printed
select(STDOUT);
$| = 1;

print "Outclass Engine Starting up...\n";

print "Initializing Bayes filter...\n";
init();

print "Loading buckets...\n";
my @filebuckets = glob "corpus/*";
my @buckets;

foreach my $path (@filebuckets)
{
    # A bucket is a directory holding a table file; stray plain files are not buckets
    next unless -d $path;

    # "corpus/<bucket>" -> "<bucket>"
    my $bucket = (split(/\//, $path))[1];

    load_word_table($bucket);
    push(@buckets, $bucket);
}

print "Outclass Engine Ready.\n";

while(1)
{
    print "\n+OK\n";
    $inp = <>;

    # End of input: there is nobody left to talk to.  Without this check the loop
    # would spin forever printing "+OK".
    exit(0) unless defined $inp;

    # Remove the line terminator only.  chop would also eat the last character of a
    # final line that has no newline; a CR left over from a CRLF ending goes as well.
    $inp =~ s/\r?\n\z//;

    # Split the command off first, with a field limit, so that '=' characters inside
    # a file name are kept instead of truncating the name.
    my $rest;
    ($cmd, $rest) = split(/=/, $inp, 2);
    $cmd  = "" unless defined $cmd;
    $rest = "" unless defined $rest;

    if($cmd eq "classify")
    {
        # Everything after the first '=' is the file name
        ($arg1, $arg2) = ($rest, "");
    }
    else
    {
        # <bucket>=<file>: the bucket stops at the first '=', the file name is the rest
        ($arg1, $arg2) = split(/=/, $rest, 2);
    }
    $arg1 = "" unless defined $arg1;
    $arg2 = "" unless defined $arg2;

    if($cmd eq "classify" && length($arg1) > 0)
    {
        print classify_file($arg1) . "\n";
    }
    elsif($cmd eq "insert" && length($arg1) > 0 && length($arg2) > 0)
    {
        print("Classify $arg2 as $arg1\n");

        # First check if the bucket has been loaded... if not, load it
        unless(grep { $_ eq $arg1 } @buckets)
        {
            load_word_table($arg1);
            push(@buckets, $arg1);
        }

        # insert the message...
        split_mail_message($arg2, $arg1);
    }
    elsif($cmd eq "save" && length($arg1) > 0)
    {
        save_word_table($arg1);
        $engine->load_word_matrix();
    }
    elsif(length($cmd) == 0)
    {
        #do nothing
    }
    elsif($cmd eq "quit")
    {
        exit(0);
    }
    else
    {
        print "ERROR: Unknown Command: '$cmd, $arg1, $arg2'\n";
    }
}
