#!/usr/bin/perl

# classify.pl - list most significant words in a text
#               based on http://en.wikipedia.org/wiki/Tfidf

# Eric Lease Morgan <eric_morgan@infomotions.com>
# April 10, 2009 - first investigations; based on search.pl
# April 12, 2009 - added dynamic corpus


# pragmas come first so that everything below, including the constant
# declarations, is compiled under them
use strict;
use warnings;

# define
use constant STOPWORDS    => 'stopwords.inc';  # file name handed to slurp_words() when indexing
use constant LOWERBOUNDS  => 0.02;             # a tag is listed only when its score exceeds this
use constant NUMBEROFTAGS => 5;                # how many top tags to list when none exceed LOWERBOUNDS

# use/require
# corpus(), index(), slurp_words() and classify() are not defined in this
# file; presumably they come from subroutines.pl -- confirm there.
# NOTE(review): Perl 5.26+ no longer searches the current directory for
# require; if subroutines.pl sits beside this script, run with "perl -I."
# or change the path to './subroutines.pl'.
require 'subroutines.pl';

# initialize; whatever corpus() returns is treated as the list of documents
# (they key %index and are echoed to the output below)
my @corpus = &corpus();

# read the stopwords once instead of once per document; the call stays in
# list context so index() receives exactly the arguments it did before, and
# it is skipped entirely when there is nothing to index
# NOTE(review): assumes index() does not modify the stopword data it is
# given -- confirm in subroutines.pl
my @stopwords = @corpus ? &slurp_words( STOPWORDS ) : ();

# index, sans stopwords; the & sigil is required here, otherwise Perl's
# builtin index() would be called instead of the user-defined one
my %index = ();
foreach my $file ( @corpus ) { $index{ $file } = &index( $file, @stopwords ) }

# classify (tag) each document; for every document print its name, then one
# "<tab>score<tab>tag" line per selected tag, then a blank line
foreach my $file ( @corpus ) {

	print $file, "\n";

	# classify() gets a fresh copy of the corpus list on every call, as before
	my $tags = &classify( \%index, $file, [ @corpus ] );

	# rank the tags once, highest score first
	my @ranked = sort { $$tags{ $b } <=> $$tags{ $a } } keys %$tags;

	# keep the tags scoring greater than the given lower bound; because the
	# list is ranked, these are always the leading entries
	my @selected = grep { $$tags{ $_ } > LOWERBOUNDS } @ranked;

	# accommodate documents whose tags all have low scores: fall back to
	# the top NUMBEROFTAGS tags (or all of them, if there are fewer)
	if ( ! @selected ) {

		my $limit = NUMBEROFTAGS;
		$limit = scalar @ranked if ( $limit > @ranked );
		@selected = @ranked[ 0 .. $limit - 1 ];

	}

	# list the selected tags
	foreach my $tag ( @selected ) { print "\t", $$tags{ $tag }, "\t$tag\n" }

	print "\n";

}

# done; more fun!
exit;


