#!/usr/bin/perl
# Written by Ben Carterette and Evangelos Kanoulas
# Last update October 18, 2010

use Getopt::Long;

my $qrels;
my $runs = "";
my $k = 10;
my $br = 2;
my $bq = 4;
my $discstring = "";
my $alltopic = 0;

GetOptions('k=i', \$k,
		   'br=i', \$br,
		   'bq=i', \$bq,
		   'discount=s', \$discstring,
		   'runs=s', \$runprefix,
		   'qrels=s', \$qrels,
		   'q=i', \$alltopic);

if (!$qrels)
{
die("Usage:  sndcg.pl -qrels <qrels>
Options:  -runs <path to runs; default cwd>
          -q <0/1; default 0, use 1 to print info for all topics>
          -k <rank cutoff; default 10>
          -br <base of rank discount log; default 2
          -bq <base of query discount log; default 4>
          -discount <discount function; default 1/((log_bq(reform+bq-1))*((log_br(rank+br-1))
           with rank being between 1 and 2k;see comments in script for more on specifying 
           a discount function\n");
}

# discstring is a perl closure you can specify on the command line
# see examples below

# first call to shift() returns rank (integer between 1 and 2*k)
# second call to shift() returns reformulation number
# br is rank discount base, bq is reform discount base
# on command line, escape $
if ($discstring eq "")
{
	#$discstring = "sub { return 1/(log(shift()+$br-1)/log($br)*log(shift()+$bq-1)/log($bq)); }";

	# 1/log_b(rank+b)
	# $discstring = "sub { return 1/(log(shift()+$br-1)/log($br)); }";

	# 1/(1+log_b(rank))
	# $discstring = "sub { return 1/(1+log(shift())/log($br)); }";

        # 1/((log_q(reform+q-1))*((log_b(rank+b-1)) with rank being between 1 and 2k
	$discstring = "sub { return 1/((log(shift()+$br-1)/log($br))*(log(shift()+$bq-1)/log($bq))); }";

        # 1/((1+log_q(reform))*((1+log_b(rank)) with rank being between 1 and 2k
	# $discstring = "sub { return 1/((1+log(shift())/log($br))*(1+log(shift())/log($bq))); }";

        # 1/((log_q(reform+q-1))*(log_b(rank+b-1))) is tricky b/c rank needs to be between 1 and k, not 1 and 2k
        # $discstring = "sub { my \$rank = shift(); \$rank = \$rank % \$k; \$rank = \$k if (\$rank == 0); return 1/((log(\$rank+$br-1)/log($br))*(log(shift()+$bq-1)/log($bq))); }";

	# 1/((1+log_q(reform))*(1+log_b(rank))) is tricky b/c rank needs to be between 1 and k, not 1 and 2k
	#$discstring = "sub { my \$rank = shift(); \$rank = \$rank % \$k; \$rank = \$k if (\$rank == 0); return 1/((1+log(\$rank)/log($br))*(1+log(shift())/log($bq))); }";
}

print "discount function:  $discstring\n";
my $discount = eval($discstring);

open Q, "qrels";
while (<Q>)
{
	chop;
	my @i = split(/\s+/);
	my ($r1, $r2) = split(/\./, $i[3]);

	$drift{$i[0]} = 0;
	$rel{$i[0]}{$i[2]} = $r1;
	$nhr{$i[0]}++ if ($r1 == 2);
	$nnr{$i[0]}++ if ($r1 == 1);

	if ($r2 != -1)
	{
		$drift{$i[0]} = 1;
		$rel2{$i[0]}{$i[2]} = $r2;
		$nhr2{$i[0]}++ if ($r2 == 2);
		$nnr2{$i[0]}++ if ($r2 == 1);
	}
}
close Q;

$qcount = scalar(keys %rel);

foreach my $t (sort {$a <=> $b} keys %rel)
{
	# ideal dcg
	for ($i=0; $i<$nhr{$t} && $i<$k; $i++)
	{
		$idcg{$t} += (2**2-1)*$discount->($i+1, 1);
	}
	for (; $i<$nhr{$t}+$nnr{$t} && $i<$k; $i++)
	{
		$idcg{$t} += (2**1-1)*$discount->($i+1, 1);
	}

	# if drifting, do a second ideal dcg
	if ($drift{$t})
	{
		for ($i=0; $i<$nhr2{$t} && $i<$k; $i++)
		{
			$idcg2{$t} += (2**2-1)*$discount->($i+1, 1);
		}
		for (; $i<$nhr2{$t}+$nnr2{$t} && $i<$k; $i++)
		{
			$idcg2{$t} += (2**1-1)*$discount->($i+1, 1);
		}
	}
	else
	{
		$idcg2{$t} = $idcg{$t};
	}

	# ideal sdcg, different for drift vs nondrift
	if (!$drift{$t})
	{
		for ($i=0; $i<$nhr{$t} && $i<$k; $i++)
		{
			$isdcg{$t} += (2**2-1)*$discount->($i+1, 1);
		}
		for (; $i<$nhr{$t}+$nnr{$t} && $i<$k; $i++)
		{
			$isdcg{$t} += (2**1-1)*$discount->($i+1, 1);
		}

                for ($i=0; $i<$nhr{$t} && $i<$k; $i++)
		{
			$isdcg{$t} += (2**2-1)*$discount->($i+$k+1, 2);
		}
		for (; $i<$nhr{$t}+$nnr{$t} && $i<$k; $i++)
		{
			$isdcg{$t} += (2**1-1)*$discount->($i+$k+1, 2);
		}
	}
	else
	{
		for ($i=0; $i<$nhr{$t} && $i<$k; $i++)
		{
			$isdcg{$t} += (2**2-1)*$discount->($i+1, 1);
		}
		for (; $i<$nhr{$t}+$nnr{$t} && $i<$k; $i++)
		{
			$isdcg{$t} += (2**1-1)*$discount->($i+1, 1);
		}

		for ($i=0; $i<$nhr2{$t} && $i<$k; $i++)
		{
			$isdcg{$t} += (2**2-1)*$discount->($i+$k+1, 2);
		}
		for (; $i<$nhr2{$t}+$nnr2{$t} && $i<$k; $i++)
		{
			$isdcg{$t} += (2**1-1)*$discount->($i+$k+1, 2);
		}
	}


        # ideal sdcg, duplicates taken into account, different for drift vs nondrift
	if (!$drift{$t})
	{
		for ($i=0; $i<$nhr{$t} && $i<2*$k; $i++)
		{
			$qnum = int(($i)/$k);
			$isdcg_dupes{$t} += (2**2-1)*$discount->($i+1, $qnum+1);
		}
		for (; $i<$nhr{$t}+$nnr{$t} && $i<2*$k; $i++)
		{
			$qnum = int(($i)/$k);
			$isdcg_dupes{$t} += (2**1-1)*$discount->($i+1, $qnum+1);
		}
	}
	else
	{
	    $isdcg_dupes{$t} = $isdcg{$t};
	}
}

my (%sndcg1, %sndcg2);  # sndcg RL1 -> RL2; sndcg RL1 -> RL3
my (%sndcg1_dupes, %sndcg2_dupes);  # sndcg RL1 -> RL2; sndcg RL1 -> RL3 ignoring dupes
my (%ndcg1, %ndcg2, %ndcg3);  # ndcg RL1, RL2, RL3

my @runs = glob("$runprefix*RL1");
foreach my $r (@runs)
{
	(my $base = $r) =~ s/\.RL1//;
	#print STDERR "$base\n";
	my %rank;

	my %dupes=();

	open F, "$base.RL1";
	while (<F>)
	{
		chop;
		my @i = split(/\s+/);
		$rank{$i[0]}++;
		next if ($rank{$i[0]} > $k);

		my $g = 2**$rel{$i[0]}{$i[2]} - 1;
		my $disc = $discount->($rank{$i[0]}, 1);

		$ndcg1{$base}{$i[0]} += $g*$disc;
		$sndcg1{$base}{$i[0]} += $g*$disc;
		$sndcg2{$base}{$i[0]} += $g*$disc;
		$sndcg1_dupes{$base}{$i[0]} += $g*$disc;
		$sndcg2_dupes{$base}{$i[0]} += $g*$disc;
		
		$dupes{$i[0]}{$i[2]}=1 if (!$drift{$i[0]});
	}
	close F;
	
	%rank = ();
	my %ndupes1 = ();

	open F, "$base.RL2";
	while (<F>)
	{
		chop;
		my @i = split(/\s+/);
		$rank{$i[0]}++;
		next if ($rank{$i[0]} > $k);
		
		my $g=0;
		if (!$drift{$i[0]}){$g = 2**$rel{$i[0]}{$i[2]} - 1;}
		else{$g = 2**$rel2{$i[0]}{$i[2]} - 1;}
		my $disc1 = $discount->($rank{$i[0]}, 1);
		my $disc2 = $discount->($k+$rank{$i[0]}, 2);

		$ndcg2{$base}{$i[0]} += $g*$disc1;
		$sndcg1{$base}{$i[0]} += $g*$disc2;
		
		if (!exists $dupes{$i[0]}{$i[2]}){
		    $sndcg1_dupes{$base}{$i[0]} += $g*$disc2;}
		else{
		    $ndupes1{$i[0]}{$rel{$i[0]}{$i[2]}}++;}

	}
	close F;

	%rank = ();
	my %ndupes2 = (); 
	open F, "$base.RL3";
	while (<F>)
	{
		chop;
		my @i = split(/\s+/);
		$rank{$i[0]}++;
		next if ($rank{$i[0]} > $k);

		my $g=0;
		if (!$drift{$i[0]}){$g = 2**$rel{$i[0]}{$i[2]} - 1;}
		else{$g = 2**$rel2{$i[0]}{$i[2]} - 1;}
		my $disc1 = $discount->($rank{$i[0]}, 1);
		my $disc2 = $discount->($k+$rank{$i[0]}, 2);

		$ndcg3{$base}{$i[0]} += $g*$disc1;
		$sndcg2{$base}{$i[0]} += $g*$disc2;

		if (!exists $dupes{$i[0]}{$i[2]}){
		    $sndcg2_dupes{$base}{$i[0]} += $g*$disc2;}
		else{
		    $ndupes2{$i[0]}{$rel{$i[0]}{$i[2]}}++;}
	}
	close F;
}	


printf "run topic nsDCG\@10.RL12 nsDCG\@10.RL13 / nsDCG_dupes\@10.RL12 nsDCG_dupes\@10.RL13 / nDCG\@10.RL1 nDCG\@10.RL2 nDCG\@10.RL3\n";    

foreach my $r (sort keys %ndcg1)
{
    foreach my $t (sort {$a <=> $b} keys %{$ndcg1{$r}})
    {
	next if ($nhr{$t}+$nnr{$t} == 0);  # skip if no rels?
	$sndcg1_dupes{$r}{$t} /= $isdcg_dupes{$t};
	$sndcg2_dupes{$r}{$t} /= $isdcg_dupes{$t};
	$sndcg1{$r}{$t} /= $isdcg{$t};
	$sndcg2{$r}{$t} /= $isdcg{$t};
	$ndcg1{$r}{$t} /= $idcg{$t};
	$ndcg2{$r}{$t} /= $idcg2{$t};
	$ndcg3{$r}{$t} /= $idcg2{$t};
	printf "%12s  %4d  %.4f  %.4f / %.4f  %.4f / %.4f  %.4f  %.4f\n", $r, $t, $sndcg1{$r}{$t}, $sndcg2{$r}{$t}, $sndcg1_dupes{$r}{$t}, $sndcg2_dupes{$r}{$t}, $ndcg1{$r}{$t}, $ndcg2{$r}{$t}, $ndcg3{$r}{$t} if ($alltopic);
	
	$msndcg1_dupes{$r} += $sndcg1_dupes{$r}{$t}/$qcount;
	$msndcg2_dupes{$r} += $sndcg2_dupes{$r}{$t}/$qcount;
	$msndcg1{$r} += $sndcg1{$r}{$t}/$qcount;
	$msndcg2{$r} += $sndcg2{$r}{$t}/$qcount;
	$mndcg1{$r} += $ndcg1{$r}{$t}/$qcount;
	$mndcg2{$r} += $ndcg2{$r}{$t}/$qcount;
	$mndcg3{$r} += $ndcg3{$r}{$t}/$qcount;
    }
    
    printf "%12s   all  %.4f  %.4f / %.4f  %.4f / %.4f  %.4f  %.4f\n", $r, $msndcg1{$r}, $msndcg2{$r}, $msndcg1_dupes{$r}, $msndcg2_dupes{$r}, $mndcg1{$r}, $mndcg2{$r}, $mndcg3{$r};
}
