#!/usr/bin/perl

if (@ARGV < 2) {
  die "Usage:  sampled_evals.pl [-q] <qrel_file> <trec_file>\n\n";
}

#print "\n\n\nARGV gelio : @ARGV";

# Get names of qrel and trec files; check for -q option.

if (@ARGV == 3) {
  shift;                                # Remove -q.
  $print_all_queries = 1;
  }

$qrel_file = shift;                     # Shift implicitly acts on @ARGV.
$trec_file = shift;

# look for -G option  with <lvl> = <num> where level is relevance level
# and num is the gain value
#
if ($#ARGV > 0)  {  # there is the -G option
  shift;  # -G option removed
  for($i=0;$i<=$#ARGV;$i++) {
     $rel_map = $ARGV[$i];
     # extract the relevance
     $original_rel = substr($rel_map, 0,1);
     $updated_rel = substr($rel_map, 2,3);
     $rel_mappings{$original_rel} = $updated_rel;
  }
}

# Process qrel file first.

open(QREL, $qrel_file) or
  die "Failed to open $qrel_file: $!\n\n";

{
local $/ = undef;                       # Reads grab the whole file.
@data = split(/\s+/, <QREL>);           # Data array has all values from the
}                                       # file consecutively.

close(QREL) or
  die "Couldn't close $qrel_file: $!\n\n";


# Now take the values from the data array (four at a time) and
# put them in a data structure.  Here's how it will work.
#
# %qrel is a hash whose keys are topic IDs and whose values are
# references to hashes.  Each referenced hash has keys which are
# doc IDs and values which are relevance values.  In other words...
#
# %qrel                         The qrel hash.
# $qrel{$topic}                 Reference to a hash for $topic.
# $qrel{$topic}->{$doc_id}      The relevance of $doc_id in $topic.
# $category{$topic}->{$doc_id}      The category (which subpool) of $doc_id in $topic.

# Now the sampled values for each category
# $sampled_rel{$topic}->{$category}	number of sampled relevant documents within each $category
# $sampled_rels_per_grade{$topic}->{$category}->{$rel}	 number of sampled documents that have relevance grade $rel
# $sampled_docs{$topic}->{$category}	number of sampled documents within each category
# $docs_per_category{$topic}->{$category}	 	number of documents within each category		
# $num_rel{$topic}               Hash whose values are (estimated) number
#                               of docs relevant for each topic.
# $num_rels_per_grade{$topic}->{$rel}		estimated number of documents with relevance grade $rel

 while (($topic, $dummy, $doc_id, $doc_category, $rel) = splice(@data,0,5)) {
  if(exists($rel_mappings{$rel})) {  # if a relevance mapping is provided, map it
     $mapped_rel = $rel_mappings{$rel};
     $rel = $mapped_rel;
  }  

  $qrel{$topic}->{$doc_id} = $rel;
  $category{$topic}->{$doc_id} = $doc_category; 
  $num_rel{$topic} += $rel;
  $docs_per_category{$topic}->{$doc_category} += 1;
  if($rel >= 0)
  {
    $sampled_docs{$topic}->{$doc_category} += 1;
  }    
  if($rel > 0)
  {
    $sampled_rels_per_grade{$topic}->{$doc_category}->{$rel} += 1;
    $sampled_rel{$topic}->{$doc_category} += 1;
  } 
} #end while

# Estimate the total number of relevant documents for each topic (needed by AP)
foreach $topic (sort keys %qrel) {
  foreach $doc_category (sort keys %{$docs_per_category{$topic}}) 
  {
    if($sampled_docs{$topic}->{$doc_category} !=0) {
     $rel_estimates_category = $sampled_rel{$topic}->{$doc_category}*$docs_per_category{$topic}->{$doc_category}/$sampled_docs{$topic}->{$doc_category};
     $num_rels{$topic} += $rel_estimates_category;
   } #end if
 } #end foreach
} #end foreach

# Estimate the optimal DCG value (discount function 1/log(r+1))
# To compute, first estimate the estimated number of relevant documents within each grade
foreach $topic (sort keys %qrel) {
  foreach $doc_category (sort keys %{$docs_per_category{$topic}}) {
    foreach $rel_grade (sort keys %{$sampled_rels_per_grade{$topic}->{$doc_category}}) { 
      $num_rels_per_grade{$topic}->{$rel_grade} += ($sampled_rels_per_grade{$topic}->{$doc_category}->{$rel_grade})*$docs_per_category{$topic}->{$doc_category}/$sampled_docs{$topic}->{$doc_category};
    }
  }
}

# Now you can compute the optimal dcg value
foreach $topic (sort keys %qrel) {
  $start_rank = 0;
  foreach $rel_grade (reverse sort keys %{$num_rels_per_grade{$topic}})  {
    for ($r=($start_rank+1);$r<=($start_rank+$num_rels_per_grade{$topic}->{$rel_grade});$r++)  {
       $optimal_dcg{$topic} += $rel_grade/(log($r+1)/log(2));
       if($r>=1000) {  # systems are not allowed to retrieve more than 1000docs
          last; }
      }
    $start_rank += $num_rels_per_grade{$topic}->{$rel_grade}; 
  } 
 }#

# prints estimated number of relevants
#foreach $topic (sort keys %qrel) {
# $num_rel_docs = $num_rels{$topic};
# print "$topic $num_rel_docs\n";
#}

# Now process the trec file.

# Extra machinations to find runs in our standard places
if (! -e $trec_file) {
    $trec_file = "/trec/trec17/enterprise/results/$trec_file";
}
if (-d $trec_file) {
    $trec_file = "gzip -dc $trec_file/input.gz |";
}

open(TREC, $trec_file) or
  die "Failed to open $trec_file: $!\n\n";

{
local $/ = undef;                       # Reads grab the whole file.
@data = split(/\s+/, <TREC>);           # Data array has all values from the
}                                       # file consecutively.

close(TREC) or
  die "Couldn't close $qrel_file: $!\n\n";

# Process the trec_file data in much the same manner as above.

while (($topic, $dummy, $doc_id, $dummy, $score, $dummy) = splice(@data,0,6)) {
  $topic =~ s/^0*//;
  $trec{$topic}->{$doc_id} = $score;
 }


foreach $topic (sort {$a <=> $b} keys %trec) {  # Process topics in order.
  next unless exists $qrel{$topic};
  $num_topics++;                        # Processing another topic...
  $href = $trec{$topic};                # Get hash pointer.

  # Now sort doc IDs based on scores and calculate stats.
  # Note:  Break score ties lexicographically based on doc IDs.
  # Note2: Explicitly quit after 1000 docs to conform to TREC while still
  #        handling trec_files with possibly more docs.

  # SAP_category{$category}	# holds the sum of the precisions at relevant document wihtin each category
  # gain_category{$category}    # discounted gain values within each category
  # $num_sampled{$category}	# number of sampled documents within $category upto current rank
  # $num_relevant{$category}	# number of sampled relevant documents within $category upto current rank
  # $num_docs{#category}	# number of documents that fall in to $category upto current rank
  # $num_depth100		# number of depth100 documents upto current rank

 $num_depth100 = 0;
 $rank = 0;

 $num_ret = 0;                         # Initialize number retrieved.
 $num_rel_ret = 0;                     # Initialize number relevant retrieved.
 $sum_prec = 0;                        # Initialize sum precision.

 # Initialize the hashes 
 %SAP_category = ();
 %gain_category = ();
 %num_sampled = ();
 %num_relevant = ();
 %num_docs = ();

  foreach $doc_id (sort
    { ($href->{$b} <=> $href->{$a}) || ($b cmp $a) } keys %$href) {
    $rank = $rank +1; 

    $rel = $qrel{$topic}->{$doc_id};    # Doc's relevance.
    $doc_category = $category{$topic}->{$doc_id}; # The category of this document
        
 
      if ($rel > 0) { # this document is relevant
        # estimate the precision above this relevant document
        $prec_above = 0;
        foreach $category_val (sort keys %{$docs_per_category{$topic}}){
           # compute precisions for all categories		           
           if($num_depth100!= 0) {
             # probability of picking a document from this category
             $prob_category = $num_docs{$category_val}/$num_depth100;
             if($prob_category !=0) {
                $prec_above += $prob_category*($num_relevant{$category_val} + 0.00001)/($num_sampled{$category_val} + 0.00003);
              }
            }
	  }
        # estimated precision at relevant document
        $prec = 1/$rank + ($num_depth100/$rank)*$prec_above; 
        $SAP_category{$doc_category} += $prec;     
        $num_relevant{$doc_category} += 1;

        # compute the discounted cumulative gain within this category
        $gain_category{$doc_category} += $rel/(log($rank+1)/log(2));
      }

    if(exists($qrel{$topic}->{$doc_id})) { # this document is in depth 100 pool
      $num_depth100 += 1; 
      $num_docs{$doc_category} += 1;

      if ($rel >= 0) { # this document is sampled
        $num_sampled{$doc_category} += 1;
      }
    }

    if ($rank >= 1000) {
      last;
      }
    }
    # Now estimate the average precision value
    $AP = 0;
    foreach $category_val (sort keys %{$docs_per_category{$topic}}){
      if($sampled_docs{$topic}->{$category_val} !=0) {
        
         #estimated number of relevant documents that fall in this category
         $rel_estimates_category = $sampled_rel{$topic}->{$category_val}*$docs_per_category{$topic}->{$category_val}/$sampled_docs{$topic}->{$category_val};
        
          if($num_rels{$topic} != 0) {
          # probability fo picking a relevant document from this category
          $prob_category = $rel_estimates_category/$num_rels{$topic};
         
          # expected value of average precision within this category
          $AP_category = 0;
          if($sampled_rel{$topic}->{$category_val} != 0) {
          $AP_category = $SAP_category{$category_val}/$sampled_rel{$topic}->{$category_val};
         } 
          # expected value of average precision
          $AP += $prob_category*$AP_category;
        } # end if
     } #end if
  } # end foreach

  # estimate the dcg value
   $dcg_val = 0;
   foreach $category_val (sort keys %{$docs_per_category{$topic}}){
     if($num_depth100!= 0) {
        # probability of picking a document from this category
        $prob_category = $num_docs{$category_val}/$num_depth100;
         
        if($num_sampled{$category_val} != 0) {
          $dcg_val += $prob_category*$gain_category{$category_val}/$num_sampled{$category_val};
        }
      }
   }

  #Now compute the NDCG value
  $ndcg_val = 0; 
  if($optimal_dcg{$topic} != 0) {
    $ndcg_val = $num_depth100*$dcg_val/$optimal_dcg{$topic};
   }

  #print "OPTIMAL DCG : $optimal_dcg{$topic} DEPTH100 : $num_depth100 DCG VAL : $dcg_val\n"; 
  #exit;

 if ($print_all_queries) {
         printf "infAP\t\t$topic\t\t%6.4f\n", $AP;
	 printf "infNDCG\t\t$topic\t\t%6.4f\n", $ndcg_val;
  }

 $sum_avg_prec += $AP;
 $sum_ndcg += $ndcg_val
}

$mean_avg_prec = $sum_avg_prec/$num_topics;
$mean_ndcg = $sum_ndcg/$num_topics;

printf "infAP\t\tall\t\t%6.4f\n", $mean_avg_prec;
printf "infNDCG\t\tall\t\t%6.4f\n", $mean_ndcg;