#!/usr/bin/perl =head1 NAME eval-polarity.pl - Evaluate a polarity subtask run =head1 SYNOPSIS eval-polarity.pl [options] run [adhoc-equiv] Options -qrels Specify which qrels file to use (default is qrels.opinion) -maxrank Maximum rank to score runs (default: 1000) -filter Output a trec_eval-compatible version of the run =head1 DESCRIPTION Computes measures for the polarity subtask of the blog track. To run this script, you need both the polarity run and the opinion run on which it is based. You can either provide the opinion run on the command line (adhoc-equiv), or this script will look it up in the runs_table if it exists. The run can be a literal run filename, or it can be a run found in a standard place defined in the script as '$runs_root'. For use of this script anywhere but NIST, you should give pathnames directly to your polarity and corresponding opinion task run. =head1 EVALUATION This script measures the polarity run somewhat counterintuitively: it first notes all the classifications in the polarity run file. Then, it processes the opinoin run file in (topic, descending score) order to compute the measures at each rank. The reason for this is that the polarity task is not a traditional text classification task as far as the evaluation is concerned. Because the run is classifying the documents they retrieved in a ranking, rather than the entire collection or an otherwise fixed set of documents, the polarity run will contain unjudged (unlabeled) documents and may not provide a label for all judged (labeled) documents. To produce a score that permits comparisons among runs, we score the run as if it were a ranked retrieval run. For example, the main measure for this task is R-accuracy. That is, we compute the fraction of classifications that are correct above rank R, where R is the number of known labeled documents. =head1 NOTES AND BUGS This script is relatively new and may contain bugs. The script grew out of a Perl implementation of several measures in trec_eval. Extending trec_eval to handle new measures is not hard, but to more complicated schemes where there are multiple runfiles being examined at once, it's less easy. A good test of this script is to try and reproduce the numbers using trec_eval (and whatever prefiltering of the run needs to happen). The '-f' option can be used to produce a version of the polarity run that can be run through trec_eval; when you do so, the R-precision value from trec_eval is the same as Raccuracy. New measures are extended by adding measure classes at the end of the script, and then adding instances of those measures to the lists at the beginning of the script. The %qrel hash contains "measures" that are qrels file stats, as well as the qrels itself; the @measures and @qrel_measures lists are the actual measures that are printed for the run being processed. =cut use strict; use Getopt::Long; use Pod::Usage; my $root = "/trec/trec16"; my $runs_table = "$root/reports/runs_table"; my $track_root = "$root/blog"; my $runs_root = "$track_root/results"; my $qrels = "$track_root/eval/qrels.opinion"; my $max_rank = 1000; my $filter_out = undef; GetOptions('qrels=s' => \$qrels, 'maxrank=i' => \$max_rank, 'filter=s' => \$filter_out, ) or die pod2usage(2); my $run = shift or die pod2usage(); my $opinion_run = shift or undef; my $runtag; my $runfile; my %polarity; my %qrel = ('num_rel' => Sum->new(name => 'num_rel'), 'num_opinion' => Sum->new(name => 'num_opinion'), 'num_2' => Sum->new(name => 'num_2'), 'num_3' => Sum->new(name => 'num_3'), 'num_4' => Sum->new(name => 'num_4'), ); my @qrel_measures = ($qrel{num_opinion}, $qrel{num_2}, $qrel{num_3}, $qrel{num_4}); my @measures = (Rel_Ret->new(2), Raccuracy->new(), AccAtCutoff->new(minrel => 2, cutoff => 5, name => 'Acc5'), AccAtCutoff->new(minrel => 2, cutoff => 10, name => 'Acc10'), AccAtCutoff->new(minrel => 2, cutoff => 15, name => 'Acc15'), AccAtCutoff->new(minrel => 2, cutoff => 20, name => 'Acc20'), AccAtCutoff->new(minrel => 2, cutoff => 30, name => 'Acc30'), AccAtCutoff->new(minrel => 2, cutoff => 100, name => 'Acc100'), AccAtCutoff->new(minrel => 2, cutoff => 200, name => 'Acc200'), AccAtCutoff->new(minrel => 2, cutoff => 500, name => 'Acc500'), AccAtCutoff->new(minrel => 2, cutoff => 1000, name => 'Acc1000'), ); if (-r $run) { $runfile = $run; ($runtag = $run) =~ s/\.gz//; } elsif (-r "$runs_root/$run/input") { $runfile = "$runs_root/$run/input"; $runtag = $run; } elsif (-r "$runs_root/$run/input.gz") { $runfile = "$runs_root/$run/input.gz"; $runtag = $run; } $runfile = "gzip -dc $runfile |" if $runfile =~ /\.gz$/; open RUN, $runfile or die "Can't read runfile for $run: $!\n"; while () { chomp; my @fields = split; die "Error: wrong number of fields in $run at line $.\n" unless scalar @fields == 3; my ($topic, $docno, $label) = split; $polarity{$topic}{$docno} = $label; } close RUN; if (!defined $opinion_run and -r $runs_table) { open RUNSTABLE, $runs_table or die "Can't read $runs_table: $!\n"; while () { next unless /^$runtag:/; my @fields = split /:/; $opinion_run = $fields[8]; last; } close RUNTABLE; } die "No opinion run given or found in runs table\n" unless defined $opinion_run; die "Can't find opinion run $opinion_run\n" unless -r "$runs_root/$opinion_run/input.gz"; my $opinion_runfile = "$runs_root/$opinion_run/input.gz"; my $cur_topic; my @run_docnos; my %run_sims; my %run_ranks; my @topics; my %eval; open QRELS, $qrels or die "Can't read qrels file $qrels: $!\n"; while () { chomp; my ($topic, undef, $docno, $rel) = split; $qrel{$topic}{$docno} = $rel; $qrel{num_rel}->accumulate($topic) if $rel > 0; $qrel{num_opinion}->accumulate($topic) if $rel > 1; $qrel{"num_$rel"}->accumulate($topic) if $rel > 1; } close QRELS; open RUN, "gzip -dc $opinion_runfile |" or die "Can't read $run: $!\n"; if (defined $filter_out) { open OUT, ">$filter_out" or die "Can't write to $filter_out: $!\n"; } while () { chomp; my ($topic, undef, $docno, $rank, $sim, $tag) = split; next unless exists $qrel{$topic}; if (!exists $polarity{$topic}{$docno}) { die "Opinion run contains an unclassified document ($topic, $docno)\n"; } if (defined $filter_out) { if ($polarity{$topic}{$docno} == $qrel{$topic}{$docno}) { print OUT "$topic 0 $docno $rank $sim $tag\n"; } else { print OUT "$topic 0 no-$docno $rank $sim $tag\n"; } } $cur_topic = $topic if (!defined $cur_topic); if ($cur_topic != $topic) { @run_docnos = sort by_sim_docno @run_docnos; compute_measures($cur_topic, \@run_docnos); @run_docnos = (); %run_sims = (); %run_ranks = (); push @topics, $cur_topic; $cur_topic = $topic; } next if scalar(@run_docnos > $max_rank); if (exists $run_sims{$docno}) { die "Document $docno retrieved more than once for topic $topic, line $.\n"; } push @run_docnos, $docno; $run_sims{$docno} = $sim; $run_ranks{$docno} = $rank; $eval{$topic}{num_ret}++; } close RUN; if (defined $filter_out) { close OUT; } # Last topic @run_docnos = sort by_sim_docno @run_docnos; push @topics, $cur_topic; compute_measures($cur_topic, \@run_docnos); compute_averages(); # - - - - - - - - - - - sub by_sim_docno { $run_sims{$b} <=> $run_sims{$a} or $b cmp $a } sub compute_measures { my ($topic, $docs) = @_; my $rank = 1; for my $doc (@$docs) { for my $meas (@measures) { $meas->accumulate($topic, $rank, $doc); } $rank++; } for my $meas (@qrel_measures, @measures) { $meas->print($topic); } } sub compute_averages { for my $meas (@qrel_measures, @measures) { $meas->print_avg; } } ###### # # Definitions of measures { package Measure; # Measure base class. # Implements a simple measure that has an increment counter per topic # and a floating-point average. sub new { my $class = shift; my %params = @_; my $self = { NAME => $params{name} || 'count', VALUE => {}, }; bless($self, $class); return $self; } # Ensure that a VALUE entry exists for a topic. sub touch { my ($self, $topic) = @_; $self->{VALUE}{$topic} = 0 unless exists $self->{VALUE}{$topic}; } # 'accumulate' is an operation done for each document in a ranked list. # This implementation bumps the counter. sub accumulate { my ($self, $topic, $rank, $docno) = @_; # print "Measure.accumulate $topic $rank $docno\n"; $self->{VALUE}{$topic}++; return $self->{VALUE}{$topic}; } # 'normalize' should return a normalized value for a topic; it does # not affect the VALUE entry. 'normalize' is called when printing # or averaging the VALUE. # This implementation just returns the counter itself. sub normalize { my ($self, $topic) = @_; # print "Measure.norm $topic\n"; return $self->{VALUE}{$topic}; } # 'average' should return a reduction over all VALUEs. Typically this is # an average but might be a sum, harmonic mean, or whatever. I wanted # to call it 'reduce' but that seemed just too obscure. # This implementation returns an arithmetic mean. sub average { my $self = shift; my $sum = 0; my $num_topics = 0; for my $topic (keys %{ $self->{VALUE} }) { $sum += $self->normalize($topic); $num_topics++; } my $avg = $sum / $num_topics; # print "Measure.avg is $avg ($sum / $num_topics)\n"; return $avg; } # Get/set the name of the measure. sub name { my $self = shift; if (@_) { $self->{NAME} = shift; } return $self->{NAME}; } # Get the value. Note that values are only set by 'accumulate', not # by this method. sub value { my ($self, $topic) = @_; return $self->{VALUE}{$topic}; } # Print a normalized value for a topic sub print { my ($self, $topic) = @_; my $val = $self->normalize($topic); if ($val == int($val)) { printf "%-15s\t%s\t%d\n", $self->name, $topic, $val; } else { printf "%-15s\t%s\t%6.4f\n", $self->name, $topic, $val; } } # Print the average over all topics sub print_avg { my $self = shift; my $avg = $self->average; if ($avg == int($avg)) { printf "%-15s\tall\t%d\n", $self->name, $self->average; } else { printf "%-15s\tall\t%6.4f\n", $self->name, $self->average; } } } { package Sum; # The default measure reduces with a mean; this reduces with a sum. use base qw/Measure/; sub average { my $self = shift; my $sum = 0; for my $topic (keys %{ $self->{VALUE} }) { $sum += $self->{VALUE}{$topic}; } return $sum; } } { package SetMeasure; # A measure that computes a set of submeasures along with it. use base qw/Measure/; sub new { my $class = shift; my $self = $class->SUPER::new(@_); $self->{LIST} => []; bless($self, $class); return $self; } sub print_set { my ($self, $topic) = @_; for my $meas (@{ $self->{LIST} }) { $self->{$meas}->print($topic); } } sub print_set_avgs { my $self = shift; for my $meas (@{ $self->{LIST} }) { $self->{$meas}->print_avg; } } sub print { my ($self, $topic) = @_; $self->print_set($topic); $self->SUPER::print($topic); } sub print_avg { my $self = shift; $self->print_set_avgs; $self->SUPER::print_avg; } } { package Rel_Ret; use base qw/Sum Measure/; sub new { my $class = shift; my $rel_lvl = shift || 1; my $self = $class->SUPER::new(); if ($rel_lvl != 1) { $self->{NAME} = "rel_${rel_lvl}_ret"; } else { $self->{NAME} = 'rel_ret'; } $self->{REL_LVL} = $rel_lvl; bless($self, $class); return $self; } sub accumulate { my ($self, $topic, $rank, $docno) = @_; $self->touch($topic); $self->SUPER::accumulate($topic, $rank, $docno) if $qrel{$topic}{$docno} >= $self->{REL_LVL}; } } { package Raccuracy; use base qw/SetMeasure/; sub new { my $class = shift; my $self = $class->SUPER::new(name => 'Raccuracy'); $self->{num_correct} = Sum->new(name => 'Racc_ncorr'); $self->{num_correct_2} = Sum->new(name => 'Racc_ncorr_2'); $self->{num_correct_3} = Sum->new(name => 'Racc_ncorr_3'); $self->{num_correct_4} = Sum->new(name => 'Racc_ncorr_4'); $self->{LIST} = [ 'num_correct', 'num_correct_2', 'num_correct_3', 'num_correct_4' ]; bless($self, $class); return $self; } sub accumulate { my ($self, $topic, $rank, $docno) = @_; $self->touch($topic); if ($rank <= $qrel{num_opinion}->value($topic) and $polarity{$topic}{$docno} == $qrel{$topic}{$docno}) { $self->SUPER::accumulate($topic, $rank, $docno); $self->{num_correct}->accumulate($topic); $self->{"num_correct_$polarity{$topic}{$docno}"}->accumulate($topic); } } sub normalize { my ($self, $topic) = @_; return $self->{VALUE}{$topic} / $qrel{num_opinion}->value($topic); } } { package AvePrec; use base qw/Measure/; sub new { my $class = shift; my $self = $class->SUPER::new(name => 'map'); bless($self, $class); return $self; } sub accumulate { my ($self, $topic, $rank, $docno) = @_; $self->touch($topic); if ($qrel{$topic}{$docno} > 0) { $self->{REL_SO_FAR}{$topic}++; $self->{VALUE}{$topic} += $self->{REL_SO_FAR} / $rank; } } sub normalize { my ($self, $topic) = @_; return $self->{VALUE}{$topic} / $qrel{num_rel}->value($topic); } } { package PrecAtCutoff; use base qw/Measure/; sub new { my $class = shift; my %params = @_; my $cutoff = $params{cutoff} || $max_rank; my $rel_lvl = $params{minrel} || 1; my $self = $class->SUPER::new(); if (exists $params{name}) { $self->{NAME} = $params{name}; } else { $self->{NAME} = "P$cutoff"; } $self->{CUTOFF} = $cutoff; $self->{REL_LVL} = $rel_lvl; bless($self, $class); return $self; } sub accumulate { my ($self, $topic, $rank, $docno) = @_; $self->touch($topic); if ($rank <= $self->{CUTOFF} and $qrel{$topic}{$docno} >= $self->{REL_LVL}) { $self->{VALUE}{$topic} += 1.0 / $self->{CUTOFF}; } } } { package AccAtCutoff; use base qw/PrecAtCutoff/; sub accumulate { my ($self, $topic, $rank, $docno) = @_; $self->touch($topic); if ($rank <= $self->{CUTOFF} and $qrel{$topic}{$docno} == $polarity{$topic}{$docno}) { $self->{VALUE}{$topic} += 1.0 / $self->{CUTOFF}; } } }