#!/usr/bin/perl -w use strict; # Check a TREC 2007 million-query track retrieval track submission for various # common errors: # * extra fields # * multiple run tags # * missing or extraneous topics # * invalid retrieved documents # * duplicate retrieved documents in a single topic # * too many documents retrieved for a topic # * fewer than maximum allowed retrieved for a topic (warning) # Messages regarding submission are printed to an error log # Results input file is in the form # topic_num Q0 docno rank sim tag # Script uses UNIX sort routine to ensure input is sorted by increasing # topic number and decreasing sim. If run on non-unix system, # use alternate open command, but make sure input file is sorted # Change these variable values to the directory in which the error log # should be put my $errlog_dir = "."; # If more than 25 errors, then stop processing; something drastically # wrong with the file. my $MAX_ERRORS = 25; my @topics; my $MAX_RET = 1000; my $task; # task run is submitted to (argument) my %docnos; # hash of all valid docnos my %numret; # number of docs retrieved per topic my $results_file; # input file to be checked my $errlog; # file name of error log my ($q0warn, $num_errors); # flags for errors detected my $d; # current docid my $line; # current input line my ($topic_string,$q0,$docno,$rank,$sim,$tag,$rest); my $line_num; # current input line number my ($topic, $old_topic); my $run_id; my $found; my ($i,$t,$col1,$col2,$last_i); my $usage = "Usage: $0 (trial|official) resultsfile\n"; $task = shift @ARGV or die $usage; $results_file = shift @ARGV or die $usage; # Initialize data structures used in checks if ($task eq 'trial') { @topics = 1 .. 100; } elsif ($task eq 'official' or $task eq 'unpooled') { @topics = 1 .. 10000; } else { die $usage; } # number retrived difficulty value assigned per topic foreach $t (@topics) { $numret{$t} = 0; } # Sort the input file by topic_num, sim and read result # ASSUMES UNIX; FOR non-unix, comment out this open, and use # alternate open --- make sure file is sorted! open RESULTS, "nl -s' ' -nln -ba -fn $results_file | sort -k2,2n -k6,6gr |" || die "Unable to open (or sort) results file $results_file: $!\n"; #open RESULTS, "<$results_file" || # die "Unable to open results file $results_file: $!\n"; my @path = split "/", $results_file; my $base = pop @path; $errlog = $errlog_dir . "/" . $base . ".errlog"; open ERRLOG, ">$errlog" || die "Cannot open error log for writing\n"; $q0warn = 0; $num_errors = 0; $line_num = 0; $old_topic = "-1"; $run_id = ""; while ($line = ) { chomp $line; next if ($line =~ /^\s*$/); undef $tag; my @fields = split " ", $line; $line_num++ unless scalar(@fields) == 7; if (scalar(@fields) == 7) { ($line_num,$topic_string,$q0,$docno,$rank,$sim,$tag) = @fields; } elsif (scalar(@fields) == 6) { # Input was sorted outside the check script ($topic_string,$q0,$docno,$rank,$sim,$tag) = @fields; } else { &error("Too many fields"); exit 255; } # make sure runtag is ok if (! $run_id) { # first line --- remember tag $run_id = $tag; if ($run_id !~ /^[A-Za-z0-9]{1,12}$/) { &error("Run tag `$run_id' is malformed"); next; } } else { # otherwise just make sure one tag used if ($tag ne $run_id) { &error("Run tag inconsistent (`$tag' and `$run_id')"); next; } } # get topic number if ($topic_string ne $old_topic) { $old_topic = $topic_string; while ($topic_string =~ /^0/) { $topic_string = substr $topic_string, 1; } $topic = $topic_string; if (!exists($numret{$topic})) { &error("Unknown topic ($topic_string)"); $topic = 0; next; } %docnos = (); } # make sure second field is "Q0" if ($q0 ne "Q0" && ! $q0warn) { $q0warn = 1; &error("Field 2 is `$q0' not `Q0'"); } # make sure DOCNO known and not duplicated if ($docno =~ /^GX\d\d\d-\d\d-\d\d\d\d\d\d\d\d?$/) { # valid DOCNO if (exists $docnos{$docno} && $docnos{$docno} eq $topic) { &error("Document `$docno' retrieved more than once for topic $topic"); next; } $docnos{$docno} = $topic; } else { # invalid DOCNO &error("Unknown document `$docno'"); next; } # remove leading 0's from rank (but keep final 0!) $rank =~ s/^0*//; if (! $rank) { $rank = "0"; } $numret{$topic}++; } # Do global checks: # error if some topic has no (or too many) documents retrieved for it # warning if too few documents retrieved for a topic foreach $t (@topics) { if ($numret{$t} == 0) { &error("No documents retrieved for topic $t"); } elsif ($numret{$t} > $MAX_RET) { &error("Too many documents ($numret{$t}) retrieved for topic $t"); } elsif ($numret{$t} < $MAX_RET) { print ERRLOG "$0 of $results_file: WARNING: only $numret{$t} documents retrieved for topic $t\n" } } print ERRLOG "Finished processing $results_file\n"; close ERRLOG || die "Close failed for error log $errlog: $!\n"; if ($num_errors) { exit 255; } exit 0; # print error message, keeping track of total number of errors # line numbers refer to SORTED file since that is the actual input file sub error { my $msg_string = pop(@_); print ERRLOG "$0 of $results_file: Error on line $line_num --- $msg_string\n"; $num_errors++; if ($num_errors > $MAX_ERRORS) { print ERRLOG "$0 of $results_file: Quit. Too many errors!\n"; close ERRLOG || die "Close failed for error log $errlog: $!\n"; exit 255; } }