#!/usr/local/bin/perl

# Check a TREC 2002 filtering submission for various common errors:
#     * extra fields
#     * multiple run tags
#     * extraneous topics
#     * invalid retrieved documents
#     * duplicate retrieved documents in a single topic
# Messages regarding the submission are printed to an error log.
#
# Usage: check_input.pl task resultsfile
#     where task is one of `adaptive', `batch', or `routing'.
#
# Results input file is in the form
#     topic_num Q0 docno rank sim tag
#
# Exit status is 0 when no errors were logged and 255 when at least one
# error was logged; fatal problems (bad usage, unreadable files) die.
#
# The script uses the UNIX sort utility to ensure the input is sorted by
# increasing topic number and then by sim (for routing) or by docno (for
# adaptive/batch).  If run on a non-unix system, use the alternate open
# shown below, but make sure the input file is already sorted.
# Note that line numbers in the error output refer to the SORTED file!

use strict;
use warnings;

# Change these variable values to the directory in which the lists
# of DOCNO's reside and the directory where the error log should be put.
my $docno_dir   = "/trec/trec11/aux";
my $docnos_file = "$docno_dir/docnos.filtering";
my $errlog_dir  = ".";

# If more than 25 errors, then stop processing; something is drastically
# wrong with the file.
my $MAX_ERRORS = 25;

#open(LOGFILE, ">> /tmp/LOG-of-check_input.pl");

# These values are specific to the TREC 2002 tasks.
my $MIN_TOPIC = 101;
my $MAX_TOPIC = 200;
my $MAX_RET   = 1000;

if (@ARGV != 2) {
    print STDERR "Usage: $0 task resultsfile\n";
    print STDERR "\twhere task is either `adaptive', `batch', or `routing'\n";
    die "\n";
}

my ($task, $results_file) = @ARGV;

if ($task ne "adaptive" && $task ne "batch" && $task ne "routing") {
    print STDERR "$0: task must be either `adaptive', `batch', or `routing'\n";
    die "\n";
}

#printf LOGFILE ("================\n");
#printf LOGFILE ("docnos_file is:%s:\n", $docnos_file);

# Read in the list of valid docno's for this track.
# For filtering, use the file that gives the docno order,
# which is of the form
#     datestamp docno
# Each known docno maps to the topic string it was last retrieved for;
# the empty string means "not retrieved yet" (a topic string taken from a
# non-blank input line is never empty).
my %docnos;
open my $docno_fh, '<', $docnos_file
    or die "can't open docno's file `$docnos_file': $!\n";
while (my $line = <$docno_fh>) {
    chomp $line;
    my ($datestamp, $d) = split " ", $line;
    # Skip blank or truncated lines so they cannot register an empty docno.
    next unless defined $d;
    $docnos{$d} = "";
}
close $docno_fh;

# Sort the input file:
#     for routing:   by topic_num, then by sim (numeric, descending)
#     for filtering: by topic_num, then by docno (numeric, ascending)
# then read in each line.
# ASSUMES UNIX; FOR non-unix, comment out this open, and use the
# alternate open --- make sure the file is sorted!
# The list form of the pipe open runs sort without a shell, so unusual
# characters in the file name cannot be interpreted as shell syntax.
my @sort_keys = $task eq "routing"
    ? ('-k1,1', '-k5,5nr')
    : ('-k1,1', '-k3,3n');
open my $results_fh, '-|', 'sort', @sort_keys, '--', $results_file
    or die "Unable to open (or sort) results file $results_file: $!\n";
#open my $results_fh, '<', $results_file
#    or die "Unable to open results file $results_file: $!\n";

# The error log is named after the last path component of the results file.
# rindex returns -1 when there is no "/", which makes the substr start at 0.
my $errlog = $errlog_dir . "/"
    . substr($results_file, rindex($results_file, "/") + 1)
    . ".errlog";
open my $errlog_fh, '>', $errlog
    or die "Cannot open error log $errlog for writing: $!\n";

my $q0warn     = 0;    # set once the "Field 2 is not Q0" error was reported
my $num_errors = 0;    # total errors logged so far (maintained by error())
my $line_num   = 0;    # current line of the SORTED input
my $topic      = 0;    # numeric topic of the current line; 0 if invalid
my $old_topic;         # topic string of the previous accepted line
my $run_id;            # run tag remembered from the first line
my @num_ret;           # number of accepted documents, indexed by topic

LINE:
while (my $line = <$results_fh>) {
    chomp $line;
    $line_num++;
    next LINE if $line =~ /^\s*$/;

    # rank and sim are read but not validated.
    my ($topic_string, $q0, $docno, $rank, $sim, $tag, $rest)
        = split " ", $line;
    if (defined $rest) {
        error("Too many fields");
        die "\n";
    }
    # Short lines leave trailing fields undefined; treat them as empty so
    # the checks below report them as ordinary errors.
    for my $field ($q0, $docno, $tag) {
        $field = "" unless defined $field;
    }

    # make sure runtag is ok
    if (!defined $run_id || $run_id eq "") {
        # first line --- remember tag
        $run_id = $tag;
        if ($run_id !~ /^[A-Za-z0-9]{1,12}$/) {
            error("Run tag `$run_id' is malformed");
            next LINE;
        }
    }
    elsif ($tag ne $run_id) {
        # otherwise just make sure one tag used
        error("Run tag inconsistent (`$tag' and `$run_id')");
        next LINE;
    }

    # Process change of topic.  Because the input is sorted by topic, a bad
    # topic string is reported only once, on its first line.
    if (!defined $old_topic || $topic_string ne $old_topic) {
        $old_topic = $topic_string;
        if ($topic_string !~ /^R(\d+)$/) {
            error("Invalid string ($topic_string) for topic number");
            $topic = 0;
            next LINE;
        }
        $topic = $1;
        if ($topic < $MIN_TOPIC || $topic > $MAX_TOPIC) {
            error("Unknown topic ($topic_string)");
            $topic = 0;
            next LINE;
        }
    }

    # make sure second field is "Q0" (reported at most once per file)
    if ($q0 ne "Q0" && !$q0warn) {
        $q0warn = 1;
        error("Field 2 is `$q0' not `Q0'");
    }

    # make sure DOCNO known and not duplicated
    if (exists $docnos{$docno}) {
        # valid DOCNO; duplicates within a topic are adjacent-by-topic
        # thanks to the sort, so remembering the last topic is enough
        if ($docnos{$docno} eq $topic_string) {
            error("Document `$docno' retrieved more than once for topic $topic_string");
            next LINE;
        }
        $docnos{$docno} = $topic_string;
    }
    else {
        # invalid DOCNO
        error("Unknown document `$docno'");
        next LINE;
    }

    $num_ret[$topic]++;
}

# Closing a pipe reports the child's exit status: a failing sort (e.g. a
# missing results file) leaves $! at 0 and the status in $?.
close $results_fh
    or die "Unable to open (or sort) results file $results_file: "
         . ($! ? $! : "sort exited with status " . ($? >> 8)) . "\n";

# Do global check for routing runs: must retrieve no more than MAX_RET docs
# and warn about retrieving fewer than MAX_RET docs.
if ($task eq "routing") {
    for my $t ($MIN_TOPIC .. $MAX_TOPIC) {
        # Topics that never appeared have no counter yet.
        my $count = defined $num_ret[$t] ? $num_ret[$t] : 0;
        if ($count > $MAX_RET) {
            error("Too many documents ($count) retrieved for topic R$t");
        }
        if ($count < $MAX_RET) {
            print {$errlog_fh} "$0 of $results_file: WARNING: only $count documents retrieved for topic R$t\n";
        }
    }
}

print {$errlog_fh} "Finished processing $results_file\n";
close $errlog_fh
    or die "Close failed for error log $errlog: $!\n";
#close(LOGFILE);

print STDERR "Finished processing $results_file\n";
if ($num_errors) {
    exit 255;
}
exit 0;


# Print an error message to the error log, keeping track of the total
# number of errors.  Takes the message text as its single argument.
# Line numbers refer to the SORTED file since that is the actual input file.
# Once more than $MAX_ERRORS errors have been logged, the log is closed and
# the script exits with status 255.
sub error {
    my ($msg_string) = @_;

    print {$errlog_fh} "$0 of $results_file: Error on line $line_num --- $msg_string\n";

    $num_errors++;
    if ($num_errors > $MAX_ERRORS) {
        print {$errlog_fh} "$0 of $results_file: Quit. Too many errors!\n";
        close $errlog_fh
            or die "Close failed for error log $errlog: $!\n";
        exit 255;
    }
    return;
}