/*
 * Per-topic gain report for ranked, stratified relevance judgments.
 *
 * Reads ranked result rows from stdin and, optionally, a file of per-stratum
 * false-positive / false-negative rates named by argv[1].  For every topic it
 * prints, at selected ranks, the (corrected) estimate of relevant and
 * nonrelevant documents returned so far, next to the estimate obtained by
 * summing the submitted scores.
 *
 * NOTE: the header names on the original #include lines were lost; the three
 * below are the ones the code in this file actually requires.
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/*
==> ../submissions.joined/DUTHlrgA <==
200 3.1131864.J0J3TR2N0RG3S14J0YZZGLYEO0TKDI5BA 1 0.998028 DUTHlrgA 100 1
*/

/*
==> qrel_corr_file
==> ct
==> strata fpr fnr
*/

/* R numbers
200 2543.52
201 2366.28
202 4615.27
203 4944.23
204 6361.83
205 67438.43
206 929.09
207 20929.17
*/

enum {
    NUM_STRATA = 4,      /* strata are numbered 1..NUM_STRATA; row 0 is unused */
    NUM_JUDGMENTS = 3,   /* judgment values -1, 0 and 1 */
    MAX_TOPICS = 16,     /* capacity of RT */
    MAX_DOCS = 1000000   /* capacity of the per-rank arrays (ranks 1..MAX_DOCS-1) */
};

/*
 * Column used for judgment value r (-1, 0 or 1) in Rel[][] and RT[][][].
 * The counters are read as: 1 -> "rels", 0 -> "nrels", -1 -> extra documents
 * that are scaled in proportionally (presumably unjudged -- confirm).
 */
#define JIDX(r) ((r) + 1)

/* Second index of corr[][]: false-positive and false-negative rate. */
#define FP 0
#define FN 1

int n, b, top, batch, rel, oldtop;
int Rel[NUM_STRATA + 1][NUM_JUDGMENTS];            /* counts so far in the current topic */
int RT[MAX_TOPICS][NUM_STRATA + 1][NUM_JUDGMENTS]; /* whole-topic counts, by topic ordinal */
int ct;                                            /* ordinal of the current topic */
double score, auc, relsp, nrelsp, rels, nrels, estrel, bestF,
       Nrels[MAX_DOCS], Rels[MAX_DOCS], Estrel[MAX_DOCS],
       Relsp[MAX_DOCS], Nrelsp[MAX_DOCS];
double inc;
double corr[NUM_STRATA + 1][2];
FILE *corrf = NULL;

/*
 * Map a batch size as it appears in the input to a stratum number 1..4.
 * Returns 0 for any other value.  The accepted sizes are exactly 100, 1000,
 * 10000 and 1000000 (100000 is not accepted).
 */
static int stratum_of(int batch_size)
{
    switch (batch_size) {
    case 100:
        return 1;
    case 1000:
        return 2;
    case 10000:
        return 3;
    case 1000000:
        return 4;
    default:
        return 0;
    }
}

/*
 * Add one stratum's contribution to *rel_sum and *nrel_sum.
 *
 * q_rels / q_nrels are the uncorrected estimates; rates[FP] and rates[FN] are
 * the stratum's error rates.  The corrected values are
 *
 *   true_rels  = [(fp-1)*rels + fp*nrels] / (fp+fn-1)
 *   true_nrels = [fn*(rels+nrels) - nrels] / (fp+fn-1)
 *
 * When fp+fn == 1 the correction is undefined (division by zero), and when a
 * corrected value comes out negative it is rejected; in both cases the
 * uncorrected estimate is used instead.
 */
static void add_corrected(double q_rels, double q_nrels, const double rates[2],
                          double *rel_sum, double *nrel_sum)
{
    if (rates[FP] + rates[FN] == 1) {
        *rel_sum += q_rels;
        *nrel_sum += q_nrels;
        return;
    }

    double true_rels = ((rates[FP] - 1.0) * q_rels + rates[FP] * q_nrels)
                       / (rates[FP] + rates[FN] - 1);
    double true_nrels = (rates[FN] * (q_rels + q_nrels) - q_nrels)
                        / (rates[FP] + rates[FN] - 1);

    *rel_sum += true_rels < 0 ? q_rels : true_rels;
    *nrel_sum += true_nrels < 0 ? q_nrels : true_nrels;
}

/*
 * Recompute the globals rels and nrels from Rel[][]: the estimated number of
 * relevant / nonrelevant documents among those counted so far in the current
 * topic.  Within each stratum the 1:0 ratio of the counted judgments is
 * scaled up to all documents of the stratum (including the -1 column) and
 * then passed through the FP/FN correction.
 */
void stats(void)
{
    rels = nrels = 0;
    for (int i = 1; i <= NUM_STRATA; i++) {
        int pos = Rel[i][JIDX(1)];
        int neg = Rel[i][JIDX(0)];

        if (!(pos || neg)) {
            continue; /* nothing to base a ratio on */
        }

        int other = Rel[i][JIDX(-1)];
        double q_rels = (double)pos / (neg + pos) * (other + neg + pos);
        double q_nrels = (double)neg / (neg + pos) * (other + neg + pos);

        add_corrected(q_rels, q_nrels, corr[i], &rels, &nrels);
    }
}

/*
 * Recompute the globals relsp and nrelsp: the same estimate as stats(), but
 * for the documents of the current topic that have NOT been counted yet,
 * i.e. the whole-topic totals RT[ct] minus the running counts Rel.
 */
void relpast(void)
{
    relsp = nrelsp = 0;
    for (int j = 1; j <= NUM_STRATA; j++) {
        double srel = RT[ct][j][JIDX(1)] - Rel[j][JIDX(1)];
        double snrel = RT[ct][j][JIDX(0)] - Rel[j][JIDX(0)];
        double sunj = RT[ct][j][JIDX(-1)] - Rel[j][JIDX(-1)];

        if (!(srel || snrel)) {
            continue;
        }

        double q_rels = srel / (srel + snrel) * (srel + snrel + sunj);
        double q_nrels = snrel / (srel + snrel) * (srel + snrel + sunj);

        add_corrected(q_rels, q_nrels, corr[j], &relsp, &nrelsp);
    }
}

/*
 * Agreement of two values in percent: 100 - 100 * (hi - lo) / hi, where hi is
 * the larger and lo the smaller argument.  Only called from the optional
 * PRINT_F1_SUMMARY code in doit().
 */
double errcalc(double a, double b)
{
    if (b > a) {
        return errcalc(b, a);
    }
    return 100 - 100 * (a - b) / a;
}

/*
 * Ranks at which doit() prints a table row: a fixed list plus the last rank.
 * NOTE(review): 10000 is absent from the list although 100000 is present;
 * this reproduces the original condition exactly -- confirm whether intended.
 */
static int is_checkpoint(int rank, int last)
{
    static const int marks[] = {
        10, 20, 50, 100, 200, 500, 1000, 2000, 5000,
        20000, 50000, 100000, 200000, 500000
    };

    if (rank == last) {
        return 1;
    }
    for (size_t k = 0; k < sizeof marks / sizeof marks[0]; k++) {
        if (rank == marks[k]) {
            return 1;
        }
    }
    return 0;
}

/*
 * Topic boundary handler.
 *
 * If a topic has been accumulated (oldtop != 0), print its table from the
 * per-rank arrays and advance the topic ordinal ct.  Then reset the running
 * state for the topic in `top` and, when a correction file is open, read that
 * topic's record from it: a count followed by that many "batch fpr fnr"
 * triples.  Exits with status 1 on a malformed correction record.
 *
 * Compile with -DPRINT_F1_SUMMARY to additionally print the one-line F1
 * summary per topic (disabled by default, as in the original).
 */
void doit(void)
{
    stats(); /* kept from the original; refreshes rels/nrels from Rel */

    if (oldtop) {
#ifdef PRINT_F1_SUMMARY
        int bn = 0, bnest = 0;
        double bF = 0, best_est_f = 0, bFest = 0;
#endif
        printf("==== Topic %d ====\n", oldtop);
        printf("Documents returned Relevant Nonrelevant Estmated Rel Estimated Non\n");

        for (int i = 1; i <= n; i++) {
            /* Fraction of all (estimated) relevant / nonrelevant found by rank i. */
            double R = Rels[i] / (Rels[i] + Relsp[i]);
            double NR = Nrels[i] / (Nrels[i] + Nrelsp[i]);
            /* Same fraction according to the submitted scores alone. */
            double estR = Estrel[i] / Estrel[n];

#ifdef PRINT_F1_SUMMARY
            double P = Rels[i] / i;
            double F = 2 / (1 / P + 1 / R);
            double estP = Estrel[i] / i;
            double estF = 2 / (1 / estP + 1 / estR);

            if (F > bF) {
                bF = F;
                bn = i;
            }
            if (estF > best_est_f) {
                best_est_f = estF;
                bFest = F;
                bnest = i;
            }
#endif
            if (is_checkpoint(i, n)) {
                printf("%7d (%5.3lf) %7.0lf (%5.3lf) %7.0lf (%5.3lf) %7.0lf (%5.3lf) %7.0lf (%5.3lf)\n",
                       i, (double)i / n,
                       Rels[i], R,
                       Nrels[i], NR,
                       Estrel[i], estR,
                       i - Estrel[i], (i - Estrel[i]) / (n - Estrel[n]));
            }
        }
        ct++;
#ifdef PRINT_F1_SUMMARY
        printf("%d Rel %0.0lf estRel %0.0lf acc %0.1lf%% possible F1 %0.1lf%% estimated %0.1lf%% actual F1 %0.1lf%% Ferr %0.1lf%% %d %d %0.1lf%%\n",
               oldtop, Rels[n], Estrel[n], errcalc(Rels[n], Estrel[n]),
               100 * bF, 100 * best_est_f, 100 * bFest,
               errcalc(best_est_f, bFest), bn, bnest, errcalc(bn, bnest));
#endif
    }

    /* Reset the running state for the new topic. */
    bestF = 0;
    estrel = 0;
    n = 0;
    oldtop = top;
    memset(Rel, 0, sizeof(Rel));
    memset(corr, 0, sizeof(corr));

    /* Read the fpr/fnr record for the new topic.  The counter is a local so
     * that it cannot be confused with the global topic ordinal ct. */
    int ntriples;
    if (corrf && fscanf(corrf, "%d", &ntriples) == 1) {
        while (ntriples-- > 0) {
            int str;
            double fpr, fnr;

            if (fscanf(corrf, "%d %lf %lf", &str, &fpr, &fnr) != 3) {
                fprintf(stderr, "oops - truncated or malformed corr file\n");
                fclose(corrf);
                exit(1);
            }

            int s = stratum_of(str);
            if (!s) {
                fprintf(stderr, "oops - bad batch %d in corr file\n", str);
                fclose(corrf);
                exit(1);
            }
            corr[s][FP] = fpr;
            corr[s][FN] = fnr;
        }
    }
}

/*
 * Usage: prog [corr_file] < rows
 *
 * Each stdin row is parsed with "%d%*s%*s%lf%*s%d%d": topic, two skipped
 * tokens, score, one skipped token, batch size, judgment (-1, 0 or 1).
 *
 * Pass 1 copies the rows to a temporary file while counting the whole-topic
 * totals RT; pass 2 replays them, maintaining the running counts and filling
 * the per-rank arrays that doit() prints at each topic change.
 *
 * Returns 0 on success, 1 on invalid input or an I/O failure.
 */
int main(int argc, char *argv[])
{
    FILE *tmp = NULL;
    int rc = 1;

    if (argc > 1) {
        corrf = fopen(argv[1], "r");
        if (!corrf) {
            /* As before, carry on without correction -- but say so. */
            perror(argv[1]);
        }
    }

    tmp = tmpfile();
    if (!tmp) {
        perror("tmpfile");
        goto out;
    }

    /* Pass 1: spool the rows and count per-topic totals. */
    while (4 == scanf("%d%*s%*s%lf%*s%d%d", &top, &score, &batch, &rel)) {
        fprintf(tmp, "%d %lf %d %d\n", top, score, batch, rel);
        if (oldtop && top != oldtop) {
            ct++;
        }
        b = stratum_of(batch);
        if (!b) {
            fprintf(stderr, "oops - bad batch %d\n", batch);
            goto out;
        }
        if (rel < -1 || rel > 1) {
            fprintf(stderr, "oops - bad rel %d\n", rel);
            goto out;
        }
        if (ct >= MAX_TOPICS) {
            fprintf(stderr, "oops - more than %d topics\n", (int)MAX_TOPICS);
            goto out;
        }
        RT[ct][b][JIDX(rel)]++;
        oldtop = top;
    }

    if (ferror(tmp) || fseek(tmp, 0, SEEK_SET) != 0) {
        perror("tmpfile");
        goto out;
    }

    /* Pass 2: replay the spooled rows and build the per-rank arrays. */
    ct = 0;
    oldtop = 0;
    while (4 == fscanf(tmp, "%d %lf %d %d", &top, &score, &batch, &rel)) {
        if (top != oldtop) {
            doit();
        }
        b = stratum_of(batch);
        if (!b) {
            fprintf(stderr, "oops - bad batch %d\n", batch);
            goto out;
        }
        if (n + 1 >= MAX_DOCS) {
            fprintf(stderr, "oops - more than %d rows in topic %d\n",
                    (int)MAX_DOCS - 1, top);
            goto out;
        }

        Rel[b][JIDX(rel)]++;
        n++;
        estrel += score;
        Estrel[n] = estrel;

        stats();
        relpast();
        Rels[n] = rels;
        Relsp[n] = relsp;
        Nrels[n] = nrels;
        Nrelsp[n] = nrelsp;
    }
    doit(); /* flush the last topic */
    rc = 0;

out:
    if (tmp) {
        fclose(tmp);
    }
    if (corrf) {
        fclose(corrf);
    }
    return rc;
}