TREC 2025 Proceedings

TREC 2025 RAG TREC Instrument for Multilingual Evaluation (RAGTIME) Track, Report Generation Task — Appendix: autoargue-scores.tsv

Runtag [per-run artifact links] Org f1_macro citation_support_macro nugget_coverage_macro
lg_nt_4q12r3l_mt_c (autoargue)  (almost-human-judgments.tsv)  (almost-human-scores.tsv)  (autoargue-scores.tsv)  (autoargue-judgments.jsonl)  hltcoe-multiagt 0.6116359675016687 0.9487199063559437 0.49786061764981815
hltime-gpt5.searcher (autoargue)  (almost-human-judgments.tsv)  (almost-human-scores.tsv)  (autoargue-scores.tsv)  (autoargue-judgments.jsonl)  hltcoe-rerank 0.6012049281826225 0.7500727150732944 0.5691143243581325
genaius-question (autoargue)  (almost-human-judgments.tsv)  (almost-human-scores.tsv)  (autoargue-scores.tsv)  (autoargue-judgments.jsonl)  (paper) GenAIus 0.595664965979114 0.7054152869892948 0.5610889271976733
lg_nt_4q12r3l_natv_c (autoargue)  (almost-human-judgments.tsv)  (almost-human-scores.tsv)  (autoargue-scores.tsv)  (autoargue-judgments.jsonl)  hltcoe-multiagt 0.5916601290813751 0.9323258824501193 0.48359751103274595
hltime-lg.crux (autoargue)  (almost-human-judgments.tsv)  (almost-human-scores.tsv)  (autoargue-scores.tsv)  (autoargue-judgments.jsonl)  hltcoe-rerank 0.5889859885331483 0.7785277533094694 0.5057893345676765
hltime-lg.searcher (autoargue)  (almost-human-judgments.tsv)  (almost-human-scores.tsv)  (autoargue-scores.tsv)  (autoargue-judgments.jsonl)  hltcoe-rerank 0.580770528961303 0.7472888659996118 0.5238329962344259
gptr_nt_q4d4_mt (autoargue)  (almost-human-judgments.tsv)  (almost-human-scores.tsv)  (autoargue-scores.tsv)  (autoargue-judgments.jsonl)  hltcoe-multiagt 0.5747102783283856 0.838620271523601 0.4778238471953187
hltime-lg.jina (autoargue)  (almost-human-judgments.tsv)  (almost-human-scores.tsv)  (autoargue-scores.tsv)  (autoargue-judgments.jsonl)  hltcoe-rerank 0.562693084548646 0.7593164712125847 0.4997143052267346
hltime-lg.fsrrfprf (autoargue)  (almost-human-judgments.tsv)  (almost-human-scores.tsv)  (autoargue-scores.tsv)  (autoargue-judgments.jsonl)  hltcoe-rerank 0.5624172754722127 0.7865986347792565 0.4799104802348747
extractive-rag (autoargue)  (almost-human-judgments.tsv)  (almost-human-scores.tsv)  (autoargue-scores.tsv)  (autoargue-judgments.jsonl)  coordinators 0.5620692627640783 0.8109880978105304 0.4982562834684488
auto_swarm_mt (autoargue)  (almost-human-judgments.tsv)  (almost-human-scores.tsv)  (autoargue-scores.tsv)  (autoargue-judgments.jsonl)  hltcoe-multiagt 0.5615126905471445 0.9691265235448175 0.4488749094017943
gptr_nt_q3d3_mt (autoargue)  (almost-human-judgments.tsv)  (almost-human-scores.tsv)  (autoargue-scores.tsv)  (autoargue-judgments.jsonl)  hltcoe-multiagt 0.5574929275294763 0.8677161092270035 0.4515617325596155
hltime-lg.fsrrf (autoargue)  (almost-human-judgments.tsv)  (almost-human-scores.tsv)  (autoargue-scores.tsv)  (autoargue-judgments.jsonl)  hltcoe-rerank 0.5554640261334303 0.7459730842389544 0.4778676976466541
hltime-lg.listllama (autoargue)  (almost-human-judgments.tsv)  (almost-human-scores.tsv)  (autoargue-scores.tsv)  (autoargue-judgments.jsonl)  hltcoe-rerank 0.5546378496008079 0.7360210689142996 0.4977342343025961
cru-ansR-conf- (autoargue)  (almost-human-judgments.tsv)  (almost-human-scores.tsv)  (autoargue-scores.tsv)  (autoargue-judgments.jsonl)  (paper) HLTCOE 0.5544378156312837 0.9374403614343585 0.43256123082031384
cru-ablR- (autoargue)  (almost-human-judgments.tsv)  (almost-human-scores.tsv)  (autoargue-scores.tsv)  (autoargue-judgments.jsonl)  (paper) HLTCOE 0.5542535305880407 0.9569180432743866 0.42492710753501145
hltime-lg.jina.qwen (autoargue)  (almost-human-judgments.tsv)  (almost-human-scores.tsv)  (autoargue-scores.tsv)  (autoargue-judgments.jsonl)  hltcoe-rerank 0.552340223201879 0.7537534848324344 0.4859814108118151
las_ag_round_robin (autoargue)  (almost-human-judgments.tsv)  (almost-human-scores.tsv)  (autoargue-scores.tsv)  (autoargue-judgments.jsonl)  (paper) ncsu-las 0.5478401474553874 0.6917714263819129 0.4995684053244151
cru-ansR-LSR- (autoargue)  (almost-human-judgments.tsv)  (almost-human-scores.tsv)  (autoargue-scores.tsv)  (autoargue-judgments.jsonl)  (paper) HLTCOE 0.5452091137218344 0.9295936949415355 0.4201432980782143
cru-ansR-PlaidX- (autoargue)  (almost-human-judgments.tsv)  (almost-human-scores.tsv)  (autoargue-scores.tsv)  (autoargue-judgments.jsonl)  (paper) HLTCOE 0.5444246297534666 0.919745054975491 0.42054702345803696
cru-ansR- (autoargue)  (almost-human-judgments.tsv)  (almost-human-scores.tsv)  (autoargue-scores.tsv)  (autoargue-judgments.jsonl)  (paper) HLTCOE 0.54406977815315 0.937926514421104 0.42259107649063943
cru-ablR-LSR- (autoargue)  (almost-human-judgments.tsv)  (almost-human-scores.tsv)  (autoargue-scores.tsv)  (autoargue-judgments.jsonl)  (paper) HLTCOE 0.5437059384529028 0.9408030152264037 0.4175701595724132
hltime-lg.qwen (autoargue)  (almost-human-judgments.tsv)  (almost-human-scores.tsv)  (autoargue-scores.tsv)  (autoargue-judgments.jsonl)  hltcoe-rerank 0.5435705588111771 0.7259137728397043 0.49179255563281693
cru-ablR-PlaidX- (autoargue)  (almost-human-judgments.tsv)  (almost-human-scores.tsv)  (autoargue-scores.tsv)  (autoargue-judgments.jsonl)  (paper) HLTCOE 0.5428722549950794 0.9388390257152311 0.41349266545439833
cru-ablR-conf- (autoargue)  (almost-human-judgments.tsv)  (almost-human-scores.tsv)  (autoargue-scores.tsv)  (autoargue-judgments.jsonl)  (paper) HLTCOE 0.542674635281006 0.9431825410508052 0.4158050466879687
genaius-cluster (autoargue)  (almost-human-judgments.tsv)  (almost-human-scores.tsv)  (autoargue-scores.tsv)  (autoargue-judgments.jsonl)  (paper) GenAIus 0.5389220568042666 0.794557430914786 0.4403416506625166
lg_e2_3q5r3l (autoargue)  (almost-human-judgments.tsv)  (almost-human-scores.tsv)  (autoargue-scores.tsv)  (autoargue-judgments.jsonl)  hltcoe-multiagt 0.5309946748704002 0.9370083477104869 0.4262345385194138
las_ag_sel_new_prompt (autoargue)  (almost-human-judgments.tsv)  (almost-human-scores.tsv)  (autoargue-scores.tsv)  (autoargue-judgments.jsonl)  (paper) ncsu-las 0.5262719300473939 0.622264469782296 0.5060219466105232
cru-ansR-bareconf- (autoargue)  (almost-human-judgments.tsv)  (almost-human-scores.tsv)  (autoargue-scores.tsv)  (autoargue-judgments.jsonl)  (paper) HLTCOE 0.5181391782885794 0.7987590321288779 0.4274882716713667
gptr_e2_q3d3_mt (autoargue)  (almost-human-judgments.tsv)  (almost-human-scores.tsv)  (autoargue-scores.tsv)  (autoargue-judgments.jsonl)  hltcoe-multiagt 0.5104824480876022 0.8574236673086221 0.39785029888210094
las_ag_sel_29 (autoargue)  (almost-human-judgments.tsv)  (almost-human-scores.tsv)  (autoargue-scores.tsv)  (autoargue-judgments.jsonl)  (paper) ncsu-las 0.5094960720689344 0.5828733297907803 0.5109970987879155
AMU1ENG (autoargue)  (almost-human-judgments.tsv)  (almost-human-scores.tsv)  (autoargue-scores.tsv)  (autoargue-judgments.jsonl)  (paper) AMU 0.5017265044204713 0.840879544459722 0.4219865576636884
las_ag_sel_all_4.1 (autoargue)  (almost-human-judgments.tsv)  (almost-human-scores.tsv)  (autoargue-scores.tsv)  (autoargue-judgments.jsonl)  (paper) ncsu-las 0.5014348652422956 0.5711891574431229 0.515769554725915
las_ag_sel_28 (autoargue)  (almost-human-judgments.tsv)  (almost-human-scores.tsv)  (autoargue-scores.tsv)  (autoargue-judgments.jsonl)  (paper) ncsu-las 0.4965300692387417 0.5669994547798982 0.5146095125356646
gptr_ka_q3d3_mt (autoargue)  (almost-human-judgments.tsv)  (almost-human-scores.tsv)  (autoargue-scores.tsv)  (autoargue-judgments.jsonl)  hltcoe-multiagt 0.4933206324380592 0.7912061778290961 0.40997721938010595
IDACCS_extract_4.1 (autoargue)  (almost-human-judgments.tsv)  (almost-human-scores.tsv)  (autoargue-scores.tsv)  (autoargue-judgments.jsonl)  (paper) IDACCS 0.48844664861743453 0.6044668516735712 0.484440712442215
gptr_ka_q3d3_natv (autoargue)  (almost-human-judgments.tsv)  (almost-human-scores.tsv)  (autoargue-scores.tsv)  (autoargue-judgments.jsonl)  hltcoe-multiagt 0.48834889528126624 0.7214107108699229 0.4167356500720186
IDACCS_hybrid_4.1 (autoargue)  (almost-human-judgments.tsv)  (almost-human-scores.tsv)  (autoargue-scores.tsv)  (autoargue-judgments.jsonl)  (paper) IDACCS 0.48716568916404623 0.7619189410374002 0.41311322223325964
WueRAG_2025_08_22 (autoargue)  (almost-human-judgments.tsv)  (almost-human-scores.tsv)  (autoargue-scores.tsv)  (autoargue-judgments.jsonl)  (paper) WueRAG 0.4794712841625383 0.8318266196510141 0.37437595312071725
IDACCS_hybridtb_4.1 (autoargue)  (almost-human-judgments.tsv)  (almost-human-scores.tsv)  (autoargue-scores.tsv)  (autoargue-judgments.jsonl)  (paper) IDACCS 0.4754923083749935 0.7569430735949935 0.41032293546464466
IDACCS_nugget_4.1 (autoargue)  (almost-human-judgments.tsv)  (almost-human-scores.tsv)  (autoargue-scores.tsv)  (autoargue-judgments.jsonl)  (paper) IDACCS 0.4665706185502074 0.5382217030230142 0.5074484889415002
AMU1ML (autoargue)  (almost-human-judgments.tsv)  (almost-human-scores.tsv)  (autoargue-scores.tsv)  (autoargue-judgments.jsonl)  (paper) AMU 0.46415219257976165 0.8429157086247833 0.3770463237363619
IDACCS_nugget_tb4.1 (autoargue)  (almost-human-judgments.tsv)  (almost-human-scores.tsv)  (autoargue-scores.tsv)  (autoargue-judgments.jsonl)  (paper) IDACCS 0.4341585853315322 0.5091303755700415 0.48370308438602005
cru-ansR-mostcommon- (autoargue)  (almost-human-judgments.tsv)  (almost-human-scores.tsv)  (paper) HLTCOE 0.3944839169008766 0.9725013117522974 0.2665066225560545
lg_e2_3q5r2l_mt_qw3 (autoargue)  (almost-human-judgments.tsv)  (almost-human-scores.tsv)  (autoargue-scores.tsv)  (autoargue-judgments.jsonl)  hltcoe-multiagt 0.2750925048716635 0.41678578993345056 0.3184431273212916
v3_surround_glm4 (autoargue)  (almost-human-judgments.tsv)  (almost-human-scores.tsv)  (autoargue-scores.tsv)  (autoargue-judgments.jsonl)  CSU 0.23040114827992822 0.27564270897523696 0.3340811536876689
zetaalpha (autoargue)  (almost-human-judgments.tsv)  (almost-human-scores.tsv)  (autoargue-scores.tsv)  (autoargue-judgments.jsonl)  (paper) UvA 0.1400485904637446 0.21706745239386357 0.47642489247786557
v2_split_qwen (autoargue)  (almost-human-judgments.tsv)  (almost-human-scores.tsv)  (autoargue-scores.tsv)  (autoargue-judgments.jsonl)  CSU 0.12269772852261011 0.2452996044526594 0.24988181992939776
tblocal (autoargue)  (almost-human-judgments.tsv)  (almost-human-scores.tsv)  (autoargue-scores.tsv)  (autoargue-judgments.jsonl)  (paper) DUTH 0.10499569308028754 0.5073529411764706 0.06526538375928286
xenc-report (autoargue)  (almost-human-judgments.tsv)  (almost-human-scores.tsv)  (autoargue-scores.tsv)  (autoargue-judgments.jsonl)  (paper) DUTH 0.09544652829743848 0.49313725490196075 0.06574697990651368
eng_mlm6 (autoargue)  (almost-human-judgments.tsv)  (almost-human-scores.tsv)  (autoargue-scores.tsv)  (autoargue-judgments.jsonl)  (paper) DUTH 0.09544652829743848 0.49313725490196075 0.06574697990651368
eng_mlm6loc (autoargue)  (almost-human-judgments.tsv)  (almost-human-scores.tsv)  (autoargue-scores.tsv)  (autoargue-judgments.jsonl)  (paper) DUTH 0.09544652829743848 0.49313725490196075 0.06574697990651368
mlm12 (autoargue)  (almost-human-judgments.tsv)  (almost-human-scores.tsv)  (autoargue-scores.tsv)  (autoargue-judgments.jsonl)  (paper) DUTH 0.024425770308123245 0.4916666666666667 0.01693680568202513
v1_qwen (autoargue)  (almost-human-judgments.tsv)  (almost-human-scores.tsv)  (autoargue-scores.tsv)  (autoargue-judgments.jsonl)  CSU 0.01948331353838237 0.12577152728212537 0.22050361565021917
eng_fused (autoargue)  (almost-human-judgments.tsv)  (almost-human-scores.tsv)  (autoargue-scores.tsv)  (autoargue-judgments.jsonl)  (paper) DUTH 0.016875138802323997 0.5049019607843137 0.00926857585139319
pybm25 (autoargue)  (almost-human-judgments.tsv)  (almost-human-scores.tsv)  (autoargue-scores.tsv)  (autoargue-judgments.jsonl)  (paper) DUTH 0.016875138802323997 0.9705882352941176 0.00926857585139319
electra (autoargue)  (almost-human-judgments.tsv)  (almost-human-scores.tsv)  (autoargue-scores.tsv)  (autoargue-judgments.jsonl)  (paper) DUTH 0.003419972640218878 0.4754901960784314 0.001838235294117647
tb (autoargue)  (almost-human-judgments.tsv)  (almost-human-scores.tsv)  (autoargue-scores.tsv)  (autoargue-judgments.jsonl)  (paper) DUTH 0.0027746947835738073 0.47058823529411764 0.0014705882352941176
dfki-milp-base (autoargue)  DFKI 0.0 0.0 0.0
milp-query-expanded (autoargue)  DFKI 0.0 0.0 0.0
mlir-rrf-report (autoargue)  (almost-human-judgments.tsv)  (almost-human-scores.tsv)  (autoargue-scores.tsv)  (autoargue-judgments.jsonl)  (paper) DUTH 0.0 0.48039215686274517 0.0