# --------------------------------------------------
# TREC IS 2021b Evaluation Script
# Configured for 2021-B Events
# Used to evaluate TREC-IS runs
# --------------------------------------------------
version = 3.0 # Notebook Version Number
edition = "2021b.all"
import os
cwd = os.getcwd()
# Configuration Information
# Do we try and normalize the run priority scores?
enablePriorityNorm = True
# Do we try and normalize the category scores?
enableCategoryNorm = True
# Score threshold
defaultScoreThreshold = 0.5
taskCategories = [
"CallToAction-Donations",
"CallToAction-MovePeople",
"CallToAction-Volunteer",
"Other-Advice",
"Other-ContextualInformation",
"Other-Discussion",
"Other-Irrelevant",
"Other-Sentiment",
"Report-CleanUp",
"Report-EmergingThreats",
"Report-Factoid",
"Report-FirstPartyObservation",
"Report-Hashtags",
"Report-Location",
"Report-MultimediaShare",
"Report-News",
"Report-NewSubEvent",
"Report-Official",
"Report-OriginalEvent",
"Report-ServiceAvailable",
"Report-ThirdPartyObservation",
"Report-Weather",
"Request-GoodsServices",
"Request-InformationWanted",
"Request-SearchAndRescue",
]
# What we consider to be highly important categories of information
highImportCategories = [
"Request-GoodsServices",
"Request-SearchAndRescue",
"CallToAction-MovePeople",
"Report-EmergingThreats",
"Report-NewSubEvent",
"Report-ServiceAvailable"
]
highImportCategoriesShort = [
"GoodsServices",
"SearchAndRescue",
"MovePeople",
"EmergingThreats",
"NewSubEvent",
"ServiceAvailable"
]
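# Illustrative sanity check (not part of the original evaluation script): the
# short names above should simply be the long names with their type prefix
# stripped, which the assertion below verifies.
assert highImportCategoriesShort == [c.split("-")[1] for c in highImportCategories]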
# Priority map
priorityScoreMap = {
"Critical": 1.0,
"High": 0.75,
"Medium": 0.5,
"Low": 0.25,
"Unknown": 0.25,
}
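# Illustrative sketch (assumption, not used elsewhere): priorityScoreMap turns
# assessor priority labels into numeric gains (e.g. "High" -> 0.75). Stage 5
# below inverts this with simple thresholds when binning a run's numeric
# priority score back into a label, roughly as follows:
def examplePriorityScoreToLabel(score):
    # score > 0.75 -> Critical, > 0.5 -> High, > 0.25 -> Medium, else Low
    if score > priorityScoreMap["High"]:
        return "Critical"
    elif score > priorityScoreMap["Medium"]:
        return "High"
    elif score > priorityScoreMap["Low"]:
        return "Medium"
    return "Low"
assert examplePriorityScoreToLabel(0.8) == "Critical"
assert examplePriorityScoreToLabel(0.3) == "Medium"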
# Parameters
var_lambda = 0.75 # weight to place on actionable information categories in comparison to non-actionable categories
var_alpha = 0.3 # Flat gain for providing a correct alert, regardless of the categories selected
# Events with no data, so we should skip them
# Updated between 2021a and 2021b so that we use *all* data
skipEvents = [
# '2015_09_28_hurricane_joaquin.2015',
# '2017_03_23_cyclone_debbie.2017',
# '2018_02_24_anticyclone_hartmut.2018',
# '2018_07_13_ferguson_wildfire.2018',
# '2018_07_23_cranston_wildfire.2018',
# '2018_09_07_hurricane_florence.2018',
# '2018_10_07_hurricane_michael.2018',
# '2019_09_17_tropicalstorm_imelda.2019',
# '2019_karnataka_floods',
# '2019_spring_floods_in_ontario_quebec_and_new_brunswick',
# '2020_01_28_bar_shooting_nc.2020',
# '2020_02_07_rutherford_tn_floods.2020',
# '2020_05_26_edenville_dam_failure.2020.corrected',
# '2020_08_27_hurricane_laura.2020',
# '2020_09_11_hurricane_sally.2020',
# '2020_afghanistan_flood',
# '2020_hpakant_jade_mine_disaster',
# '2020_kerala_floods',
# 'T2020_02_03_texas_university_shooting.2020',
# 'UNASSIGNED',
# 'indonesia_earthquake.2019'
"2020_05_26_edenville_dam_failure.2020.corrected",
"2018_10_07_hurricane_michael.2018",
"2020_01_28_bar_shooting_nc.2020",
"T2020_02_03_texas_university_shooting.2020",
"2020_02_07_rutherford_tn_floods.2020",
"UNASSIGNED",
"indonesia_earthquake.2019",
"2015_09_28_hurricane_joaquin.2015",
"2017_03_23_cyclone_debbie.2017",
"2018_02_24_anticyclone_hartmut.2018",
"2018_07_13_ferguson_wildfire.2018",
"2018_07_23_cranston_wildfire.2018",
"2018_09_07_hurricane_florence.2018",
"2019_09_17_tropicalstorm_imelda.2019",
"2019_karnataka_floods",
"2019_spring_floods_in_ontario_quebec_and_new_brunswick",
"2020_08_27_hurricane_laura.2020",
"2020_09_11_hurricane_sally.2020",
"2020_afghanistan_flood",
"2020_hpakant_jade_mine_disaster",
"2020_kerala_floods",
]
import glob
runFile = None
for f in glob.glob("*.gz"):
runFile = f
print("Run File:", f)
Run File: run.json.gz
import gzip
import json
runName = None
with gzip.open(runFile, "r") as inRunFile:
for line in inRunFile:
line = line.decode("utf8")
# runName = line.rpartition("\t")[2].strip()
runName = json.loads(line)["runtag"]
break
print("Run Name:", runName)
Run Name: njit_label_prop
# Do we try and normalize the run priority scores?
enablePriorityNorm = False
dataDir = "../../data/2021b"
# The location of the topics file
topicsFile = "%s/2021a.topics" % dataDir
# The location of the ground truth data against which to compare the run
classificationLabelFiles = [
# "%s/TRECIS-2021A-crisis.labels.prelim.json" % dataDir,
# "%s/TRECIS-2021A-crisis.labels.prelim.pt2.json" % dataDir,
# "%s/TRECIS-crisis.labels.2021b.json" % dataDir,
"%s/TRECIS-crisis.labels.2021.all.json" % dataDir,
]
# The location of the ontology file
ontologyFile = "%s/TRECIS-2021A-ITypes.json" % dataDir
topicArray = []
with open(topicsFile, "r") as inTopicsFile:
topicNum = None
topicDataset = None
for line_ in inTopicsFile:
line = line_.strip()
if line == "</top>":
if topicDataset in skipEvents:
continue
topicArray.append((topicDataset, topicNum))
if line.startswith("<num>"):
topicNum = line.partition("<num>")[2].partition("</num>")[0]
if line.startswith("<dataset>"):
topicDataset = line.partition("<dataset>")[2].partition("</dataset>")[0]
for row in topicArray:
print(row)
('2020_01_27_houston_explosion.2020', 'TRECIS-CTIT-H-076')
('2020_02_10_mideast_tornadoes.day1_mississipi.2020', 'TRECIS-CTIT-H-080')
('2020_02_10_mideast_tornadoes.day2_al.2020', 'TRECIS-CTIT-H-081')
('2020_02_10_mideast_tornadoes.day3_md.2019', 'TRECIS-CTIT-H-082')
('2020_05_06_tn_derecho.2020', 'TRECIS-CTIT-H-083')
('brooklynblockparty_shooting.2019', 'TRECIS-CTIT-H-085')
('2016_puttingal_temple', 'TRECIS-CTIT-H-089')
('2017_12_04_thomas_wildfire.2017', 'TRECIS-CTIT-H-091')
('2017_12_07_lilac_wildfire.2017', 'TRECIS-CTIT-H-092')
('2018_07_23_klamathon_wildfire.2018', 'TRECIS-CTIT-H-096')
('2018_08_05_holy_wildfire.2018', 'TRECIS-CTIT-H-097')
('2018_11_07_Woolsey_wildfire.2018', 'TRECIS-CTIT-H-100')
('2018_maryland_flood', 'TRECIS-CTIT-H-101')
('2018_pittsburgh_synagogue_shooting', 'TRECIS-CTIT-H-102')
('2019_03_01_alberta_wildfire.2019.v2', 'TRECIS-CTIT-H-103')
('2019_08_25_hurricane_dorian.2019', 'TRECIS-CTIT-H-104')
('2019_10_10_saddleridge_wildfire.2019', 'TRECIS-CTIT-H-106')
('2019_10_25_kincade_wildfire.2019', 'TRECIS-CTIT-H-107')
('2019_durham_gas_explosion', 'TRECIS-CTIT-H-108')
('2019_saugus_high_school_shooting', 'TRECIS-CTIT-H-110')
('2019_townsville_flood', 'TRECIS-CTIT-H-112')
('2020_easter_tornado_outbreak', 'TRECIS-CTIT-H-116')
('2020_tornado_outbreak_of_april', 'TRECIS-CTIT-H-119')
('2020_tornado_outbreak_of_march', 'TRECIS-CTIT-H-120')
('2020_visakhapatnam_gas_leak', 'TRECIS-CTIT-H-121')
('tornado_outbreak_of_november_30_december_2018', 'TRECIS-CTIT-H-122')
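# For reference, the parser above assumes a TREC-style topics file in which
# each topic looks roughly like the hypothetical snippet below; only the
# <num>, <dataset> and closing </top> tags are actually read.
exampleTopicEntry = """
<top>
<num>TRECIS-CTIT-H-076</num>
<dataset>2020_01_27_houston_explosion.2020</dataset>
</top>
"""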
# --------------------------------------------------
# Static data for the 2021 edition
# --------------------------------------------------
# Identifiers for the test events
eventidTopicidMap = dict(topicArray)
eventIdentifiers = list(eventidTopicidMap.keys())
resultsFile = open(runName+".results.v"+str(version)+"."+edition+".overall.txt","w+")
resultsFile.write("TREC-IS "+edition+" Notebook Evaluator v"+str(version)+"\n")
resultsFile.write("Run: "+runName+" ("+runFile+")"+"\n")
resultsFile.write(""+"\n")
perTopicFile = open(runName+".results.v"+str(version)+"."+edition+".pertopic.txt","w+")
perTopicFile.write("TREC-IS "+edition+" Notebook Evaluator v"+str(version)+"\n")
perTopicFile.write("Run: "+runName+" ("+runFile+")"+"\n")
perTopicFile.write(""+"\n")
perEventFile = open(runName+".results.v"+str(version)+"."+edition+".perevent.txt","w+")
perEventFile.write("TREC-IS "+edition+" Notebook Evaluator v"+str(version)+"\n")
perEventFile.write("Run: "+runName+" ("+runFile+")"+"\n")
perEventFile.write(""+"\n")
# --------------------------------------------------
# Processing Starts Here
# --------------------------------------------------
import json
import gzip
import math
import numpy as np
from pprint import pprint
import matplotlib.pyplot as plt
# --------------------------------------------------
# Stage 1: Load the ground truth dataset
# --------------------------------------------------
groundtruthJSON = []
for groundtruthFile in classificationLabelFiles:
print("Reading "+groundtruthFile)
with open(groundtruthFile, encoding='iso-8859-1') as groundtruthJSONFile:
groundtruthJSON.append(json.load(groundtruthJSONFile))
#pprint(groundtruthJSON["events"])
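# Minimal sketch (assumption) of the structure Stage 4 expects from each
# ground-truth file; the field names come from the accesses below, the values
# are invented for illustration:
# {
#   "events": [
#     {"eventid": "2020_01_27_houston_explosion.2020",
#      "tweets": [
#        {"postID": "...",
#         "postCategories": ["Weather", "Location"],   # short-form names assumed
#         "postPriority": "High"}
#      ]}
#   ]
# }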
# --------------------------------------------------
# Stage 2: Load run file
# --------------------------------------------------
with gzip.open(runFile, "r") as openRunFile:
# runContents = [line.decode("utf8") for line in openRunFile.readlines()] # raw lines, not yet parsed
runContents = [json.loads(line.decode("utf8")) for line in openRunFile.readlines()] # decode and parse each JSON line
#pprint(runContents[0])
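# Each line of the run file is a JSON object; the fields consumed in Stage 5
# are "topic", "tweet_id", "info_type_labels" (binary flags, one per entry in
# taskCategories), "info_type_scores", and "priority", plus the "runtag" read
# earlier. A hypothetical line might look like:
# {"runtag": "njit_label_prop", "topic": "2020_01_27_houston_explosion.2020",
#  "tweet_id": "...", "priority": 0.62,
#  "info_type_labels": [0, 1, 0, ...], "info_type_scores": [0.01, 0.83, ...]}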
Reading ../../data/2021b/TRECIS-crisis.labels.2021.all.json
# --------------------------------------------------
# Stage 3: Load the categories
# --------------------------------------------------
with open(ontologyFile, encoding='utf-8') as ontologyJSONFile:
ontologyJSON = json.load(ontologyJSONFile)
informationTypes2Index = {} # category -> numerical index
informationTypesShort2Index = {} # category short form (e.g. EmergingThreats rather than Report-EmergingThreats) -> numerical index
for informationTypeJSON in ontologyJSON["informationTypes"]:
informationTypeId = informationTypeJSON["id"]
informationTypeIndex = taskCategories.index(informationTypeId)
informationTypes2Index[informationTypeId] = informationTypeIndex
informationTypesShort2Index[informationTypeId.split("-")[1]] = informationTypeIndex
# -----------------------------------------------------------
# Stage 4: Produce ground truth maps between tweetIds and categories
# -----------------------------------------------------------
# Notes: Ground truth is used as a base; if a run includes tweets
# not in the ground truth, they will be ignored
# Assumptions: A tweet will not be returned for multiple events
tweetId2TRECInfoCategories = {} # tweet id -> Array of categories selected by assessors
tweetId2TRECHighImportInfoCategories = {} # tweet id -> Array of categories selected by assessors
tweetId2TRECLowImportInfoCategories = {} # tweet id -> Array of categories selected by assessors
tweetId2TRECPriorityCategory = {} # tweet id -> priority label (Critical,High,Medium,Low)
index2TweetId = {} # ordered tweets
event2tweetIds = {} # event -> tweet ids for tweets within that event
countHighCriticalImport = 0
countLowMediumImport = 0
tweetsSeen = []
invertedPriorityScoreMap = {
v:k for k,v in priorityScoreMap.items()
}
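# Worked example (illustrative) of the duplicate-annotation merge below: a
# tweet labelled both "Low" and "High" keeps "High", since max(0.25, 0.75)
# maps back to "High". Note that "Low" and "Unknown" share the score 0.25,
# so the inverted map resolves 0.25 to "Unknown".
assert invertedPriorityScoreMap[max(priorityScoreMap["Low"], priorityScoreMap["High"])] == "High"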
tweetIndex = 0
for groundtruth in groundtruthJSON:
for eventJSON in groundtruth["events"]:
eventid = eventJSON["eventid"]
print(eventid)
if eventid in skipEvents:
continue
if not event2tweetIds.get(eventid):
event2tweetIds[eventid] = []
if any(eventid in s for s in eventIdentifiers):
# iterate over tweets in the event
for tweetJSON in eventJSON["tweets"]:
tweetid = tweetJSON["postID"]
categories = tweetJSON["postCategories"]
priority = tweetJSON["postPriority"]
if priority == "High" or priority == "Critical":
countHighCriticalImport = countHighCriticalImport + 1
if priority == "Low" or priority == "Medium":
countLowMediumImport = countLowMediumImport + 1
# check categories for name issues and correct if possible
cleanedCategories = []
highImportCats = []
lowImportCats = []
for categoryId in categories:
if not any(categoryId in s for s in informationTypesShort2Index.keys()):
# print("Found unknown category in ground truth "+categoryId+", ignoring...")
pass
else:
cleanedCategories.append(categoryId)
if any(categoryId in s for s in highImportCategoriesShort):
highImportCats.append(categoryId)
else:
lowImportCats.append(categoryId)
if tweetid not in tweetsSeen:
event2tweetIds[eventid].append(tweetid)
tweetId2TRECInfoCategories[tweetid] = cleanedCategories
tweetId2TRECHighImportInfoCategories[tweetid] = highImportCats
tweetId2TRECLowImportInfoCategories[tweetid] = lowImportCats
tweetId2TRECPriorityCategory[tweetid] = priority
index2TweetId[tweetIndex] = tweetid
tweetIndex = tweetIndex + 1
tweetsSeen.append(tweetid)
else:
tweetId2TRECInfoCategories[tweetid] = list(set(
cleanedCategories + tweetId2TRECInfoCategories[tweetid]
))
prePriorityScore = priorityScoreMap[tweetId2TRECPriorityCategory[tweetid]]
thisPriorityScore = priorityScoreMap[priority]
tweetId2TRECPriorityCategory[tweetid] = invertedPriorityScoreMap[
max(prePriorityScore, thisPriorityScore)
]
else:
print("WARN: Found ground truth data for event not in the topic set "+eventid+", ignoring...")
2020_01_27_houston_explosion.2020 2020_01_28_bar_shooting_nc.2020 T2020_02_03_texas_university_shooting.2020 2020_02_07_rutherford_tn_floods.2020 2020_02_10_mideast_tornadoes.day1_mississipi.2020 2020_02_10_mideast_tornadoes.day2_al.2020 2020_02_10_mideast_tornadoes.day3_md.2019 2020_05_06_tn_derecho.2020 2020_05_26_edenville_dam_failure.2020.corrected brooklynblockparty_shooting.2019 UNASSIGNED indonesia_earthquake.2019 2015_09_28_hurricane_joaquin.2015 2016_puttingal_temple 2017_03_23_cyclone_debbie.2017 2017_12_04_thomas_wildfire.2017 2017_12_07_lilac_wildfire.2017 2018_02_24_anticyclone_hartmut.2018 2018_07_13_ferguson_wildfire.2018 2018_07_23_cranston_wildfire.2018 2018_07_23_klamathon_wildfire.2018 2018_08_05_holy_wildfire.2018 2018_09_07_hurricane_florence.2018 2018_10_07_hurricane_michael.2018 2018_11_07_Woolsey_wildfire.2018 2018_maryland_flood 2018_pittsburgh_synagogue_shooting 2019_03_01_alberta_wildfire.2019.v2 2019_08_25_hurricane_dorian.2019 2019_09_17_tropicalstorm_imelda.2019 2019_10_10_saddleridge_wildfire.2019 2019_10_25_kincade_wildfire.2019 2019_durham_gas_explosion 2019_karnataka_floods 2019_saugus_high_school_shooting 2019_spring_floods_in_ontario_quebec_and_new_brunswick 2019_townsville_flood 2020_08_27_hurricane_laura.2020 2020_09_11_hurricane_sally.2020 2020_afghanistan_flood 2020_easter_tornado_outbreak 2020_hpakant_jade_mine_disaster 2020_kerala_floods 2020_tornado_outbreak_of_april 2020_tornado_outbreak_of_march 2020_visakhapatnam_gas_leak tornado_outbreak_of_november_30_december_2018
# -----------------------------------------------------------
# Stage 5: Produce run predicted maps between tweetIds and categories
# -----------------------------------------------------------
tweetId2RunInfoCategories = {} # tweet id -> predicted category by participant system
tweetId2RunHighImportInfoCategories = {} # tweet id -> predicted category by participant system
tweetId2RunLowImportInfoCategories = {} # tweet id -> predicted category by participant system
tweetId2RunInfoCategoriesProb = {} # tweet id -> predicted category probability by participant system
tweetId2RunInfoCategoriesProbNorm = {} # tweet id -> predicted category probability by participant system
tweetId2RunPriorityScore = {} # tweet id -> importance score from participant system
tweetId2RunPriorityCategory = {} # tweet id -> importance category (Critical, High, Medium, Low)
tweetId2RunPriorityScoreNorm = {} # tweet id -> importance score from participant system
event2TweetIdRank = {} # event -> (rank,tweetid)
maxPrediction = -999999
minPrediction = 999999
maxCategory = -999999
minCategory = 999999
for predictionParts in runContents:
#print(predictionParts)
if (len(predictionParts)<6 ):
print(predictionParts)
continue
else:
eventId = predictionParts["topic"]
if eventId in skipEvents:
continue
tweetId = predictionParts["tweet_id"]
rank = 0
#print(predictionParts[5])
category_scores = predictionParts["info_type_scores"]
category_labels = predictionParts["info_type_labels"]
priority = float(predictionParts["priority"])
if priority > maxPrediction:
maxPrediction = priority
if priority < minPrediction:
minPrediction = priority
cleanedCategories = []
cleanedCategoriesProbs = []
highImportCats = []
lowImportCats = []
# Handle category flags
for catIndex, categoryLabel in enumerate(category_labels):
# check if we have a binary flag for this label
if categoryLabel == 0:
# False flag, so skip
continue
categoryId = taskCategories[catIndex]
if not any(categoryId in s for s in informationTypes2Index.keys()):
print("Found unknown category in run "+categoryId+", ignoring...")
else:
cleanedCategories.append(categoryId)
if any(categoryId in s for s in highImportCategories):
highImportCats.append(categoryId)
else:
lowImportCats.append(categoryId)
# Process category probabilities
for categoryProbability in category_scores:
if categoryProbability > maxCategory:
maxCategory = categoryProbability
if categoryProbability < minCategory:
minCategory = categoryProbability
cleanedCategoriesProbs.append(categoryProbability)
tweetId2RunHighImportInfoCategories[tweetId] = highImportCats
tweetId2RunLowImportInfoCategories[tweetId] = lowImportCats
tweetId2RunInfoCategories[tweetId] = cleanedCategories
tweetId2RunInfoCategoriesProb[tweetId] = cleanedCategoriesProbs
tweetId2RunPriorityScore[tweetId] = priority
if priority > priorityScoreMap["High"]:
tweetId2RunPriorityCategory[tweetId] = "Critical"
elif priority > priorityScoreMap["Medium"]:
tweetId2RunPriorityCategory[tweetId] = "High"
elif priority > priorityScoreMap["Low"]:
tweetId2RunPriorityCategory[tweetId] = "Medium"
else:
tweetId2RunPriorityCategory[tweetId] = "Low"
if not event2TweetIdRank.get(eventId):
event2TweetIdRank[eventId] = []
rankTuple = (tweetId,rank)
event2TweetIdRank.get(eventId).append(rankTuple)
for eventId in event2TweetIdRank.keys():
tweetsSorted = sorted(event2TweetIdRank.get(eventId), key=lambda tup: tup[1])
event2TweetIdRank[eventId] = tweetsSorted
for i in range(len(index2TweetId)):
tweetId = index2TweetId[i]
if tweetId2RunPriorityScore.get(tweetId):
if enablePriorityNorm:
if (maxPrediction-minPrediction) == 0.0:
tweetId2RunPriorityScoreNorm[tweetId] = 0.0
else:
tweetId2RunPriorityScoreNorm[tweetId] = (tweetId2RunPriorityScore.get(tweetId)-minPrediction)/(maxPrediction-minPrediction)
else:
tweetId2RunPriorityScoreNorm[tweetId] = tweetId2RunPriorityScore.get(tweetId)
else:
tweetId2RunPriorityScoreNorm[tweetId] = 0.0
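# Illustrative check of the min-max normalization used above (only applied
# when enablePriorityNorm is True): a run priority of 0.6, with observed
# scores spanning [0.2, 0.8], rescales to (0.6 - 0.2) / (0.8 - 0.2) = 2/3.
assert abs((0.6 - 0.2) / (0.8 - 0.2) - 2.0 / 3.0) < 1e-9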
# --------------------------------------------------
# Stage 6: Create ground truth vectors per category
# --------------------------------------------------
category2GroundTruth = {} # category -> tweet vector with binary 1 vs all ground truth category labels
for categoryId in informationTypes2Index.keys():
categoryIdShort = categoryId.split("-")[1]
categoryVector = []
for i in range(len(index2TweetId)):
tweetId = index2TweetId[i]
categories = tweetId2TRECInfoCategories.get(tweetId)
#pprint(categories)
if any(categoryIdShort in s for s in categories):
categoryVector.append(1)
else:
categoryVector.append(0)
category2GroundTruth[categoryId] = categoryVector
#pprint(category2GroundTruth)
# --------------------------------------------------
# Stage 7: Create run vectors per category
# --------------------------------------------------
# Assumptions: If a run misses a tweet, we assume it has
# no categories
category2Predicted = {} # category -> tweet vector with binary 1 vs all predicted by system labels
for categoryId in informationTypes2Index.keys():
categoryIdShort = categoryId.split("-")[1]
categoryVector = []
for i in range(len(index2TweetId)):
tweetId = index2TweetId[i]
if tweetId2RunInfoCategories.get(tweetId):
categories = tweetId2RunInfoCategories.get(tweetId)
if any(categoryIdShort in s for s in categories):
categoryVector.append(1)
else:
categoryVector.append(0)
else:
categoryVector.append(0)
category2Predicted[categoryId] = categoryVector
#pprint(category2Predicted)
# --------------------------------------------------
# Stage 8: Make event category vectors
# --------------------------------------------------
event2groundtruth = {} # event -> category -> tweet vector with binary 1 vs all ground truth category labels
for eventId in eventIdentifiers:
eventCategories = {}
for categoryId in informationTypes2Index.keys():
categoryIdShort = categoryId.split("-")[1]
categoryVector = []
# print(eventId)
for tweetId in event2tweetIds.get(eventId):
# print(tweetId)
categories = tweetId2TRECInfoCategories.get(tweetId)
if any(categoryIdShort in s for s in categories):
categoryVector.append(1)
else:
categoryVector.append(0)
eventCategories[categoryId] = categoryVector
event2groundtruth[eventId] = eventCategories
event2prediction = {} # event -> category -> tweet vector with binary 1 vs all predicted by system labels
for eventId in eventIdentifiers:
print(eventId)
eventCategories = {}
for categoryId in informationTypes2Index.keys():
categoryIdShort = categoryId.split("-")[1]
categoryVector = []
# print(tweetId)
for tweetId in event2tweetIds.get(eventId):
#print(tweetId)
categories = tweetId2RunInfoCategories.get(tweetId)
if categories is None:
categories = []
tweetId2RunInfoCategories[tweetId] = categories
if any(categoryId in s for s in categories):
categoryVector.append(1)
else:
categoryVector.append(0)
eventCategories[categoryId] = categoryVector
event2prediction[eventId] = eventCategories
2020_01_27_houston_explosion.2020 2020_02_10_mideast_tornadoes.day1_mississipi.2020 2020_02_10_mideast_tornadoes.day2_al.2020 2020_02_10_mideast_tornadoes.day3_md.2019 2020_05_06_tn_derecho.2020 brooklynblockparty_shooting.2019 2016_puttingal_temple 2017_12_04_thomas_wildfire.2017 2017_12_07_lilac_wildfire.2017 2018_07_23_klamathon_wildfire.2018 2018_08_05_holy_wildfire.2018 2018_11_07_Woolsey_wildfire.2018 2018_maryland_flood 2018_pittsburgh_synagogue_shooting 2019_03_01_alberta_wildfire.2019.v2 2019_08_25_hurricane_dorian.2019 2019_10_10_saddleridge_wildfire.2019 2019_10_25_kincade_wildfire.2019 2019_durham_gas_explosion 2019_saugus_high_school_shooting 2019_townsville_flood 2020_easter_tornado_outbreak 2020_tornado_outbreak_of_april 2020_tornado_outbreak_of_march 2020_visakhapatnam_gas_leak tornado_outbreak_of_november_30_december_2018
# -----------------------------------------------------------
# Stage 9: Make priority classification vectors
# -----------------------------------------------------------
category2GroundTruthPriority = {} # category -> tweet vector with binary 1 vs all ground truth priority labels
for categoryId in informationTypes2Index.keys():
categoryIdShort = categoryId.split("-")[1]
priorityVector = []
for i in range(len(index2TweetId)):
tweetId = index2TweetId[i]
categories = tweetId2TRECInfoCategories.get(tweetId)
if any(categoryIdShort in s for s in categories):
priority = tweetId2TRECPriorityCategory.get(tweetId)
priorityVector.append(priority)
category2GroundTruthPriority[categoryId] = priorityVector
category2PredictedPriority = {} # category -> tweet vector with binary 1 vs all predicted by system labels
category2PredictedPriorityScore = {} # Category -> tweet vector with priority scores
for categoryId in informationTypes2Index.keys():
categoryIdShort = categoryId.split("-")[1]
categoryVector = []
categoryScoreVector = []
for i in range(len(index2TweetId)):
tweetId = index2TweetId[i]
categories = tweetId2TRECInfoCategories.get(tweetId)
if any(categoryIdShort in s for s in categories):
if tweetId2RunPriorityCategory.get(tweetId):
priority = tweetId2RunPriorityCategory.get(tweetId)
priorityScore = tweetId2RunPriorityScore.get(tweetId)
categoryVector.append(priority)
categoryScoreVector.append(priorityScore)
else:
categoryVector.append("Low") # default to low priority
categoryScoreVector.append(0.25)
category2PredictedPriority[categoryId] = categoryVector
category2PredictedPriorityScore[categoryId] = categoryScoreVector
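# Sanity check (illustrative, not part of the original notebook): both loops
# above filter tweets on the ground-truth categories, so the ground-truth and
# predicted priority vectors should have the same length for every category.
for categoryId in informationTypes2Index.keys():
    assert len(category2GroundTruthPriority[categoryId]) == len(category2PredictedPriority[categoryId])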
# --------------------------------------------------
# Disable Warnings (comment this out when debugging!)
# --------------------------------------------------
import warnings
# warnings.filterwarnings("ignore") # ignore warnings about 0-score categories
# --------------------------------------------------
# TREC-IS 2021A
# Priority-Centric Discounted Cumulative Gain
# --------------------------------------------------
import pandas as pd
def calc_dcg(scores, at_k=100):
position = 1
accumulator = 0.0
for score in scores[:at_k]:
numerator = 2 ** score - 1
denom = np.log2(position + 1)
accumulator += numerator / denom
position += 1
return accumulator
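# Worked example (illustrative): for graded relevances [4, 3],
# calc_dcg gives (2**4 - 1)/log2(2) + (2**3 - 1)/log2(3), i.e. about 19.42.
assert abs(calc_dcg([4, 3]) - (15.0 + 7.0 / np.log2(3))) < 1e-9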
priority_map = {
"Unknown": 1,
"Low": 1,
"Medium": 2,
"High": 3,
"Critical": 4,
}
at_k = 100
tweetId2TRECPriorityCategory_score = {
k:priority_map[v] for k,v in tweetId2TRECPriorityCategory.items()
}
tweetId2TRECPriorityCategory_scores_sorted = sorted(
tweetId2TRECPriorityCategory_score.values(),
reverse=True
)
best_dcg_per_event = {}
for event, rel_tweets in event2tweetIds.items():
print(event)
tweetId2TRECPriorityCategory_scores_sorted = sorted(
[tweetId2TRECPriorityCategory_score[x] for x in rel_tweets],
reverse=True
)
ideal_dcg = calc_dcg(tweetId2TRECPriorityCategory_scores_sorted, at_k)
print("\tBest DCG:", ideal_dcg)
best_dcg_per_event[event] = ideal_dcg
print("Mean:", np.mean(list(best_dcg_per_event.values())))
print()
# Code below calculates the DCG for a system's
# ranked priority tweets. We have to do some
# sampling here to break ties among tweets with
# the same priority scores.
# Build a dataframe from the system's provided
# priority scores, so we can identify what the
# top-most priorities are and get a count of
# the number of tweets in each priority bin.
priority_df = pd.DataFrame(
[(k, priority_map[v]) for k, v in tweetId2RunPriorityCategory.items()],
columns=["tweet_id", "priority"]
)
# Build metrics for each event
system_dcg_per_event = {}
for event, rel_tweets in event2tweetIds.items():
print("Event:", event)
local_priority_df = priority_df[priority_df["tweet_id"].isin(set(rel_tweets))]
unique_scores = local_priority_df["priority"].value_counts()
# Find the top priority scores that would be included
# in the top at_k results.
total = 0
top_keys = []
candidates = {}
for top in sorted(unique_scores.index, reverse=True):
# We store this key, so we can go back and shuffle
# tweets with this score.
top_keys.append(top)
local_restricted_df = local_priority_df[local_priority_df["priority"] == top]
candidates[top] = list(local_restricted_df["tweet_id"])
total += local_restricted_df.shape[0]
# Once we have enough samples, stop.
if ( total > at_k ):
break
# Now we generate a distribution over the DCG for this
# system, repeating the sampling a number of times to remove
# dependence on our selection of the top k tweets
random_dcgs = []
for i in range(100):
local_tweet_ids = []
for top in top_keys:
this_top_tweets = candidates[top][:]
np.random.shuffle(this_top_tweets)
needed = at_k - len(local_tweet_ids)
local_tweet_ids.extend(this_top_tweets[:needed])
local_scores = [tweetId2TRECPriorityCategory_score[x] for x in local_tweet_ids]
random_dcgs.append(calc_dcg(local_scores))
system_dcg = np.mean(random_dcgs)
system_ndcg_ = system_dcg / best_dcg_per_event[event]
print("\tnDCG:", system_ndcg_)
system_dcg_per_event[event] = system_ndcg_
print()
system_ndcg_micro = np.mean(list(system_dcg_per_event.values()))
print("System Event-Micro nDCG:", system_ndcg_micro)
resultsFile.write("--------------------------------------------------"+"\n")
resultsFile.write("EVALUATON: nDCG and Priority"+"\n")
resultsFile.write("Overall performance"+"\n")
resultsFile.write("--------------------------------------------------"+"\n")
resultsFile.write("> nDCG:"+"\t"+str(system_ndcg_micro)+"\n")
resultsFile.write(""+"\n")
2020_01_27_houston_explosion.2020 Best DCG: 176.99559032459564 2020_02_10_mideast_tornadoes.day1_mississipi.2020 Best DCG: 268.88459894996123 2020_02_10_mideast_tornadoes.day2_al.2020 Best DCG: 270.1716952398847 2020_02_10_mideast_tornadoes.day3_md.2019 Best DCG: 135.38775246204446 2020_05_06_tn_derecho.2020 Best DCG: 167.06354661312534 brooklynblockparty_shooting.2019 Best DCG: 179.1756130795261 2016_puttingal_temple Best DCG: 314.08006311421406 2017_12_04_thomas_wildfire.2017 Best DCG: 300.71399384300895 2017_12_07_lilac_wildfire.2017 Best DCG: 314.08006311421406 2018_07_23_klamathon_wildfire.2018 Best DCG: 221.46334445469358 2018_08_05_holy_wildfire.2018 Best DCG: 153.96993418707177 2018_11_07_Woolsey_wildfire.2018 Best DCG: 175.67469323453255 2018_maryland_flood Best DCG: 285.7119531591263 2018_pittsburgh_synagogue_shooting Best DCG: 111.85075929877581 2019_03_01_alberta_wildfire.2019.v2 Best DCG: 62.88708564345522 2019_08_25_hurricane_dorian.2019 Best DCG: 146.57069611996656 2019_10_10_saddleridge_wildfire.2019 Best DCG: 173.00802656786584 2019_10_25_kincade_wildfire.2019 Best DCG: 314.08006311421406 2019_durham_gas_explosion Best DCG: 201.07148118577902 2019_saugus_high_school_shooting Best DCG: 314.08006311421406 2019_townsville_flood Best DCG: 314.08006311421406 2020_easter_tornado_outbreak Best DCG: 214.9714167256293 2020_tornado_outbreak_of_april Best DCG: 314.08006311421406 2020_tornado_outbreak_of_march Best DCG: 267.51977363880474 2020_visakhapatnam_gas_leak Best DCG: 314.08006311421406 tornado_outbreak_of_november_30_december_2018 Best DCG: 314.08006311421406 Mean: 231.7589407554446 Event: 2020_01_27_houston_explosion.2020 nDCG: 0.27780543551693 Event: 2020_02_10_mideast_tornadoes.day1_mississipi.2020 nDCG: 0.4220503544399323 Event: 2020_02_10_mideast_tornadoes.day2_al.2020 nDCG: 0.3745338181274062 Event: 2020_02_10_mideast_tornadoes.day3_md.2019 nDCG: 0.4061232730457823 Event: 2020_05_06_tn_derecho.2020 nDCG: 0.43527383514448037 Event: brooklynblockparty_shooting.2019 nDCG: 0.17289493977359197 Event: 2016_puttingal_temple nDCG: 0.2527608591013886 Event: 2017_12_04_thomas_wildfire.2017 nDCG: 0.35332605501113173 Event: 2017_12_07_lilac_wildfire.2017 nDCG: 0.3656410206279158 Event: 2018_07_23_klamathon_wildfire.2018 nDCG: 0.49783284897545427 Event: 2018_08_05_holy_wildfire.2018 nDCG: 0.452843321365631 Event: 2018_11_07_Woolsey_wildfire.2018 nDCG: 0.31769939055888796 Event: 2018_maryland_flood nDCG: 0.3458532632045162 Event: 2018_pittsburgh_synagogue_shooting nDCG: 0.8796077371132954 Event: 2019_03_01_alberta_wildfire.2019.v2 nDCG: 0.3567178481823365 Event: 2019_08_25_hurricane_dorian.2019 nDCG: 0.35378918970888484 Event: 2019_10_10_saddleridge_wildfire.2019 nDCG: 0.4265909143894533 Event: 2019_10_25_kincade_wildfire.2019 nDCG: 0.6245649491090363 Event: 2019_durham_gas_explosion nDCG: 0.2920285260422452 Event: 2019_saugus_high_school_shooting nDCG: 0.28266031580255163 Event: 2019_townsville_flood nDCG: 0.6049019612679973 Event: 2020_easter_tornado_outbreak nDCG: 0.40805012915193695 Event: 2020_tornado_outbreak_of_april nDCG: 0.5689650450614729 Event: 2020_tornado_outbreak_of_march nDCG: 0.19530535717066963 Event: 2020_visakhapatnam_gas_leak nDCG: 0.5444961776633392 Event: tornado_outbreak_of_november_30_december_2018 nDCG: 0.7454005185324978 System Event-Micro nDCG: 0.42145065708033713
# --------------------------------------------------
# TREC-IS 2021-A
# Information Type Categorization
# Overall performance
# --------------------------------------------------
# Average performance over information types
# Macro averaged (information types have equal weight)
# Does not average across events (larger events have more impact)
# Positive class is the target class
# Precision, recall and F1 only consider the positive class
# Accuracy is an overall metric
# We report performance for all categories, high importance categories and low importance categories
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
avgPrecision = 0.0
avgRecall = 0.0
avgF1 = 0.0
avgAccuracy = 0.0
avgPrecisionHigh = 0.0
avgRecallHigh = 0.0
avgF1High = 0.0
avgAccuracyHigh = 0.0
avgPrecisionLow = 0.0
avgRecallLow = 0.0
avgF1Low = 0.0
avgAccuracyLow = 0.0
for categoryId in informationTypes2Index.keys():
categoryPrecision = precision_score(category2GroundTruth[categoryId], category2Predicted[categoryId], average='binary')
categoryRecall = recall_score(category2GroundTruth[categoryId], category2Predicted[categoryId], average='binary')
categoryF1 = f1_score(category2GroundTruth[categoryId], category2Predicted[categoryId], average='binary')
categoryAccuracy = accuracy_score(category2GroundTruth[categoryId], category2Predicted[categoryId])
avgPrecision = avgPrecision + categoryPrecision
avgRecall = avgRecall + categoryRecall
avgF1 = avgF1 + categoryF1
avgAccuracy = avgAccuracy + categoryAccuracy
if any(categoryId in s for s in highImportCategories):
avgPrecisionHigh = avgPrecisionHigh + categoryPrecision
avgRecallHigh = avgRecallHigh + categoryRecall
avgF1High = avgF1High + categoryF1
avgAccuracyHigh = avgAccuracyHigh + categoryAccuracy
else:
avgPrecisionLow = avgPrecisionLow + categoryPrecision
avgRecallLow = avgRecallLow + categoryRecall
avgF1Low = avgF1Low + categoryF1
avgAccuracyLow = avgAccuracyLow + categoryAccuracy
numInformationTypes = len(informationTypes2Index)
numHighInformationTypes = len(highImportCategories)
numLowInformationTypes = numInformationTypes - numHighInformationTypes
print("Information Type Precision (positive class, multi-type, macro): "+str(avgPrecision/numInformationTypes))
print("Information Type Recall (positive class, multi-type, macro): "+str(avgRecall/numInformationTypes))
print("Information Type F1 (positive class, multi-type, macro): "+str(avgF1/numInformationTypes))
print("Information Type Accuracy (overall, multi-type, macro): "+str(avgAccuracy/numInformationTypes))
print("High Importance Information Type Precision (positive class, multi-type, macro): "+str(avgPrecisionHigh/numHighInformationTypes))
print("High Importance Information Type Recall (positive class, multi-type, macro): "+str(avgRecallHigh/numHighInformationTypes))
print("High Importance Information Type F1 (positive class, multi-type, macro): "+str(avgF1High/numHighInformationTypes))
print("High Importance Information Type Accuracy (overall, multi-type, macro): "+str(avgAccuracyHigh/numHighInformationTypes))
print("Low Importance Information Type Precision (positive class, multi-type, macro): "+str(avgPrecisionLow/numLowInformationTypes))
print("Low Importance Information Type Recall (positive class, multi-type, macro): "+str(avgRecallLow/numLowInformationTypes))
print("Low Importance Information Type F1 (positive class, multi-type, macro): "+str(avgF1Low/numLowInformationTypes))
print("Low Importance Information Type Accuracy (overall, multi-type, macro): "+str(avgAccuracyLow/numLowInformationTypes))
resultsFile.write("--------------------------------------------------"+"\n")
resultsFile.write("EVALUATON: Information Type Categorization"+"\n")
resultsFile.write("Overall performance"+"\n")
resultsFile.write("--------------------------------------------------"+"\n")
resultsFile.write("> Information Type Precision (positive class, multi-type, macro):"+"\t"+str(avgPrecision/len(informationTypes2Index))+"\n")
resultsFile.write("> Information Type Recall (positive class, multi-type, macro):"+"\t"+str(avgRecall/len(informationTypes2Index))+"\n")
resultsFile.write("> Information Type F1 (positive class, multi-type, macro):"+"\t"+str(avgF1/len(informationTypes2Index))+"\n")
resultsFile.write("> Information Type Accuracy (overall, multi-type, macro):"+"\t"+str(avgAccuracy/len(informationTypes2Index))+"\n")
resultsFile.write("> High Importance Information Type Precision (positive class, multi-type, macro):"+"\t"+str(avgPrecisionHigh/numHighInformationTypes)+"\n")
resultsFile.write("> High Importance Information Type Recall (positive class, multi-type, macro):"+"\t"+str(avgRecallHigh/numHighInformationTypes)+"\n")
resultsFile.write("> High Importance Information Type F1 (positive class, multi-type, macro):"+"\t"+str(avgF1High/numHighInformationTypes)+"\n")
resultsFile.write("> High Importance Information Type Accuracy (overall, multi-type, macro):"+"\t"+str(avgAccuracyHigh/numHighInformationTypes)+"\n")
resultsFile.write("> Low Importance Information Type Precision (positive class, multi-type, macro):"+"\t"+str(avgPrecisionLow/numLowInformationTypes)+"\n")
resultsFile.write("> Low Importance Information Type Recall (positive class, multi-type, macro):"+"\t"+str(avgRecallLow/numLowInformationTypes)+"\n")
resultsFile.write("> Low Importance Information Type F1 (positive class, multi-type, macro):"+"\t"+str(avgF1Low/numLowInformationTypes)+"\n")
resultsFile.write("> Low Importance Information Type Accuracy (overall, multi-type, macro):"+"\t"+str(avgAccuracyLow/numLowInformationTypes)+"\n")
resultsFile.write(""+"\n")
Information Type Precision (positive class, multi-type, macro): 0.26161177246515116
Information Type Recall (positive class, multi-type, macro): 0.2935170483437286
Information Type F1 (positive class, multi-type, macro): 0.26004632897554286
Information Type Accuracy (overall, multi-type, macro): 0.8964102931432766
High Importance Information Type Precision (positive class, multi-type, macro): 0.23731530472421372
High Importance Information Type Recall (positive class, multi-type, macro): 0.1968659208249782
High Importance Information Type F1 (positive class, multi-type, macro): 0.20079776340090597
High Importance Information Type Accuracy (overall, multi-type, macro): 0.9615588942809902
Low Importance Information Type Precision (positive class, multi-type, macro): 0.2692843412254472
Low Importance Information Type Recall (positive class, multi-type, macro): 0.3240384570338603
Low Importance Information Type F1 (positive class, multi-type, macro): 0.27875640231490195
Low Importance Information Type Accuracy (overall, multi-type, macro): 0.8758370506787354
# --------------------------------------------------
# TREC-IS 2021-A
# Information Type Categorization
# Per Information Type Performance
# --------------------------------------------------
# Per Category Classification Performance with confusion matrices
# Performance on the target class is what we care about here,
# primarily with respect to recall, as we want the user to
# see all of the information for a given category. A small
# amount of noise being added to the feed is an acceptable
# cost for good recall.
#
# Does not average across events (larger events have more impact)
from sklearn.metrics import classification_report
perTopicFile.write("--------------------------------------------------"+"\n")
perTopicFile.write("EVALUATON: Information Type Categorization (Multi-type)"+"\n")
perTopicFile.write("Per Information Type Performance"+"\n")
perTopicFile.write("--------------------------------------------------"+"\n")
for categoryId in informationTypes2Index.keys():
target_names = ['Other Classes', categoryId]
try:
print(categoryId)
print(classification_report(category2GroundTruth[categoryId], category2Predicted[categoryId], target_names=target_names))
perTopicFile.write(categoryId+"\n")
perTopicFile.write(classification_report(category2GroundTruth[categoryId], category2Predicted[categoryId], target_names=target_names)+"\n")
perTopicFile.write(""+"\n")
except ValueError:
print("Category "+categoryId+" score calculation failed, likely due the category not being used by the run")
perTopicFile.write(""+"\n")
CallToAction-Donations precision recall f1-score support Other Classes 0.99 0.99 0.99 55275 CallToAction-Donations 0.30 0.47 0.37 568 accuracy 0.98 55843 macro avg 0.65 0.73 0.68 55843 weighted avg 0.99 0.98 0.99 55843 CallToAction-MovePeople precision recall f1-score support Other Classes 0.98 0.99 0.99 54646 CallToAction-MovePeople 0.42 0.24 0.31 1197 accuracy 0.98 55843 macro avg 0.70 0.62 0.65 55843 weighted avg 0.97 0.98 0.97 55843 CallToAction-Volunteer precision recall f1-score support Other Classes 1.00 1.00 1.00 55543 CallToAction-Volunteer 0.25 0.18 0.21 300 accuracy 0.99 55843 macro avg 0.62 0.59 0.60 55843 weighted avg 0.99 0.99 0.99 55843 Other-Advice precision recall f1-score support Other Classes 0.96 0.97 0.96 52602 Other-Advice 0.38 0.33 0.35 3241 accuracy 0.93 55843 macro avg 0.67 0.65 0.66 55843 weighted avg 0.93 0.93 0.93 55843 Other-ContextualInformation precision recall f1-score support Other Classes 0.97 0.95 0.96 54346 Other-ContextualInformation 0.02 0.04 0.03 1497 accuracy 0.93 55843 macro avg 0.50 0.50 0.50 55843 weighted avg 0.95 0.93 0.94 55843 Other-Discussion precision recall f1-score support Other Classes 0.99 0.95 0.97 55263 Other-Discussion 0.03 0.14 0.05 580 accuracy 0.95 55843 macro avg 0.51 0.55 0.51 55843 weighted avg 0.98 0.95 0.96 55843 Other-Irrelevant precision recall f1-score support Other Classes 0.53 0.82 0.64 23267 Other-Irrelevant 0.79 0.48 0.60 32576 accuracy 0.62 55843 macro avg 0.66 0.65 0.62 55843 weighted avg 0.68 0.62 0.62 55843 Other-Sentiment precision recall f1-score support Other Classes 0.94 0.93 0.94 51270 Other-Sentiment 0.31 0.34 0.32 4573 accuracy 0.88 55843 macro avg 0.62 0.63 0.63 55843 weighted avg 0.89 0.88 0.89 55843 Report-CleanUp precision recall f1-score support Other Classes 1.00 0.99 0.99 55581 Report-CleanUp 0.13 0.30 0.18 262 accuracy 0.99 55843 macro avg 0.56 0.65 0.59 55843 weighted avg 0.99 0.99 0.99 55843 Report-EmergingThreats precision recall f1-score support Other Classes 0.96 0.92 0.94 52454 Report-EmergingThreats 0.23 0.36 0.28 3389 accuracy 0.89 55843 macro avg 0.59 0.64 0.61 55843 weighted avg 0.91 0.89 0.90 55843 Report-Factoid precision recall f1-score support Other Classes 0.93 0.94 0.94 49844 Report-Factoid 0.48 0.45 0.46 5999 accuracy 0.89 55843 macro avg 0.71 0.69 0.70 55843 weighted avg 0.88 0.89 0.89 55843 Report-FirstPartyObservation precision recall f1-score support Other Classes 0.97 0.96 0.97 54135 Report-FirstPartyObservation 0.10 0.14 0.11 1708 accuracy 0.93 55843 macro avg 0.53 0.55 0.54 55843 weighted avg 0.95 0.93 0.94 55843 Report-Hashtags precision recall f1-score support Other Classes 0.89 0.89 0.89 48407 Report-Hashtags 0.31 0.32 0.31 7436 accuracy 0.81 55843 macro avg 0.60 0.60 0.60 55843 weighted avg 0.82 0.81 0.82 55843 Report-Location precision recall f1-score support Other Classes 0.84 0.74 0.78 41325 Report-Location 0.44 0.59 0.50 14518 accuracy 0.70 55843 macro avg 0.64 0.66 0.64 55843 weighted avg 0.73 0.70 0.71 55843 Report-MultimediaShare precision recall f1-score support Other Classes 0.92 0.75 0.83 48784 Report-MultimediaShare 0.24 0.54 0.33 7059 accuracy 0.73 55843 macro avg 0.58 0.65 0.58 55843 weighted avg 0.83 0.73 0.77 55843 Report-News precision recall f1-score support Other Classes 0.93 0.82 0.87 50324 Report-News 0.23 0.47 0.31 5519 accuracy 0.79 55843 macro avg 0.58 0.65 0.59 55843 weighted avg 0.86 0.79 0.82 55843 Report-NewSubEvent precision recall f1-score support Other Classes 0.98 0.98 0.98 54728 Report-NewSubEvent 0.05 0.07 0.06 1115 accuracy 0.96 55843 macro 
avg 0.52 0.52 0.52 55843 weighted avg 0.96 0.96 0.96 55843 Report-Official precision recall f1-score support Other Classes 0.96 0.98 0.97 53203 Report-Official 0.20 0.11 0.14 2640 accuracy 0.94 55843 macro avg 0.58 0.54 0.55 55843 weighted avg 0.92 0.94 0.93 55843 Report-OriginalEvent precision recall f1-score support Other Classes 0.95 0.98 0.97 52838 Report-OriginalEvent 0.10 0.03 0.04 3005 accuracy 0.93 55843 macro avg 0.52 0.51 0.50 55843 weighted avg 0.90 0.93 0.92 55843 Report-ServiceAvailable precision recall f1-score support Other Classes 0.97 0.98 0.98 53834 Report-ServiceAvailable 0.40 0.32 0.35 2009 accuracy 0.96 55843 macro avg 0.68 0.65 0.66 55843 weighted avg 0.95 0.96 0.96 55843 Report-ThirdPartyObservation precision recall f1-score support Other Classes 0.91 0.84 0.87 50379 Report-ThirdPartyObservation 0.16 0.28 0.20 5464 accuracy 0.78 55843 macro avg 0.54 0.56 0.54 55843 weighted avg 0.84 0.78 0.81 55843 Report-Weather precision recall f1-score support Other Classes 0.97 0.90 0.93 50824 Report-Weather 0.41 0.70 0.52 5019 accuracy 0.88 55843 macro avg 0.69 0.80 0.73 55843 weighted avg 0.92 0.88 0.90 55843 Request-GoodsServices precision recall f1-score support Other Classes 0.99 1.00 1.00 55452 Request-GoodsServices 0.23 0.05 0.09 391 accuracy 0.99 55843 macro avg 0.61 0.53 0.54 55843 weighted avg 0.99 0.99 0.99 55843 Request-InformationWanted precision recall f1-score support Other Classes 0.99 0.99 0.99 55241 Request-InformationWanted 0.26 0.26 0.26 602 accuracy 0.98 55843 macro avg 0.63 0.63 0.63 55843 weighted avg 0.98 0.98 0.98 55843 Request-SearchAndRescue precision recall f1-score support Other Classes 1.00 1.00 1.00 55737 Request-SearchAndRescue 0.10 0.14 0.12 106 accuracy 1.00 55843 macro avg 0.55 0.57 0.56 55843 weighted avg 1.00 1.00 1.00 55843
# --------------------------------------------------
# TREC-IS 2021-A
# Information Type Categorization
# Per Information Type F1 Graph
# --------------------------------------------------
# Per Category Classification Performance
# F1 scores for each information type, graphed
# Does not average across events (larger events have more impact)
N = len(informationTypes2Index)
ind = np.arange(N)
scoresPerCategoryF1 = []
categoryLabels = []
for categoryId in informationTypes2Index.keys():
localF1Score = f1_score(category2GroundTruth[categoryId], category2Predicted[categoryId], average='binary')
print(categoryId, localF1Score)
scoresPerCategoryF1.append(localF1Score)
categoryLabels.append(categoryId)
width = 0.90 # the width of the bars: can also be len(x) sequence
p1 = plt.bar(ind, scoresPerCategoryF1, width)
plt.ylabel('F1 Scores')
plt.title('F1 Scores by Information Type')
plt.xticks(ind, categoryLabels, rotation='vertical')
plt.yticks(np.arange(0, 1, 0.1))
plt.show()
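# Optional (not in the original notebook): to keep a copy of this figure next
# to the text result files, one could call plt.savefig with the same naming
# convention before plt.show(), e.g.:
# plt.savefig(runName + ".results.v" + str(version) + "." + edition + ".f1_by_type.png", bbox_inches="tight")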
CallToAction-Donations 0.36598639455782306
CallToAction-MovePeople 0.3075302790942601
CallToAction-Volunteer 0.20662768031189085
Other-Advice 0.35401157981803144
Other-ContextualInformation 0.031118794764139292
Other-Discussion 0.05097917314267951
Other-Irrelevant 0.5961237648315593
Other-Sentiment 0.32074484778742546
Report-CleanUp 0.18036529680365296
Report-EmergingThreats 0.28058877644894203
Report-Factoid 0.46172434625010783
Report-FirstPartyObservation 0.11279097672186224
Report-Hashtags 0.3121103594641199
Report-Location 0.5031787143866259
Report-MultimediaShare 0.3339574171637567
Report-News 0.30569524032276923
Report-NewSubEvent 0.05973813420621931
Report-Official 0.1371007371007371
Report-OriginalEvent 0.04454685099846391
Report-ServiceAvailable 0.351575456053068
Report-ThirdPartyObservation 0.19955127359113106
Report-Weather 0.5173819583425988
Request-GoodsServices 0.08677685950413222
Request-InformationWanted 0.2623762376237624
Request-SearchAndRescue 0.11857707509881422
# --------------------------------------------------
# TREC-IS 2021-A
# Information Type Categorization
# Per Event Performance
# --------------------------------------------------
# Categorization performance for each event
# Precision, recall and F1 only consider the positive class
# Accuracy is an overall metric
# We report performance for all categories, high importance categories and low importance categories
# Macro average (categories have equal weight)
perEventFile.write("--------------------------------------------------"+"\n")
perEventFile.write("EVALUATON: Information Type Categorization (Multi-type)"+"\n")
perEventFile.write("Per Event Performance"+"\n")
perEventFile.write("--------------------------------------------------"+"\n")
for eventId in eventIdentifiers:
tavgPrecision = 0.0
tavgRecall = 0.0
tavgF1 = 0.0
tavgAccuracy = 0.0
categoryCount = 0
for categoryId in informationTypes2Index.keys():
if sum(event2groundtruth[eventId].get(categoryId)) == 0:
continue
categoryPrecision = precision_score(event2groundtruth[eventId].get(categoryId), event2prediction[eventId].get(categoryId), average='binary')
categoryRecall = recall_score(event2groundtruth[eventId].get(categoryId), event2prediction[eventId].get(categoryId), average='binary')
categoryF1 = f1_score(event2groundtruth[eventId].get(categoryId), event2prediction[eventId].get(categoryId), average='binary')
categoryAccuracy = accuracy_score(event2groundtruth[eventId].get(categoryId), event2prediction[eventId].get(categoryId))
tavgPrecision = tavgPrecision + categoryPrecision
tavgRecall = tavgRecall + categoryRecall
tavgF1 = tavgF1 + categoryF1
tavgAccuracy = tavgAccuracy + categoryAccuracy
categoryCount += 1
if categoryCount == 0:
print("No categories for event:", eventId)
continue
print(eventId)
print(" Information Type Precision (positive class, multi-type, macro): "+str(tavgPrecision/categoryCount))
print(" Information Type Recall (positive class, multi-type, macro): "+str(tavgRecall/categoryCount))
print(" Information Type F1 (positive class, multi-type, macro): "+str(tavgF1/categoryCount))
print(" Information Type Accuracy (overall, multi-type, macro): "+str(tavgAccuracy/categoryCount))
print("")
perEventFile.write(eventId+"\n")
perEventFile.write(" Information Type Precision (positive class, multi-type, macro): "+str(tavgPrecision/len(informationTypes2Index))+"\n")
perEventFile.write(" Information Type Recall (positive class, multi-type, macro): "+str(tavgRecall/len(informationTypes2Index))+"\n")
perEventFile.write(" Information Type F1 (positive class, multi-type, macro): "+str(tavgF1/len(informationTypes2Index))+"\n")
perEventFile.write(" Information Type Accuracy (overall, multi-type, macro): "+str(tavgAccuracy/len(informationTypes2Index))+"\n")
perEventFile.write("\n")
perEventFile.write("\n")
2020_01_27_houston_explosion.2020 Information Type Precision (positive class, multi-type, macro): 0.204055537406687 Information Type Recall (positive class, multi-type, macro): 0.3210144716177857 Information Type F1 (positive class, multi-type, macro): 0.18936419659538598 Information Type Accuracy (overall, multi-type, macro): 0.8869565217391304 2020_02_10_mideast_tornadoes.day1_mississipi.2020 Information Type Precision (positive class, multi-type, macro): 0.49828467432272816 Information Type Recall (positive class, multi-type, macro): 0.4532983008399848 Information Type F1 (positive class, multi-type, macro): 0.4467533682971848 Information Type Accuracy (overall, multi-type, macro): 0.8548654244306418 2020_02_10_mideast_tornadoes.day2_al.2020 Information Type Precision (positive class, multi-type, macro): 0.23451920821025266 Information Type Recall (positive class, multi-type, macro): 0.37318032667847345 Information Type F1 (positive class, multi-type, macro): 0.24924511371677371 Information Type Accuracy (overall, multi-type, macro): 0.9045637922339888 2020_02_10_mideast_tornadoes.day3_md.2019 Information Type Precision (positive class, multi-type, macro): 0.14021368058070272 Information Type Recall (positive class, multi-type, macro): 0.3578818927650236 Information Type F1 (positive class, multi-type, macro): 0.14866488671948233 Information Type Accuracy (overall, multi-type, macro): 0.8597727272727271 2020_05_06_tn_derecho.2020 Information Type Precision (positive class, multi-type, macro): 0.262083412087746 Information Type Recall (positive class, multi-type, macro): 0.28743792607418245 Information Type F1 (positive class, multi-type, macro): 0.22830622456979008 Information Type Accuracy (overall, multi-type, macro): 0.8869730123997084 brooklynblockparty_shooting.2019 Information Type Precision (positive class, multi-type, macro): 0.1849548140674396 Information Type Recall (positive class, multi-type, macro): 0.5215523539288426 Information Type F1 (positive class, multi-type, macro): 0.1994132458779106 Information Type Accuracy (overall, multi-type, macro): 0.904967387846007
/Users/cbuntain/Development/anaconda3/lib/python3.8/site-packages/sklearn/metrics/_classification.py:1245: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 due to no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result))
2016_puttingal_temple Information Type Precision (positive class, multi-type, macro): 0.19200162589055284 Information Type Recall (positive class, multi-type, macro): 0.17840512683604223 Information Type F1 (positive class, multi-type, macro): 0.15080794483820356 Information Type Accuracy (overall, multi-type, macro): 0.8997964228588046 2017_12_04_thomas_wildfire.2017 Information Type Precision (positive class, multi-type, macro): 0.29053720105892455 Information Type Recall (positive class, multi-type, macro): 0.26377421260195727 Information Type F1 (positive class, multi-type, macro): 0.2647530233269965 Information Type Accuracy (overall, multi-type, macro): 0.8708711493872141 2017_12_07_lilac_wildfire.2017 Information Type Precision (positive class, multi-type, macro): 0.3315237301602523 Information Type Recall (positive class, multi-type, macro): 0.30947258218054563 Information Type F1 (positive class, multi-type, macro): 0.28202537154286006 Information Type Accuracy (overall, multi-type, macro): 0.8773770491803279 2018_07_23_klamathon_wildfire.2018 Information Type Precision (positive class, multi-type, macro): 0.37602443169909827 Information Type Recall (positive class, multi-type, macro): 0.28500461329767446 Information Type F1 (positive class, multi-type, macro): 0.287830172745737 Information Type Accuracy (overall, multi-type, macro): 0.8758823367848608 2018_08_05_holy_wildfire.2018 Information Type Precision (positive class, multi-type, macro): 0.1442742018538389 Information Type Recall (positive class, multi-type, macro): 0.315235194655036 Information Type F1 (positive class, multi-type, macro): 0.1604442001757343 Information Type Accuracy (overall, multi-type, macro): 0.9248314737331474 2018_11_07_Woolsey_wildfire.2018 Information Type Precision (positive class, multi-type, macro): 0.14994282044555063 Information Type Recall (positive class, multi-type, macro): 0.18077980347727104 Information Type F1 (positive class, multi-type, macro): 0.14409704852065874 Information Type Accuracy (overall, multi-type, macro): 0.889176235302231 2018_maryland_flood Information Type Precision (positive class, multi-type, macro): 0.2890599571849312 Information Type Recall (positive class, multi-type, macro): 0.35828142264177787 Information Type F1 (positive class, multi-type, macro): 0.2889363265039347 Information Type Accuracy (overall, multi-type, macro): 0.870989255279733 2018_pittsburgh_synagogue_shooting Information Type Precision (positive class, multi-type, macro): 0.38269796573291887 Information Type Recall (positive class, multi-type, macro): 0.37861119945077965 Information Type F1 (positive class, multi-type, macro): 0.36519659440551944 Information Type Accuracy (overall, multi-type, macro): 0.7585470085470085
2019_03_01_alberta_wildfire.2019.v2 Information Type Precision (positive class, multi-type, macro): 0.09654626897370239 Information Type Recall (positive class, multi-type, macro): 0.21004749711822604 Information Type F1 (positive class, multi-type, macro): 0.06646235226323287 Information Type Accuracy (overall, multi-type, macro): 0.8445305770887166 2019_08_25_hurricane_dorian.2019 Information Type Precision (positive class, multi-type, macro): 0.24359516376996557 Information Type Recall (positive class, multi-type, macro): 0.21756261164054613 Information Type F1 (positive class, multi-type, macro): 0.1583621699056024 Information Type Accuracy (overall, multi-type, macro): 0.872609776304888
2019_10_10_saddleridge_wildfire.2019 Information Type Precision (positive class, multi-type, macro): 0.2402787803843353 Information Type Recall (positive class, multi-type, macro): 0.19828408163863132 Information Type F1 (positive class, multi-type, macro): 0.20236040475521797 Information Type Accuracy (overall, multi-type, macro): 0.9177093108122198 2019_10_25_kincade_wildfire.2019 Information Type Precision (positive class, multi-type, macro): 0.30803828740069045 Information Type Recall (positive class, multi-type, macro): 0.32259806643518274 Information Type F1 (positive class, multi-type, macro): 0.2922787310396954 Information Type Accuracy (overall, multi-type, macro): 0.9021026626406808 2019_durham_gas_explosion Information Type Precision (positive class, multi-type, macro): 0.2708436360893993 Information Type Recall (positive class, multi-type, macro): 0.3422003180855385 Information Type F1 (positive class, multi-type, macro): 0.2645028770611925 Information Type Accuracy (overall, multi-type, macro): 0.8879892246717861
2019_saugus_high_school_shooting
    Information Type Precision (positive class, multi-type, macro): 0.21224345239548537
    Information Type Recall (positive class, multi-type, macro): 0.23083768041753483
    Information Type F1 (positive class, multi-type, macro): 0.19356833564227258
    Information Type Accuracy (overall, multi-type, macro): 0.8977586351091413
2019_townsville_flood
    Information Type Precision (positive class, multi-type, macro): 0.31108754584759996
    Information Type Recall (positive class, multi-type, macro): 0.27241188523193804
    Information Type F1 (positive class, multi-type, macro): 0.26379223905651566
    Information Type Accuracy (overall, multi-type, macro): 0.8858574016838962
2020_easter_tornado_outbreak
    Information Type Precision (positive class, multi-type, macro): 0.16676434206145527
    Information Type Recall (positive class, multi-type, macro): 0.3398823874232521
    Information Type F1 (positive class, multi-type, macro): 0.19246864981825695
    Information Type Accuracy (overall, multi-type, macro): 0.8839572806614605
2020_tornado_outbreak_of_april
    Information Type Precision (positive class, multi-type, macro): 0.2798747139319698
    Information Type Recall (positive class, multi-type, macro): 0.3218971770439659
    Information Type F1 (positive class, multi-type, macro): 0.25272213678559763
    Information Type Accuracy (overall, multi-type, macro): 0.8819058754750074
2020_tornado_outbreak_of_march
    Information Type Precision (positive class, multi-type, macro): 0.18580102459752004
    Information Type Recall (positive class, multi-type, macro): 0.379194089171432
    Information Type F1 (positive class, multi-type, macro): 0.1810617461388319
    Information Type Accuracy (overall, multi-type, macro): 0.848917244665643
2020_visakhapatnam_gas_leak
    Information Type Precision (positive class, multi-type, macro): 0.2977215302267494
    Information Type Recall (positive class, multi-type, macro): 0.1792907700024519
    Information Type F1 (positive class, multi-type, macro): 0.17834116430123884
    Information Type Accuracy (overall, multi-type, macro): 0.8529924002533248
tornado_outbreak_of_november_30_december_2018
    Information Type Precision (positive class, multi-type, macro): 0.24095391722934387
    Information Type Recall (positive class, multi-type, macro): 0.38005881972560546
    Information Type F1 (positive class, multi-type, macro): 0.22856278723094206
    Information Type Accuracy (overall, multi-type, macro): 0.9002740881298755
# --------------------------------------------------
# TREC-IS 2021-B
# Information Type Categorization
# Per Event F1 Graph
# --------------------------------------------------
# Multi-type (1 vs All): Tweets have multiple information types, aim: predict all of them
# Macro average (categories have equal weight)
N = len(eventIdentifiers)
ind = np.arange(N)
scoresPerEventF1 = []
for eventId in eventIdentifiers:
    avgF1_ = 0.0
    for categoryId in informationTypes2Index.keys():
        avgF1_ = avgF1_ + f1_score(event2groundtruth[eventId].get(categoryId), event2prediction[eventId].get(categoryId), average='binary')
    scoresPerEventF1.append(avgF1_/len(informationTypes2Index))
width = 0.90 # the width of the bars: can also be len(x) sequence
p1 = plt.bar(ind, scoresPerEventF1, width)
plt.ylabel('F1 Scores')
plt.title('F1 Category Scores by Event')
plt.xticks(ind, eventIdentifiers, rotation='vertical')
plt.yticks(np.arange(0, 1, 0.1))
plt.show()
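When a category has no positive ground-truth or predicted tweets for an event, sklearn's f1_score is ill-defined and silently falls back to 0.0, which is where the repeated UndefinedMetricWarning messages in the original output came from. A minimal sketch (not part of the official script) of making that fallback explicit and silencing the warnings via the zero_division parameter, assuming the same eventIdentifiers, event2groundtruth, and event2prediction structures used above:

from sklearn.metrics import f1_score

# Sketch only: mirrors the per-event loop above; zero_division=0 returns 0.0 instead of
# warning when a category has no positive samples, so the averages are unchanged.
scoresPerEventF1 = []
for eventId in eventIdentifiers:
    perCategoryF1 = [
        f1_score(
            event2groundtruth[eventId].get(categoryId),
            event2prediction[eventId].get(categoryId),
            average='binary',
            zero_division=0,
        )
        for categoryId in informationTypes2Index.keys()
    ]
    scoresPerEventF1.append(sum(perCategoryF1) / len(perCategoryF1))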
# --------------------------------------------------
# TREC-IS 2021-B
# Information Priority Level
# Overall Performance
# --------------------------------------------------
# How divergent is the system from the human priority labels?
# F1 performance over information types, higher is better
# Macro average (categories have equal weight)
from sklearn.metrics import mean_squared_error
priorityAvgf1 = 0.0
priorityAvgf1High = 0.0
priorityAvgf1Low = 0.0
for categoryId in informationTypes2Index.keys():
    groundTruthPriorities = category2GroundTruthPriority[categoryId]
    predictedPriorities = category2PredictedPriority[categoryId]
    f1 = f1_score(groundTruthPriorities, predictedPriorities, average='macro')
    priorityAvgf1 = priorityAvgf1 + f1
    if any(categoryId in s for s in highImportCategories):
        priorityAvgf1High = priorityAvgf1High + f1
    else:
        priorityAvgf1Low = priorityAvgf1Low + f1
print("Priority Label Prediction (F1, macro): "+str(priorityAvgf1/len(informationTypes2Index)))
resultsFile.write("--------------------------------------------------"+"\n")
resultsFile.write("EVALUATION: Information Priority Level"+"\n")
resultsFile.write("Overall Performance"+"\n")
resultsFile.write("--------------------------------------------------"+"\n")
resultsFile.write("> Priority Label Prediction (F1, macro): "+str(priorityAvgf1/len(informationTypes2Index))+"\n")
resultsFile.write("\n")
Priority Label Prediction (F1, macro): 0.22878279179769898
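Note that the high/low split above relies on a substring test (any(categoryId in s for s in highImportCategories)) rather than direct membership; because every entry of highImportCategories is itself a full category name, the two behave identically for this label set. A quick sanity check (a sketch, not part of the official script) using the taskCategories and highImportCategories lists defined at the top of the notebook:

# Sketch only: list the categories counted toward the "high importance" averages
highGroup = [c for c in taskCategories if any(c in s for s in highImportCategories)]
lowGroup = [c for c in taskCategories if c not in highGroup]
print(len(highGroup), "high-importance categories:", highGroup)
print(len(lowGroup), "remaining categories")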
# --------------------------------------------------
# TREC-IS 2021-B
# Information Priority Score
# Correlational Performance
# --------------------------------------------------
# How divergent is the system from the human priority labels?
# Use Pearson correlation here to capture parallel increases
priorityAvgCorr = 0.0
priorityAvgCorrHigh = 0.0
priorityAvgCorrLow = 0.0
for categoryId in informationTypes2Index.keys():
    if categoryId == "Other-Irrelevant":
        continue
    groundTruthPriorities = [priorityScoreMap[x] for x in category2GroundTruthPriority[categoryId]]
    predictedPriorities = category2PredictedPriorityScore[categoryId]
    # Pathological case: Pearson correlation is undefined when the predicted scores have
    # no variation, so fall back to 0.0 for this category
    this_corr = 0.0
    if np.std(predictedPriorities) > 0.0:
        this_corr = np.corrcoef(groundTruthPriorities, predictedPriorities)[0,1]
    priorityAvgCorr = priorityAvgCorr + this_corr
    if any(categoryId in s for s in highImportCategories):
        priorityAvgCorrHigh = priorityAvgCorrHigh + this_corr
    else:
        priorityAvgCorrLow = priorityAvgCorrLow + this_corr
print("Priority Score Prediction (Pearson): "+str(priorityAvgCorr/(len(informationTypes2Index)-1)))
print("Priority Score Prediction, High (Pearson): "+str(priorityAvgCorrHigh/numHighInformationTypes))
print("Priority Score Prediction, Low (Pearson): "+str(priorityAvgCorrLow/(numLowInformationTypes-1)))
resultsFile.write("--------------------------------------------------"+"\n")
resultsFile.write("EVALUATION: Information Priority Score"+"\n")
resultsFile.write("Correlational Performance"+"\n")
resultsFile.write("--------------------------------------------------"+"\n")
resultsFile.write("> Priority Correlation (Pearson): "+str(priorityAvgCorr/(len(informationTypes2Index)-1))+"\n")
resultsFile.write("> Priority Correlation, High (Pearson): "+str(priorityAvgCorrHigh/numHighInformationTypes)+"\n")
resultsFile.write("> Priority Correlation, Low (Pearson): "+str(priorityAvgCorrLow/(numLowInformationTypes-1))+"\n")
resultsFile.write("\n")
Priority Score Prediction (Pearson): 0.12757159682294006
Priority Score Prediction, High (Pearson): 0.10584417475436199
Priority Score Prediction, Low (Pearson): 0.13481407084579938
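As a toy illustration of the correlation step above (hypothetical labels and scores, not taken from any run): ground-truth priority labels are first mapped to numeric values through the priorityScoreMap defined at the top of the notebook and then correlated against the run's predicted priority scores.

import numpy as np

# Hypothetical example values, purely for illustration
gt_labels = ["Critical", "High", "Low", "Medium", "Low"]
pred_scores = [0.9, 0.6, 0.3, 0.5, 0.1]

gt_scores = [priorityScoreMap[x] for x in gt_labels]
if np.std(pred_scores) > 0.0:  # guard against constant predictions, as in the cell above
    print(np.corrcoef(gt_scores, pred_scores)[0, 1])  # Pearson r, higher is better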
# --------------------------------------------------
# TREC-IS 2021-B
# Information Priority Level
# Per Information Type Performance
# --------------------------------------------------
# F1 per information type (macro averaged), higher is better
# Macro average (categories have equal weight)
N = len(informationTypes2Index)
ind = np.arange(N)
priorityCatF1Values = []
categoryLabels = []
for categoryId in informationTypes2Index.keys():
    groundTruthPriorities = category2GroundTruthPriority[categoryId]
    predictedPriorities = category2PredictedPriority[categoryId]
    priorityCatF1 = f1_score(groundTruthPriorities, predictedPriorities, average='macro')
    if math.isnan(priorityCatF1):
        priorityCatF1 = 0.0
    categoryLabels.append(categoryId)
    priorityCatF1Values.append(priorityCatF1)
width = 0.90 # the width of the bars: can also be len(x) sequence
p1 = plt.bar(ind, priorityCatF1Values, width)
plt.ylabel('Priority Label Prediction F1 (higher is better)')
plt.title('Priority Label Prediction F1 Per Information Type')
plt.xticks(ind, categoryLabels, rotation='vertical')
plt.yticks(np.arange(0, 1, 0.1))
plt.show()
resultLine = None
# Print the evaluation table row in latex
print("Run & NDCG & CF1-H & CF1-A & CAcc & PErr-H & PErr-A & PCorr-H & PCorr-A \\\\")
# Note: PCorr-A below averages over all categories (len(informationTypes2Index)),
# whereas the Pearson average printed above excludes Other-Irrelevant
resultLine = (str.format('{0:.4f}', system_ndcg_micro)+
              " & "+
              str.format('{0:.4f}', avgF1High/numHighInformationTypes)+
              " & "+
              str.format('{0:.4f}', avgF1/numInformationTypes)+
              " & "+
              str.format('{0:.4f}', avgAccuracy/numInformationTypes)+
              " & "+
              str.format('{0:.4f}', priorityAvgf1High/numHighInformationTypes)+
              " & "+
              str.format('{0:.4f}', priorityAvgf1/len(informationTypes2Index))+
              " & "+
              str.format('{0:.4f}', priorityAvgCorrHigh/numHighInformationTypes)+
              " & "+
              str.format('{0:.4f}', priorityAvgCorr/len(informationTypes2Index))+
              " \\\\")
print(runName+" & "+resultLine)
resultsFile.write("--------------------------------------------------"+"\n")
resultsFile.write("LATEX"+"\n")
resultsFile.write("--------------------------------------------------"+"\n")
resultsFile.write(runName+" & "+resultLine + "\n")
Run & NDCG & CF1-H & CF1-A & CAcc & PErr-H & PErr-A & PCorr-H & PCorr-A \\
njit_label_prop & 0.4215 & 0.2008 & 0.2600 & 0.8964 & 0.2268 & 0.2288 & 0.1058 & 0.1225 \\
# Done
resultsFile.close()
perTopicFile.close()
perEventFile.close()
# header = [
# "Run",
# "date",
# "team",
# "description",
# "paper",
# "code",
# "nDCG@100",
# "Info-Type F1 [Actionable]",
# "Info-Type F1 [All]",
# "Info-Type Accuracy",
# "Priority F1 [Actionable]",
# "Priority F1 [All]",
# "Priority R [Actionable]",
# "Priority R [All]",
# ]
import csv
import json  # json is likely imported earlier in the notebook; repeated here so this cell stands alone
if os.path.isfile("metadata.json"):
    this_cwd = os.getcwd()
    # Derive the submission date (YYYY/MM/DD) from the "submissions/<YYYYMMDD>-..." path component
    sub_date_ = this_cwd.partition("submissions/")[-1].partition("-")[0]
    sub_date = "%s/%s/%s" % (sub_date_[:4], sub_date_[4:6], sub_date_[6:])
    leaderboard_entry = None
    with open("metadata.json", "r") as in_file:
        metadata = json.load(in_file)
    leaderboard_entry = [
        runName,
        sub_date,
        metadata["organization"].lower(),
        metadata["model_description"],
        metadata["paper"] if metadata["paper"].startswith("http") else "",
        metadata["code"] if metadata["code"].startswith("http") else "",
        str.format('{0:.4f}', system_ndcg_micro),
        str.format('{0:.4f}', avgF1High/numHighInformationTypes),
        str.format('{0:.4f}', avgF1/numInformationTypes),
        str.format('{0:.4f}', avgAccuracy/numInformationTypes),
        str.format('{0:.4f}', priorityAvgf1High/numHighInformationTypes),
        str.format('{0:.4f}', priorityAvgf1/len(informationTypes2Index)),
        str.format('{0:.4f}', priorityAvgCorrHigh/numHighInformationTypes),
        str.format('{0:.4f}', priorityAvgCorr/len(informationTypes2Index)),
    ]
    with open(runName+".v"+str(version)+"."+edition+".leaderboard.csv", "w") as csvResultsFile:
        leader_writer = csv.writer(csvResultsFile)
        leader_writer.writerow(leaderboard_entry)
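Each run therefore leaves behind a single-row <runName>.v<version>.<edition>.leaderboard.csv file with no header row. A sketch (not part of the official script) of gathering those per-run rows into one table, reusing the commented-out header above; the output name leaderboard.all.csv is just a placeholder:

import csv
import glob

# Hypothetical aggregation of the single-row leaderboard files written above
header = [
    "Run", "date", "team", "description", "paper", "code",
    "nDCG@100",
    "Info-Type F1 [Actionable]", "Info-Type F1 [All]", "Info-Type Accuracy",
    "Priority F1 [Actionable]", "Priority F1 [All]",
    "Priority R [Actionable]", "Priority R [All]",
]
rows = []
for path in sorted(glob.glob("*.leaderboard.csv")):
    with open(path, newline="") as f:
        rows.extend(csv.reader(f))
with open("leaderboard.all.csv", "w", newline="") as out_file:
    writer = csv.writer(out_file)
    writer.writerow(header)
    writer.writerows(rows)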