# --------------------------------------------------
# TREC IS 2021b Evaluation Script
# Configured for 2021-B Events
# Used to evaluate TREC-IS runs
# --------------------------------------------------
version = 3.0 # Notebook Version Number
edition = "2021b.all"
import os
cwd = os.getcwd()
# Configuration Information
# Do we try to normalize the run priority scores?
enablePriorityNorm = True
# Do we try to normalize the category scores?
enableCategoryNorm = True
# Default score threshold
defaultScoreThreshold = 0.5
taskCategories = [
"CallToAction-Donations",
"CallToAction-MovePeople",
"CallToAction-Volunteer",
"Other-Advice",
"Other-ContextualInformation",
"Other-Discussion",
"Other-Irrelevant",
"Other-Sentiment",
"Report-CleanUp",
"Report-EmergingThreats",
"Report-Factoid",
"Report-FirstPartyObservation",
"Report-Hashtags",
"Report-Location",
"Report-MultimediaShare",
"Report-News",
"Report-NewSubEvent",
"Report-Official",
"Report-OriginalEvent",
"Report-ServiceAvailable",
"Report-ThirdPartyObservation",
"Report-Weather",
"Request-GoodsServices",
"Request-InformationWanted",
"Request-SearchAndRescue",
]
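# Note: the order of this list defines the category index used when decoding a
# run's info_type_labels / info_type_scores arrays below, so the decoding assumes
# it matches the ordering participants used when generating their runs.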
# What we consider to be highly important categories of information
highImportCategories = [
"Request-GoodsServices",
"Request-SearchAndRescue",
"CallToAction-MovePeople",
"Report-EmergingThreats",
"Report-NewSubEvent",
"Report-ServiceAvailable"
]
highImportCategoriesShort = [
"GoodsServices",
"SearchAndRescue",
"MovePeople",
"EmergingThreats",
"NewSubEvent",
"ServiceAvailable"
]
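# Sanity check (illustrative only, not part of the original evaluator): the short
# forms above should simply be the full high-importance names with their
# "Request-"/"CallToAction-"/"Report-" prefixes stripped.
assert [c.split("-")[1] for c in highImportCategories] == highImportCategoriesShort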
# Priority map
priorityScoreMap = {
"Critical": 1.0,
"High": 0.75,
"Medium": 0.5,
"Low": 0.25,
"Unknown": 0.25,
}
# Parameters
var_lambda = 0.75 # weight placed on actionable information categories relative to non-actionable categories
var_alpha = 0.3 # Flat gain for providing a correct alert, regardless of the categories selected
# Events with no data, so we should skip them
# Updated from 2021a and 2021b, so we use *all* data
skipEvents = [
# '2015_09_28_hurricane_joaquin.2015',
# '2017_03_23_cyclone_debbie.2017',
# '2018_02_24_anticyclone_hartmut.2018',
# '2018_07_13_ferguson_wildfire.2018',
# '2018_07_23_cranston_wildfire.2018',
# '2018_09_07_hurricane_florence.2018',
# '2018_10_07_hurricane_michael.2018',
# '2019_09_17_tropicalstorm_imelda.2019',
# '2019_karnataka_floods',
# '2019_spring_floods_in_ontario_quebec_and_new_brunswick',
# '2020_01_28_bar_shooting_nc.2020',
# '2020_02_07_rutherford_tn_floods.2020',
# '2020_05_26_edenville_dam_failure.2020.corrected',
# '2020_08_27_hurricane_laura.2020',
# '2020_09_11_hurricane_sally.2020',
# '2020_afghanistan_flood',
# '2020_hpakant_jade_mine_disaster',
# '2020_kerala_floods',
# 'T2020_02_03_texas_university_shooting.2020',
# 'UNASSIGNED',
# 'indonesia_earthquake.2019'
"2020_05_26_edenville_dam_failure.2020.corrected",
"2018_10_07_hurricane_michael.2018",
"2020_01_28_bar_shooting_nc.2020",
"T2020_02_03_texas_university_shooting.2020",
"2020_02_07_rutherford_tn_floods.2020",
"UNASSIGNED",
"indonesia_earthquake.2019",
"2015_09_28_hurricane_joaquin.2015",
"2017_03_23_cyclone_debbie.2017",
"2018_02_24_anticyclone_hartmut.2018",
"2018_07_13_ferguson_wildfire.2018",
"2018_07_23_cranston_wildfire.2018",
"2018_09_07_hurricane_florence.2018",
"2019_09_17_tropicalstorm_imelda.2019",
"2019_karnataka_floods",
"2019_spring_floods_in_ontario_quebec_and_new_brunswick",
"2020_08_27_hurricane_laura.2020",
"2020_09_11_hurricane_sally.2020",
"2020_afghanistan_flood",
"2020_hpakant_jade_mine_disaster",
"2020_kerala_floods",
]
import glob
runFile = None
for f in glob.glob("*.gz"):
runFile = f
print("Run File:", f)
Run File: run.json.gz
import gzip
import json
runName = None
with gzip.open(runFile, "r") as inRunFile:
for line in inRunFile:
line = line.decode("utf8")
# runName = line.rpartition("\t")[2].strip()
runName = json.loads(line)["runtag"]
break
print("Run Name:", runName)
Run Name: ens
# Do we try to normalize the run priority scores? (overrides the setting above)
enablePriorityNorm = False
dataDir = "../../data/2021b"
# The location of the topics file
topicsFile = "%s/2021a.topics" % dataDir
# The location of the ground truth data against which to compare the run
classificationLabelFiles = [
# "%s/TRECIS-2021A-crisis.labels.prelim.json" % dataDir,
# "%s/TRECIS-2021A-crisis.labels.prelim.pt2.json" % dataDir,
# "%s/TRECIS-crisis.labels.2021b.json" % dataDir,
"%s/TRECIS-crisis.labels.2021.all.json" % dataDir,
]
# The location of the ontology file
ontologyFile = "%s/TRECIS-2021A-ITypes.json" % dataDir
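# For reference, the parsing loop below assumes SGML-style topic entries roughly
# of the following shape (an illustrative sketch; the tag names come from the
# partition calls below and the values from the printed topicArray):
exampleTopicEntry = """
<top>
<num>TRECIS-CTIT-H-076</num>
<dataset>2020_01_27_houston_explosion.2020</dataset>
</top>
"""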
topicArray = []
with open(topicsFile, "r") as inTopicsFile:
topicNum = None
topicDataset = None
for line_ in inTopicsFile:
line = line_.strip()
if line == "</top>":
if topicDataset in skipEvents:
continue
topicArray.append((topicDataset, topicNum))
if line.startswith("<num>"):
topicNum = line.partition("<num>")[2].partition("</num>")[0]
if line.startswith("<dataset>"):
topicDataset = line.partition("<dataset>")[2].partition("</dataset>")[0]
for row in topicArray:
print(row)
('2020_01_27_houston_explosion.2020', 'TRECIS-CTIT-H-076')
('2020_02_10_mideast_tornadoes.day1_mississipi.2020', 'TRECIS-CTIT-H-080')
('2020_02_10_mideast_tornadoes.day2_al.2020', 'TRECIS-CTIT-H-081')
('2020_02_10_mideast_tornadoes.day3_md.2019', 'TRECIS-CTIT-H-082')
('2020_05_06_tn_derecho.2020', 'TRECIS-CTIT-H-083')
('brooklynblockparty_shooting.2019', 'TRECIS-CTIT-H-085')
('2016_puttingal_temple', 'TRECIS-CTIT-H-089')
('2017_12_04_thomas_wildfire.2017', 'TRECIS-CTIT-H-091')
('2017_12_07_lilac_wildfire.2017', 'TRECIS-CTIT-H-092')
('2018_07_23_klamathon_wildfire.2018', 'TRECIS-CTIT-H-096')
('2018_08_05_holy_wildfire.2018', 'TRECIS-CTIT-H-097')
('2018_11_07_Woolsey_wildfire.2018', 'TRECIS-CTIT-H-100')
('2018_maryland_flood', 'TRECIS-CTIT-H-101')
('2018_pittsburgh_synagogue_shooting', 'TRECIS-CTIT-H-102')
('2019_03_01_alberta_wildfire.2019.v2', 'TRECIS-CTIT-H-103')
('2019_08_25_hurricane_dorian.2019', 'TRECIS-CTIT-H-104')
('2019_10_10_saddleridge_wildfire.2019', 'TRECIS-CTIT-H-106')
('2019_10_25_kincade_wildfire.2019', 'TRECIS-CTIT-H-107')
('2019_durham_gas_explosion', 'TRECIS-CTIT-H-108')
('2019_saugus_high_school_shooting', 'TRECIS-CTIT-H-110')
('2019_townsville_flood', 'TRECIS-CTIT-H-112')
('2020_easter_tornado_outbreak', 'TRECIS-CTIT-H-116')
('2020_tornado_outbreak_of_april', 'TRECIS-CTIT-H-119')
('2020_tornado_outbreak_of_march', 'TRECIS-CTIT-H-120')
('2020_visakhapatnam_gas_leak', 'TRECIS-CTIT-H-121')
('tornado_outbreak_of_november_30_december_2018', 'TRECIS-CTIT-H-122')
# --------------------------------------------------
# Static data for the 2021 edition
# --------------------------------------------------
# Identifiers for the test events
eventidTopicidMap = dict(topicArray)
eventIdentifiers = list(eventidTopicidMap.keys())
resultsFile = open(runName+".results.v"+str(version)+"."+edition+".overall.txt","w+")
resultsFile.write("TREC-IS "+edition+" Notebook Evaluator v"+str(version)+"\n")
resultsFile.write("Run: "+runName+" ("+runFile+")"+"\n")
resultsFile.write(""+"\n")
perTopicFile = open(runName+".results.v"+str(version)+"."+edition+".pertopic.txt","w+")
perTopicFile.write("TREC-IS "+edition+" Notebook Evaluator v"+str(version)+"\n")
perTopicFile.write("Run: "+runName+" ("+runFile+")"+"\n")
perTopicFile.write(""+"\n")
perEventFile = open(runName+".results.v"+str(version)+"."+edition+".perevent.txt","w+")
perEventFile.write("TREC-IS "+edition+" Notebook Evaluator v"+str(version)+"\n")
perEventFile.write("Run: "+runName+" ("+runFile+")"+"\n")
perEventFile.write(""+"\n")
# --------------------------------------------------
# Processing Starts Here
# --------------------------------------------------
import json
import gzip
import math
import numpy as np
from pprint import pprint
import matplotlib.pyplot as plt
# --------------------------------------------------
# Stage 1: Load the ground truth dataset
# --------------------------------------------------
groundtruthJSON = []
for groundtruthFile in classificationLabelFiles:
print("Reading "+groundtruthFile)
with open(groundtruthFile, encoding='iso-8859-1') as groundtruthJSONFile:
groundtruthJSON.append(json.load(groundtruthJSONFile))
#pprint(groundtruthJSON["events"])
# --------------------------------------------------
# Stage 2: Load run file
# --------------------------------------------------
with gzip.open(runFile, "r") as openRunFile:
# runContents = [line.decode("utf8") for line in openRunFile.readlines()] # lines not yet decoded
runContents = [json.loads(line.decode("utf8")) for line in openRunFile.readlines()] # decode and parse each line as JSON
#pprint(runContents[0])
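# Each element of runContents is expected to be a JSON object with at least the
# fields accessed below (an illustrative sketch; the field names are taken from
# the code in this notebook, the values here are made up):
exampleRunLine = {
"runtag": "ens", # run name
"topic": "2020_01_27_houston_explosion.2020", # event identifier
"tweet_id": "1221234567890123456", # hypothetical tweet id
"info_type_labels": [0] * len(taskCategories), # one binary flag per category, in taskCategories order
"info_type_scores": [0.0] * len(taskCategories), # one probability per category, in taskCategories order
"priority": 0.5, # priority score
}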
Reading ../../data/2021b/TRECIS-crisis.labels.2021.all.json
# --------------------------------------------------
# Stage 3: Load the categories
# --------------------------------------------------
with open(ontologyFile, encoding='utf-8') as ontologyJSONFile:
ontologyJSON = json.load(ontologyJSONFile)
informationTypes2Index = {} # category -> numerical index
informationTypesShort2Index = {} # category short form (e.g. EmergingThreats rather than Report-EmergingThreats) -> numerical index
for informationTypeJSON in ontologyJSON["informationTypes"]:
informationTypeId = informationTypeJSON["id"]
informationTypeIndex = taskCategories.index(informationTypeId)
informationTypes2Index[informationTypeId] = informationTypeIndex
informationTypesShort2Index[informationTypeId.split("-")[1]] = informationTypeIndex
# -----------------------------------------------------------
# Stage 4: Produce ground truth maps between tweetIds and categories
# -----------------------------------------------------------
# Notes: The ground truth is used as the base; if a run includes tweets
# not in the ground truth, they will be ignored
# Assumptions: A tweet will not be returned for multiple events
tweetId2TRECInfoCategories = {} # tweet id -> Array of categories selected by assessors
tweetId2TRECHighImportInfoCategories = {} # tweet id -> Array of categories selected by assessors
tweetId2TRECLowImportInfoCategories = {} # tweet id -> Array of categories selected by assessors
tweetId2TRECPriorityCategory = {} # tweet id -> priority label (Critical,High,Medium,Low)
index2TweetId = {} # ordered tweets
event2tweetIds = {} # event -> tweet ids for tweets within that event
countHighCriticalImport = 0
countLowMediumImport = 0
tweetsSeen = []
invertedPriorityScoreMap = {
v:k for k,v in priorityScoreMap.items()
}
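# Note: "Low" and "Unknown" both map to 0.25 in priorityScoreMap, so the inverted
# map resolves that tie to whichever key appears last ("Unknown"); merged duplicate
# tweets whose best score is 0.25 therefore come back labelled "Unknown".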
tweetIndex = 0
for groundtruth in groundtruthJSON:
for eventJSON in groundtruth["events"]:
eventid = eventJSON["eventid"]
print(eventid)
if eventid in skipEvents:
continue
if not event2tweetIds.get(eventid):
event2tweetIds[eventid] = []
if any(eventid in s for s in eventIdentifiers):
# iterate over tweets in the event
for tweetJSON in eventJSON["tweets"]:
tweetid = tweetJSON["postID"]
categories = tweetJSON["postCategories"]
priority = tweetJSON["postPriority"]
if priority == "High" or priority == "Critical":
countHighCriticalImport = countHighCriticalImport + 1
if priority == "Low" or priority == "Medium":
countLowMediumImport = countLowMediumImport + 1
# check categories for name issues and correct if possible
cleanedCategories = []
highImportCats = []
lowImportCats = []
for categoryId in categories:
if not any(categoryId in s for s in informationTypesShort2Index.keys()):
# print("Found unknown category in ground truth "+categoryId+", ignoring...")
pass
else:
cleanedCategories.append(categoryId)
if any(categoryId in s for s in highImportCategoriesShort):
highImportCats.append(categoryId)
else:
lowImportCats.append(categoryId)
if tweetid not in tweetsSeen:
event2tweetIds[eventid].append(tweetid)
tweetId2TRECInfoCategories[tweetid] = cleanedCategories
tweetId2TRECHighImportInfoCategories[tweetid] = highImportCats
tweetId2TRECLowImportInfoCategories[tweetid] = lowImportCats
tweetId2TRECPriorityCategory[tweetid] = priority
index2TweetId[tweetIndex] = tweetid
tweetIndex = tweetIndex + 1
tweetsSeen.append(tweetid)
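# If we have already seen this tweet (e.g., it appears in more than one
# ground truth file), merge its category labels and keep the higher priority.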
else:
tweetId2TRECInfoCategories[tweetid] = list(set(
cleanedCategories + tweetId2TRECInfoCategories[tweetid]
))
prePriorityScore = priorityScoreMap[tweetId2TRECPriorityCategory[tweetid]]
thisPriorityScore = priorityScoreMap[priority]
tweetId2TRECPriorityCategory[tweetid] = invertedPriorityScoreMap[
max(prePriorityScore, thisPriorityScore)
]
else:
print("WARN: Found ground truth data for event not in the topic set "+eventid+", ignoring...")
2020_01_27_houston_explosion.2020
2020_01_28_bar_shooting_nc.2020
T2020_02_03_texas_university_shooting.2020
2020_02_07_rutherford_tn_floods.2020
2020_02_10_mideast_tornadoes.day1_mississipi.2020
2020_02_10_mideast_tornadoes.day2_al.2020
2020_02_10_mideast_tornadoes.day3_md.2019
2020_05_06_tn_derecho.2020
2020_05_26_edenville_dam_failure.2020.corrected
brooklynblockparty_shooting.2019
UNASSIGNED
indonesia_earthquake.2019
2015_09_28_hurricane_joaquin.2015
2016_puttingal_temple
2017_03_23_cyclone_debbie.2017
2017_12_04_thomas_wildfire.2017
2017_12_07_lilac_wildfire.2017
2018_02_24_anticyclone_hartmut.2018
2018_07_13_ferguson_wildfire.2018
2018_07_23_cranston_wildfire.2018
2018_07_23_klamathon_wildfire.2018
2018_08_05_holy_wildfire.2018
2018_09_07_hurricane_florence.2018
2018_10_07_hurricane_michael.2018
2018_11_07_Woolsey_wildfire.2018
2018_maryland_flood
2018_pittsburgh_synagogue_shooting
2019_03_01_alberta_wildfire.2019.v2
2019_08_25_hurricane_dorian.2019
2019_09_17_tropicalstorm_imelda.2019
2019_10_10_saddleridge_wildfire.2019
2019_10_25_kincade_wildfire.2019
2019_durham_gas_explosion
2019_karnataka_floods
2019_saugus_high_school_shooting
2019_spring_floods_in_ontario_quebec_and_new_brunswick
2019_townsville_flood
2020_08_27_hurricane_laura.2020
2020_09_11_hurricane_sally.2020
2020_afghanistan_flood
2020_easter_tornado_outbreak
2020_hpakant_jade_mine_disaster
2020_kerala_floods
2020_tornado_outbreak_of_april
2020_tornado_outbreak_of_march
2020_visakhapatnam_gas_leak
tornado_outbreak_of_november_30_december_2018
# -----------------------------------------------------------
# Stage 5: Produce run predicted maps between tweetIds and categories
# -----------------------------------------------------------
tweetId2RunInfoCategories = {} # tweet id -> categories predicted by the participant system
tweetId2RunHighImportInfoCategories = {} # tweet id -> predicted high-importance categories
tweetId2RunLowImportInfoCategories = {} # tweet id -> predicted low-importance categories
tweetId2RunInfoCategoriesProb = {} # tweet id -> predicted category probabilities
tweetId2RunInfoCategoriesProbNorm = {} # tweet id -> normalized predicted category probabilities
tweetId2RunPriorityScore = {} # tweet id -> importance score from the participant system
tweetId2RunPriorityCategory = {} # tweet id -> importance category (Critical, High, Medium, Low)
tweetId2RunPriorityScoreNorm = {} # tweet id -> normalized importance score
event2TweetIdRank = {} # event -> list of (tweetid, rank) tuples
maxPrediction = -999999
minPrediction = 999999
maxCategory = -999999
minCategory = 999999
for predictionParts in runContents:
#print(predictionParts)
if (len(predictionParts) < 6): # skip malformed entries missing the expected fields
print(predictionParts)
continue
else:
eventId = predictionParts["topic"]
if eventId in skipEvents:
continue
tweetId = predictionParts["tweet_id"]
rank = 0
#print(predictionParts[5])
category_scores = predictionParts["info_type_scores"]
category_labels = predictionParts["info_type_labels"]
priority = float(predictionParts["priority"])
if priority > maxPrediction:
maxPrediction = priority
if priority < minPrediction:
minPrediction = priority
cleanedCategories = []
cleanedCategoriesProbs = []
highImportCats = []
lowImportCats = []
# Handle category flags
for catIndex, categoryLabel in enumerate(category_labels):
# check if we have a binary flag for this label
if categoryLabel == 0:
# False flag, so skip
continue
categoryId = taskCategories[catIndex]
if not any(categoryId in s for s in informationTypes2Index.keys()):
print("Found unknown category in run "+categoryId+", ignoring...")
else:
cleanedCategories.append(categoryId)
if any(categoryId in s for s in highImportCategories):
highImportCats.append(categoryId)
else:
lowImportCats.append(categoryId)
# Process category probabilities
for categoryProbability in category_scores:
if categoryProbability > maxCategory:
maxCategory = categoryProbability
if categoryProbability < minCategory:
minCategory = categoryProbability
cleanedCategoriesProbs.append(categoryProbability)
tweetId2RunHighImportInfoCategories[tweetId] = highImportCats
tweetId2RunLowImportInfoCategories[tweetId] = lowImportCats
tweetId2RunInfoCategories[tweetId] = cleanedCategories
tweetId2RunInfoCategoriesProb[tweetId] = cleanedCategoriesProbs
tweetId2RunPriorityScore[tweetId] = priority
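# Bucket the numeric priority into a label using the priorityScoreMap thresholds:
# > 0.75 -> Critical, > 0.5 -> High, > 0.25 -> Medium, otherwise Low.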
if priority > priorityScoreMap["High"]:
tweetId2RunPriorityCategory[tweetId] = "Critical"
elif priority > priorityScoreMap["Medium"]:
tweetId2RunPriorityCategory[tweetId] = "High"
elif priority > priorityScoreMap["Low"]:
tweetId2RunPriorityCategory[tweetId] = "Medium"
else:
tweetId2RunPriorityCategory[tweetId] = "Low"
if not event2TweetIdRank.get(eventId):
event2TweetIdRank[eventId] = []
rankTuple = (tweetId,rank)
event2TweetIdRank.get(eventId).append(rankTuple)
for eventId in event2TweetIdRank.keys():
tweetsSorted = sorted(event2TweetIdRank.get(eventId), key=lambda tup: tup[1])
event2TweetIdRank[eventId] = tweetsSorted
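# Note: rank is hard-coded to 0 above, so this sort is effectively a no-op and
# tweets keep the order in which they appear in the run file.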
for i in range(len(index2TweetId)):
tweetId = index2TweetId[i]
if tweetId2RunPriorityScore.get(tweetId):
if enablePriorityNorm:
if (maxPrediction-minPrediction) == 0.0:
tweetId2RunPriorityScoreNorm[tweetId] = 0.0
else:
tweetId2RunPriorityScoreNorm[tweetId] = (tweetId2RunPriorityScore.get(tweetId)-minPrediction)/(maxPrediction-minPrediction)
else:
tweetId2RunPriorityScoreNorm[tweetId] = tweetId2RunPriorityScore.get(tweetId)
else:
tweetId2RunPriorityScoreNorm[tweetId] = 0.0
# --------------------------------------------------
# Stage 6: Create ground truth vectors per category
# --------------------------------------------------
category2GroundTruth = {} # category -> tweet vector with binary 1 vs all ground truth category labels
for categoryId in informationTypes2Index.keys():
categoryIdShort = categoryId.split("-")[1]
categoryVector = []
for i in range(len(index2TweetId)):
tweetId = index2TweetId[i]
categories = tweetId2TRECInfoCategories.get(tweetId)
#pprint(categories)
if any(categoryIdShort in s for s in categories):
categoryVector.append(1)
else:
categoryVector.append(0)
category2GroundTruth[categoryId] = categoryVector
#pprint(category2GroundTruth)
# --------------------------------------------------
# Stage 7: Create run vectors per category
# --------------------------------------------------
# Assumptions: If run misses a tweet, we assume it has
# no categories
category2Predicted = {} # category -> tweet vector with binary 1 vs all predicted by system labels
for categoryId in informationTypes2Index.keys():
categoryIdShort = categoryId.split("-")[1]
categoryVector = []
for i in range(len(index2TweetId)):
tweetId = index2TweetId[i]
if tweetId2RunInfoCategories.get(tweetId):
categories = tweetId2RunInfoCategories.get(tweetId)
if any(categoryIdShort in s for s in categories):
categoryVector.append(1)
else:
categoryVector.append(0)
else:
categoryVector.append(0)
category2Predicted[categoryId] = categoryVector
#pprint(category2Predicted)
# --------------------------------------------------
# Stage 8: Make event category vectors
# --------------------------------------------------
event2groundtruth = {} # event -> category -> tweet vector with binary 1 vs all ground truth category labels
for eventId in eventIdentifiers:
eventCategories = {}
for categoryId in informationTypes2Index.keys():
categoryIdShort = categoryId.split("-")[1]
categoryVector = []
# print(eventId)
for tweetId in event2tweetIds.get(eventId):
# print(tweetId)
categories = tweetId2TRECInfoCategories.get(tweetId)
if any(categoryIdShort in s for s in categories):
categoryVector.append(1)
else:
categoryVector.append(0)
eventCategories[categoryId] = categoryVector
event2groundtruth[eventId] = eventCategories
event2prediction = {} # event -> category -> tweet vector with binary 1 vs all predicted by system labels
for eventId in eventIdentifiers:
print(eventId)
eventCategories = {}
for categoryId in informationTypes2Index.keys():
categoryIdShort = categoryId.split("-")[1]
categoryVector = []
# print(tweetId)
for tweetId in event2tweetIds.get(eventId):
#print(tweetId)
categories = tweetId2RunInfoCategories.get(tweetId)
if categories is None:
categories = []
tweetId2RunInfoCategories[tweetId] = categories
if any(categoryId in s for s in categories):
categoryVector.append(1)
else:
categoryVector.append(0)
eventCategories[categoryId] = categoryVector
event2prediction[eventId] = eventCategories
2020_01_27_houston_explosion.2020
2020_02_10_mideast_tornadoes.day1_mississipi.2020
2020_02_10_mideast_tornadoes.day2_al.2020
2020_02_10_mideast_tornadoes.day3_md.2019
2020_05_06_tn_derecho.2020
brooklynblockparty_shooting.2019
2016_puttingal_temple
2017_12_04_thomas_wildfire.2017
2017_12_07_lilac_wildfire.2017
2018_07_23_klamathon_wildfire.2018
2018_08_05_holy_wildfire.2018
2018_11_07_Woolsey_wildfire.2018
2018_maryland_flood
2018_pittsburgh_synagogue_shooting
2019_03_01_alberta_wildfire.2019.v2
2019_08_25_hurricane_dorian.2019
2019_10_10_saddleridge_wildfire.2019
2019_10_25_kincade_wildfire.2019
2019_durham_gas_explosion
2019_saugus_high_school_shooting
2019_townsville_flood
2020_easter_tornado_outbreak
2020_tornado_outbreak_of_april
2020_tornado_outbreak_of_march
2020_visakhapatnam_gas_leak
tornado_outbreak_of_november_30_december_2018
# -----------------------------------------------------------
# Stage 9: Make priority classification vectors
# -----------------------------------------------------------
category2GroundTruthPriority = {} # category -> vector of ground truth priority labels (Critical/High/Medium/Low) for tweets in that category
for categoryId in informationTypes2Index.keys():
categoryIdShort = categoryId.split("-")[1]
priorityVector = []
for i in range(len(index2TweetId)):
tweetId = index2TweetId[i]
categories = tweetId2TRECInfoCategories.get(tweetId)
if any(categoryIdShort in s for s in categories):
priority = tweetId2TRECPriorityCategory.get(tweetId)
priorityVector.append(priority)
category2GroundTruthPriority[categoryId] = priorityVector
category2PredictedPriority = {} # category -> vector of system-predicted priority labels for tweets in that category
category2PredictedPriorityScore = {} # category -> vector of system-predicted priority scores for tweets in that category
for categoryId in informationTypes2Index.keys():
categoryIdShort = categoryId.split("-")[1]
categoryVector = []
categoryScoreVector = []
for i in range(len(index2TweetId)):
tweetId = index2TweetId[i]
categories = tweetId2TRECInfoCategories.get(tweetId)
if any(categoryIdShort in s for s in categories):
if tweetId2RunPriorityCategory.get(tweetId):
priority = tweetId2RunPriorityCategory.get(tweetId)
priorityScore = tweetId2RunPriorityScore.get(tweetId)
categoryVector.append(priority)
categoryScoreVector.append(priorityScore)
else:
categoryVector.append("Low") # default to low priority
categoryScoreVector.append(0.25)
category2PredictedPriority[categoryId] = categoryVector
category2PredictedPriorityScore[categoryId] = categoryScoreVector
# --------------------------------------------------
# Disable Warnings (comment this out when debugging!)
# --------------------------------------------------
import warnings
# warnings.filterwarnings("ignore") # ignore warnings about 0-score categories
# --------------------------------------------------
# TREC-IS 2021A
# Priority-Centric Discounted Cumulative Gain
# --------------------------------------------------
import pandas as pd
def calc_dcg(scores, at_k=100):
position = 1
accumulator = 0.0
for score in scores[:at_k]:
numerator = 2 ** score - 1
denom = np.log2(position + 1)
accumulator += numerator / denom
position += 1
return accumulator
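# Quick worked example of calc_dcg on a toy relevance list (illustrative only):
# calc_dcg([4, 3, 1]) = (2**4 - 1)/log2(2) + (2**3 - 1)/log2(3) + (2**1 - 1)/log2(4)
#                     = 15.0 + ~4.42 + 0.5, or about 19.92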
priority_map = {
"Unknown": 1,
"Low": 1,
"Medium": 2,
"High": 3,
"Critical": 4,
}
at_k = 100
tweetId2TRECPriorityCategory_score = {
k:priority_map[v] for k,v in tweetId2TRECPriorityCategory.items()
}
tweetId2TRECPriorityCategory_scores_sorted = sorted(
tweetId2TRECPriorityCategory_score.values(),
reverse=True
)
best_dcg_per_event = {}
for event, rel_tweets in event2tweetIds.items():
print(event)
tweetId2TRECPriorityCategory_scores_sorted = sorted(
[tweetId2TRECPriorityCategory_score[x] for x in rel_tweets],
reverse=True
)
ideal_dcg = calc_dcg(tweetId2TRECPriorityCategory_scores_sorted, at_k)
print("\tBest DCG:", ideal_dcg)
best_dcg_per_event[event] = ideal_dcg
print("Mean:", np.mean(list(best_dcg_per_event.values())))
print()
# Code below calculates the DCG for a system's
# ranked priority tweets. We have to do some
# sampling here to break ties among tweets with
# the same priority scores.
# Build a dataframe from the system's provided
# priority scores, so we can identify what the
# top-most priorities are and get a count of
# the number of tweets in each priority bin.
priority_df = pd.DataFrame(
[(k, priority_map[v]) for k, v in tweetId2RunPriorityCategory.items()],
columns=["tweet_id", "priority"]
)
# Build metrics for each event
system_dcg_per_event = {}
for event, rel_tweets in event2tweetIds.items():
print("Event:", event)
local_priority_df = priority_df[priority_df["tweet_id"].isin(set(rel_tweets))]
unique_scores = local_priority_df["priority"].value_counts()
# Find the top priority scores that would be included
# in the necessary at_k values.
total = 0
top_keys = []
candidates = {}
for top in sorted(unique_scores.index, reverse=True):
# We store this key, so we can go back and shuffle
#. tweets with this score.
top_keys.append(top)
local_restricted_df = local_priority_df[local_priority_df["priority"] == top]
candidates[top] = list(local_restricted_df["tweet_id"])
total += local_restricted_df.shape[0]
# Once we have enough samples, stop.
if ( total > at_k ):
break
# Now we generate a distribution over the DCG for this
# system, repeating the sampling a number of times to remove
# dependence on our selection of the top k tweets
random_dcgs = []
for i in range(100):
local_tweet_ids = []
for top in top_keys:
this_top_tweets = candidates[top][:]
np.random.shuffle(this_top_tweets)
needed = at_k - len(local_tweet_ids)
local_tweet_ids.extend(this_top_tweets[:needed])
local_scores = [tweetId2TRECPriorityCategory_score[x] for x in local_tweet_ids]
random_dcgs.append(calc_dcg(local_scores))
system_dcg = np.mean(random_dcgs)
system_ndcg_ = system_dcg / best_dcg_per_event[event]
print("\tnDCG:", system_ndcg_)
system_dcg_per_event[event] = system_ndcg_
print()
system_ndcg_micro = np.mean(list(system_dcg_per_event.values()))
print("System Event-Micro nDCG:", system_ndcg_micro)
resultsFile.write("--------------------------------------------------"+"\n")
resultsFile.write("EVALUATON: nDCG and Priority"+"\n")
resultsFile.write("Overall performance"+"\n")
resultsFile.write("--------------------------------------------------"+"\n")
resultsFile.write("> nDCG:"+"\t"+str(system_ndcg_micro)+"\n")
resultsFile.write(""+"\n")
2020_01_27_houston_explosion.2020  Best DCG: 176.99559032459564
2020_02_10_mideast_tornadoes.day1_mississipi.2020  Best DCG: 268.88459894996123
2020_02_10_mideast_tornadoes.day2_al.2020  Best DCG: 270.1716952398847
2020_02_10_mideast_tornadoes.day3_md.2019  Best DCG: 135.38775246204446
2020_05_06_tn_derecho.2020  Best DCG: 167.06354661312534
brooklynblockparty_shooting.2019  Best DCG: 179.1756130795261
2016_puttingal_temple  Best DCG: 314.08006311421406
2017_12_04_thomas_wildfire.2017  Best DCG: 300.71399384300895
2017_12_07_lilac_wildfire.2017  Best DCG: 314.08006311421406
2018_07_23_klamathon_wildfire.2018  Best DCG: 221.46334445469358
2018_08_05_holy_wildfire.2018  Best DCG: 153.96993418707177
2018_11_07_Woolsey_wildfire.2018  Best DCG: 175.67469323453255
2018_maryland_flood  Best DCG: 285.7119531591263
2018_pittsburgh_synagogue_shooting  Best DCG: 111.85075929877581
2019_03_01_alberta_wildfire.2019.v2  Best DCG: 62.88708564345522
2019_08_25_hurricane_dorian.2019  Best DCG: 146.57069611996656
2019_10_10_saddleridge_wildfire.2019  Best DCG: 173.00802656786584
2019_10_25_kincade_wildfire.2019  Best DCG: 314.08006311421406
2019_durham_gas_explosion  Best DCG: 201.07148118577902
2019_saugus_high_school_shooting  Best DCG: 314.08006311421406
2019_townsville_flood  Best DCG: 314.08006311421406
2020_easter_tornado_outbreak  Best DCG: 214.9714167256293
2020_tornado_outbreak_of_april  Best DCG: 314.08006311421406
2020_tornado_outbreak_of_march  Best DCG: 267.51977363880474
2020_visakhapatnam_gas_leak  Best DCG: 314.08006311421406
tornado_outbreak_of_november_30_december_2018  Best DCG: 314.08006311421406
Mean: 231.7589407554446

Event: 2020_01_27_houston_explosion.2020  nDCG: 0.24705265665210815
Event: 2020_02_10_mideast_tornadoes.day1_mississipi.2020  nDCG: 0.41181498699377
Event: 2020_02_10_mideast_tornadoes.day2_al.2020  nDCG: 0.4635021433763508
Event: 2020_02_10_mideast_tornadoes.day3_md.2019  nDCG: 0.35573799327240435
Event: 2020_05_06_tn_derecho.2020  nDCG: 0.5905942210943509
Event: brooklynblockparty_shooting.2019  nDCG: 0.14455035294463595
Event: 2016_puttingal_temple  nDCG: 0.29964101255295456
Event: 2017_12_04_thomas_wildfire.2017  nDCG: 0.40168381636144224
Event: 2017_12_07_lilac_wildfire.2017  nDCG: 0.37507821937955427
Event: 2018_07_23_klamathon_wildfire.2018  nDCG: 0.5582142123214363
Event: 2018_08_05_holy_wildfire.2018  nDCG: 0.41724774420245725
Event: 2018_11_07_Woolsey_wildfire.2018  nDCG: 0.35425413701660324
Event: 2018_maryland_flood  nDCG: 0.3567802480754506
Event: 2018_pittsburgh_synagogue_shooting  nDCG: 0.9038488929651307
Event: 2019_03_01_alberta_wildfire.2019.v2  nDCG: 0.332956610407976
Event: 2019_08_25_hurricane_dorian.2019  nDCG: 0.38732858346211035
Event: 2019_10_10_saddleridge_wildfire.2019  nDCG: 0.5684541510465869
Event: 2019_10_25_kincade_wildfire.2019  nDCG: 0.590088706667267
Event: 2019_durham_gas_explosion  nDCG: 0.22110283869962846
Event: 2019_saugus_high_school_shooting  nDCG: 0.4479446140589797
Event: 2019_townsville_flood  nDCG: 0.6733553385045296
Event: 2020_easter_tornado_outbreak  nDCG: 0.5258689964592105
Event: 2020_tornado_outbreak_of_april  nDCG: 0.6473238895392965
Event: 2020_tornado_outbreak_of_march  nDCG: 0.34489922509514587
Event: 2020_visakhapatnam_gas_leak  nDCG: 0.4643018131832162
Event: tornado_outbreak_of_november_30_december_2018  nDCG: 0.9209722784791589
System Event-Micro nDCG: 0.46171529549275986
# --------------------------------------------------
# TREC-IS 2021-A
# Information Type Categorization
# Overall performance
# --------------------------------------------------
# Average performance over information types
# Macro averaged (information types have equal weight)
# Does not average across events (larger events have more impact)
# Positive class is the target class
# Precision, recall and F1 only consider the positive class
# Accuracy is an overall metric
# We report performance for all categories, high importance categories and low importance categories
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
avgPrecision = 0.0
avgRecall = 0.0
avgF1 = 0.0
avgAccuracy = 0.0
avgPrecisionHigh = 0.0
avgRecallHigh = 0.0
avgF1High = 0.0
avgAccuracyHigh = 0.0
avgPrecisionLow = 0.0
avgRecallLow = 0.0
avgF1Low = 0.0
avgAccuracyLow = 0.0
for categoryId in informationTypes2Index.keys():
categoryPrecision = precision_score(category2GroundTruth[categoryId], category2Predicted[categoryId], average='binary')
categoryRecall = recall_score(category2GroundTruth[categoryId], category2Predicted[categoryId], average='binary')
categoryF1 = f1_score(category2GroundTruth[categoryId], category2Predicted[categoryId], average='binary')
categoryAccuracy = accuracy_score(category2GroundTruth[categoryId], category2Predicted[categoryId])
avgPrecision = avgPrecision + categoryPrecision
avgRecall = avgRecall + categoryRecall
avgF1 = avgF1 + categoryF1
avgAccuracy = avgAccuracy + categoryAccuracy
if any(categoryId in s for s in highImportCategories):
avgPrecisionHigh = avgPrecisionHigh + categoryPrecision
avgRecallHigh = avgRecallHigh + categoryRecall
avgF1High = avgF1High + categoryF1
avgAccuracyHigh = avgAccuracyHigh + categoryAccuracy
else:
avgPrecisionLow = avgPrecisionLow + categoryPrecision
avgRecallLow = avgRecallLow + categoryRecall
avgF1Low = avgF1Low + categoryF1
avgAccuracyLow = avgAccuracyLow + categoryAccuracy
numInformationTypes = len(informationTypes2Index)
numHighInformationTypes = len(highImportCategories)
numLowInformationTypes = numInformationTypes - numHighInformationTypes
print("Information Type Precision (positive class, multi-type, macro): "+str(avgPrecision/numInformationTypes))
print("Information Type Recall (positive class, multi-type, macro): "+str(avgRecall/numInformationTypes))
print("Information Type F1 (positive class, multi-type, macro): "+str(avgF1/numInformationTypes))
print("Information Type Accuracy (overall, multi-type, macro): "+str(avgAccuracy/numInformationTypes))
print("High Importance Information Type Precision (positive class, multi-type, macro): "+str(avgPrecisionHigh/numHighInformationTypes))
print("High Importance Information Type Recall (positive class, multi-type, macro): "+str(avgRecallHigh/numHighInformationTypes))
print("High Importance Information Type F1 (positive class, multi-type, macro): "+str(avgF1High/numHighInformationTypes))
print("High Importance Information Type Accuracy (overall, multi-type, macro): "+str(avgAccuracyHigh/numHighInformationTypes))
print("Low Importance Information Type Precision (positive class, multi-type, macro): "+str(avgPrecisionLow/numLowInformationTypes))
print("Low Importance Information Type Recall (positive class, multi-type, macro): "+str(avgRecallLow/numLowInformationTypes))
print("Low Importance Information Type F1 (positive class, multi-type, macro): "+str(avgF1Low/numLowInformationTypes))
print("Low Importance Information Type Accuracy (overall, multi-type, macro): "+str(avgAccuracyLow/numLowInformationTypes))
resultsFile.write("--------------------------------------------------"+"\n")
resultsFile.write("EVALUATON: Information Type Categorization"+"\n")
resultsFile.write("Overall performance"+"\n")
resultsFile.write("--------------------------------------------------"+"\n")
resultsFile.write("> Information Type Precision (positive class, multi-type, macro):"+"\t"+str(avgPrecision/len(informationTypes2Index))+"\n")
resultsFile.write("> Information Type Recall (positive class, multi-type, macro):"+"\t"+str(avgRecall/len(informationTypes2Index))+"\n")
resultsFile.write("> Information Type F1 (positive class, multi-type, macro):"+"\t"+str(avgF1/len(informationTypes2Index))+"\n")
resultsFile.write("> Information Type Accuracy (overall, multi-type, macro):"+"\t"+str(avgAccuracy/len(informationTypes2Index))+"\n")
resultsFile.write("> High Importance Information Type Precision (positive class, multi-type, macro):"+"\t"+str(avgPrecisionHigh/numHighInformationTypes)+"\n")
resultsFile.write("> High Importance Information Type Recall (positive class, multi-type, macro):"+"\t"+str(avgRecallHigh/numHighInformationTypes)+"\n")
resultsFile.write("> High Importance Information Type F1 (positive class, multi-type, macro):"+"\t"+str(avgF1High/numHighInformationTypes)+"\n")
resultsFile.write("> High Importance Information Type Accuracy (overall, multi-type, macro):"+"\t"+str(avgAccuracyHigh/numHighInformationTypes)+"\n")
resultsFile.write("> Low Importance Information Type Precision (positive class, multi-type, macro):"+"\t"+str(avgPrecisionLow/numLowInformationTypes)+"\n")
resultsFile.write("> Low Importance Information Type Recall (positive class, multi-type, macro):"+"\t"+str(avgRecallLow/numLowInformationTypes)+"\n")
resultsFile.write("> Low Importance Information Type F1 (positive class, multi-type, macro):"+"\t"+str(avgF1Low/numLowInformationTypes)+"\n")
resultsFile.write("> Low Importance Information Type Accuracy (overall, multi-type, macro):"+"\t"+str(avgAccuracyLow/numLowInformationTypes)+"\n")
resultsFile.write(""+"\n")
Information Type Precision (positive class, multi-type, macro): 0.24369271856320984
Information Type Recall (positive class, multi-type, macro): 0.4298112854341195
Information Type F1 (positive class, multi-type, macro): 0.2923145682196217
Information Type Accuracy (overall, multi-type, macro): 0.8685443117311035
High Importance Information Type Precision (positive class, multi-type, macro): 0.2473093410887588
High Importance Information Type Recall (positive class, multi-type, macro): 0.33132986454368046
High Importance Information Type F1 (positive class, multi-type, macro): 0.2670083076252519
High Importance Information Type Accuracy (overall, multi-type, macro): 0.9537900900739573
Low Importance Information Type Precision (positive class, multi-type, macro): 0.2425506272393523
Low Importance Information Type Recall (positive class, multi-type, macro): 0.46091068150478465
Low Importance Information Type F1 (positive class, multi-type, macro): 0.30030601893363323
Low Importance Information Type Accuracy (overall, multi-type, macro): 0.8416245922544124
# --------------------------------------------------
# TREC-IS 2021-A
# Information Type Categorization
# Per Information Type Performance
# --------------------------------------------------
# Per Category Classification Performance with confusion matrices
# Performance on the target class is what we care about here,
# primarily with respect to recall, as we want the user to
# see all of the information for a given category. A small
# amount of noise being added to the feed is an acceptable
# cost for good recall.
#
# Does not average across events (larger events have more impact)
from sklearn.metrics import classification_report
perTopicFile.write("--------------------------------------------------"+"\n")
perTopicFile.write("EVALUATON: Information Type Categorization (Multi-type)"+"\n")
perTopicFile.write("Per Information Type Performance"+"\n")
perTopicFile.write("--------------------------------------------------"+"\n")
for categoryId in informationTypes2Index.keys():
target_names = ['Other Classes', categoryId]
try:
print(categoryId)
print(classification_report(category2GroundTruth[categoryId], category2Predicted[categoryId], target_names=target_names))
perTopicFile.write(categoryId+"\n")
perTopicFile.write(classification_report(category2GroundTruth[categoryId], category2Predicted[categoryId], target_names=target_names)+"\n")
perTopicFile.write(""+"\n")
except ValueError:
print("Category "+categoryId+" score calculation failed, likely due the category not being used by the run")
perTopicFile.write(""+"\n")
CallToAction-Donations precision recall f1-score support Other Classes 1.00 0.98 0.99 55275 CallToAction-Donations 0.26 0.66 0.37 568 accuracy 0.98 55843 macro avg 0.63 0.82 0.68 55843 weighted avg 0.99 0.98 0.98 55843 CallToAction-MovePeople precision recall f1-score support Other Classes 0.98 0.99 0.99 54646 CallToAction-MovePeople 0.46 0.31 0.37 1197 accuracy 0.98 55843 macro avg 0.72 0.65 0.68 55843 weighted avg 0.97 0.98 0.98 55843 CallToAction-Volunteer precision recall f1-score support Other Classes 1.00 0.99 0.99 55543 CallToAction-Volunteer 0.21 0.37 0.27 300 accuracy 0.99 55843 macro avg 0.60 0.68 0.63 55843 weighted avg 0.99 0.99 0.99 55843 Other-Advice precision recall f1-score support Other Classes 0.97 0.94 0.95 52602 Other-Advice 0.33 0.44 0.38 3241 accuracy 0.92 55843 macro avg 0.65 0.69 0.67 55843 weighted avg 0.93 0.92 0.92 55843 Other-ContextualInformation precision recall f1-score support Other Classes 0.97 0.95 0.96 54346 Other-ContextualInformation 0.03 0.07 0.05 1497 accuracy 0.93 55843 macro avg 0.50 0.51 0.50 55843 weighted avg 0.95 0.93 0.94 55843 Other-Discussion precision recall f1-score support Other Classes 0.99 0.89 0.94 55263 Other-Discussion 0.02 0.27 0.05 580 accuracy 0.88 55843 macro avg 0.51 0.58 0.49 55843 weighted avg 0.98 0.88 0.93 55843 Other-Irrelevant precision recall f1-score support Other Classes 0.57 0.80 0.66 23267 Other-Irrelevant 0.80 0.56 0.66 32576 accuracy 0.66 55843 macro avg 0.68 0.68 0.66 55843 weighted avg 0.70 0.66 0.66 55843 Other-Sentiment precision recall f1-score support Other Classes 0.95 0.86 0.91 51270 Other-Sentiment 0.25 0.51 0.34 4573 accuracy 0.83 55843 macro avg 0.60 0.69 0.62 55843 weighted avg 0.89 0.83 0.86 55843 Report-CleanUp precision recall f1-score support Other Classes 1.00 0.98 0.99 55581 Report-CleanUp 0.10 0.39 0.17 262 accuracy 0.98 55843 macro avg 0.55 0.69 0.58 55843 weighted avg 0.99 0.98 0.99 55843 Report-EmergingThreats precision recall f1-score support Other Classes 0.96 0.89 0.92 52454 Report-EmergingThreats 0.21 0.45 0.28 3389 accuracy 0.86 55843 macro avg 0.58 0.67 0.60 55843 weighted avg 0.92 0.86 0.89 55843 Report-Factoid precision recall f1-score support Other Classes 0.95 0.89 0.92 49844 Report-Factoid 0.39 0.59 0.47 5999 accuracy 0.86 55843 macro avg 0.67 0.74 0.69 55843 weighted avg 0.89 0.86 0.87 55843 Report-FirstPartyObservation precision recall f1-score support Other Classes 0.97 0.92 0.95 54135 Report-FirstPartyObservation 0.08 0.23 0.12 1708 accuracy 0.90 55843 macro avg 0.53 0.57 0.53 55843 weighted avg 0.95 0.90 0.92 55843 Report-Hashtags precision recall f1-score support Other Classes 0.92 0.74 0.82 48407 Report-Hashtags 0.26 0.59 0.36 7436 accuracy 0.72 55843 macro avg 0.59 0.67 0.59 55843 weighted avg 0.83 0.72 0.76 55843 Report-Location precision recall f1-score support Other Classes 0.86 0.56 0.68 41325 Report-Location 0.38 0.75 0.50 14518 accuracy 0.61 55843 macro avg 0.62 0.66 0.59 55843 weighted avg 0.74 0.61 0.64 55843 Report-MultimediaShare precision recall f1-score support Other Classes 0.93 0.64 0.76 48784 Report-MultimediaShare 0.21 0.65 0.32 7059 accuracy 0.64 55843 macro avg 0.57 0.65 0.54 55843 weighted avg 0.84 0.64 0.70 55843 Report-News precision recall f1-score support Other Classes 0.95 0.74 0.83 50324 Report-News 0.21 0.64 0.32 5519 accuracy 0.73 55843 macro avg 0.58 0.69 0.57 55843 weighted avg 0.88 0.73 0.78 55843 Report-NewSubEvent precision recall f1-score support Other Classes 0.98 0.96 0.97 54728 Report-NewSubEvent 0.06 0.12 0.08 1115 accuracy 0.95 55843 macro 
avg 0.52 0.54 0.53 55843 weighted avg 0.96 0.95 0.96 55843 Report-Official precision recall f1-score support Other Classes 0.96 0.96 0.96 53203 Report-Official 0.20 0.18 0.19 2640 accuracy 0.93 55843 macro avg 0.58 0.57 0.58 55843 weighted avg 0.92 0.93 0.92 55843 Report-OriginalEvent precision recall f1-score support Other Classes 0.95 0.96 0.95 52838 Report-OriginalEvent 0.09 0.07 0.08 3005 accuracy 0.91 55843 macro avg 0.52 0.52 0.52 55843 weighted avg 0.90 0.91 0.91 55843 Report-ServiceAvailable precision recall f1-score support Other Classes 0.98 0.97 0.97 53834 Report-ServiceAvailable 0.37 0.50 0.42 2009 accuracy 0.95 55843 macro avg 0.67 0.73 0.70 55843 weighted avg 0.96 0.95 0.95 55843 Report-ThirdPartyObservation precision recall f1-score support Other Classes 0.93 0.71 0.81 50379 Report-ThirdPartyObservation 0.15 0.48 0.23 5464 accuracy 0.69 55843 macro avg 0.54 0.60 0.52 55843 weighted avg 0.85 0.69 0.75 55843 Report-Weather precision recall f1-score support Other Classes 0.98 0.86 0.92 50824 Report-Weather 0.36 0.79 0.49 5019 accuracy 0.86 55843 macro avg 0.67 0.82 0.70 55843 weighted avg 0.92 0.86 0.88 55843 Request-GoodsServices precision recall f1-score support Other Classes 1.00 0.99 0.99 55452 Request-GoodsServices 0.28 0.30 0.29 391 accuracy 0.99 55843 macro avg 0.64 0.65 0.64 55843 weighted avg 0.99 0.99 0.99 55843 Request-InformationWanted precision recall f1-score support Other Classes 0.99 0.98 0.99 55241 Request-InformationWanted 0.27 0.51 0.35 602 accuracy 0.98 55843 macro avg 0.63 0.75 0.67 55843 weighted avg 0.99 0.98 0.98 55843 Request-SearchAndRescue precision recall f1-score support Other Classes 1.00 0.99 1.00 55737 Request-SearchAndRescue 0.10 0.32 0.16 106 accuracy 0.99 55843 macro avg 0.55 0.66 0.58 55843 weighted avg 1.00 0.99 1.00 55843
# --------------------------------------------------
# TREC-IS 2021-A
# Information Type Categorization
# Per Information Type F1 Graph
# --------------------------------------------------
# Per Category Classification Performance
# F1 scores for each information type, graphed
# Does not average across events (larger events have more impact)
N = len(informationTypes2Index)
ind = np.arange(N)
scoresPerCategoryF1 = []
categoryLabels = []
for categoryId in informationTypes2Index.keys():
localF1Score = f1_score(category2GroundTruth[categoryId], category2Predicted[categoryId], average='binary')
print(categoryId, localF1Score)
scoresPerCategoryF1.append(localF1Score)
categoryLabels.append(categoryId)
width = 0.90 # the width of the bars: can also be len(x) sequence
p1 = plt.bar(ind, scoresPerCategoryF1, width)
plt.ylabel('F1 Scores')
plt.title('F1 Scores by Information Type')
plt.xticks(ind, categoryLabels, rotation='vertical')
plt.yticks(np.arange(0, 1, 0.1))
plt.show()
CallToAction-Donations 0.37132901941264307
CallToAction-MovePeople 0.36940110719677904
CallToAction-Volunteer 0.2668269230769231
Other-Advice 0.3787261982928431
Other-ContextualInformation 0.045829514207149404
Other-Discussion 0.04564991848228842
Other-Irrelevant 0.659798650452856
Other-Sentiment 0.33754486719310833
Report-CleanUp 0.16506410256410256
Report-EmergingThreats 0.2839459864966241
Report-Factoid 0.46656976744186046
Report-FirstPartyObservation 0.12065813528336382
Report-Hashtags 0.36028018129377837
Report-Location 0.5004152440712375
Report-MultimediaShare 0.3152376353728358
Report-News 0.31933905078353014
Report-NewSubEvent 0.0831758034026465
Report-Official 0.1890880441205436
Report-OriginalEvent 0.08060726549792156
Report-ServiceAvailable 0.42138630600169064
Report-ThirdPartyObservation 0.23370047965890922
Report-Weather 0.49418095357276937
Request-GoodsServices 0.2881773399014778
Request-InformationWanted 0.3549684089603676
Request-SearchAndRescue 0.15596330275229356
# --------------------------------------------------
# TREC-IS 2021-A
# Information Type Categorization
# Per Event Performance
# --------------------------------------------------
# Categorization performance for each event
# Precision, recall and F1 only consider the positive class
# Accuracy is an overall metric
# We report performance for all categories, high importance categories and low importance categories
# Macro average (categories have equal weight)
perEventFile.write("--------------------------------------------------"+"\n")
perEventFile.write("EVALUATON: Information Type Categorization (Multi-type)"+"\n")
perEventFile.write("Per Event Performance"+"\n")
perEventFile.write("--------------------------------------------------"+"\n")
for eventId in eventIdentifiers:
tavgPrecision = 0.0
tavgRecall = 0.0
tavgF1 = 0.0
tavgAccuracy = 0.0
categoryCount = 0
for categoryId in informationTypes2Index.keys():
if sum(event2groundtruth[eventId].get(categoryId)) == 0:
continue
categoryPrecision = precision_score(event2groundtruth[eventId].get(categoryId), event2prediction[eventId].get(categoryId), average='binary')
categoryRecall = recall_score(event2groundtruth[eventId].get(categoryId), event2prediction[eventId].get(categoryId), average='binary')
categoryF1 = f1_score(event2groundtruth[eventId].get(categoryId), event2prediction[eventId].get(categoryId), average='binary')
categoryAccuracy = accuracy_score(event2groundtruth[eventId].get(categoryId), event2prediction[eventId].get(categoryId))
tavgPrecision = tavgPrecision + categoryPrecision
tavgRecall = tavgRecall + categoryRecall
tavgF1 = tavgF1 + categoryF1
tavgAccuracy = tavgAccuracy + categoryAccuracy
categoryCount += 1
if categoryCount == 0:
print("No categories for event:", eventId)
continue
print(eventId)
print(" Information Type Precision (positive class, multi-type, macro): "+str(tavgPrecision/categoryCount))
print(" Information Type Recall (positive class, multi-type, macro): "+str(tavgRecall/categoryCount))
print(" Information Type F1 (positive class, multi-type, macro): "+str(tavgF1/categoryCount))
print(" Information Type Accuracy (overall, multi-type, macro): "+str(tavgAccuracy/categoryCount))
print("")
perEventFile.write(eventId+"\n")
perEventFile.write(" Information Type Precision (positive class, multi-type, macro): "+str(tavgPrecision/len(informationTypes2Index))+"\n")
perEventFile.write(" Information Type Recall (positive class, multi-type, macro): "+str(tavgRecall/len(informationTypes2Index))+"\n")
perEventFile.write(" Information Type F1 (positive class, multi-type, macro): "+str(tavgF1/len(informationTypes2Index))+"\n")
perEventFile.write(" Information Type Accuracy (overall, multi-type, macro): "+str(tavgAccuracy/len(informationTypes2Index))+"\n")
perEventFile.write("\n")
perEventFile.write("\n")
2020_01_27_houston_explosion.2020 Information Type Precision (positive class, multi-type, macro): 0.18906570906832088 Information Type Recall (positive class, multi-type, macro): 0.41431186478276216 Information Type F1 (positive class, multi-type, macro): 0.20007615731841197 Information Type Accuracy (overall, multi-type, macro): 0.8641207124523607 2020_02_10_mideast_tornadoes.day1_mississipi.2020 Information Type Precision (positive class, multi-type, macro): 0.4980527504231765 Information Type Recall (positive class, multi-type, macro): 0.5669158258733621 Information Type F1 (positive class, multi-type, macro): 0.49637216759451563 Information Type Accuracy (overall, multi-type, macro): 0.8583850931677018
/Users/cbuntain/Development/anaconda3/lib/python3.8/site-packages/sklearn/metrics/_classification.py:1245: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 due to no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result))
2020_02_10_mideast_tornadoes.day2_al.2020 Information Type Precision (positive class, multi-type, macro): 0.21715853791818426 Information Type Recall (positive class, multi-type, macro): 0.46853999618277214 Information Type F1 (positive class, multi-type, macro): 0.25221616643311434 Information Type Accuracy (overall, multi-type, macro): 0.8901758698940999 2020_02_10_mideast_tornadoes.day3_md.2019 Information Type Precision (positive class, multi-type, macro): 0.13667121077503622 Information Type Recall (positive class, multi-type, macro): 0.39924406549341135 Information Type F1 (positive class, multi-type, macro): 0.1593473222461694 Information Type Accuracy (overall, multi-type, macro): 0.8404545454545456 2020_05_06_tn_derecho.2020 Information Type Precision (positive class, multi-type, macro): 0.254035584297931 Information Type Recall (positive class, multi-type, macro): 0.3717208528274767 Information Type F1 (positive class, multi-type, macro): 0.2516933541540216 Information Type Accuracy (overall, multi-type, macro): 0.8723851203501094 brooklynblockparty_shooting.2019 Information Type Precision (positive class, multi-type, macro): 0.13140706423281348 Information Type Recall (positive class, multi-type, macro): 0.523210171864904 Information Type F1 (positive class, multi-type, macro): 0.14855776526727227 Information Type Accuracy (overall, multi-type, macro): 0.8576797645561565 2016_puttingal_temple Information Type Precision (positive class, multi-type, macro): 0.16730457731014606 Information Type Recall (positive class, multi-type, macro): 0.3523575596007596 Information Type F1 (positive class, multi-type, macro): 0.18071716752589534 Information Type Accuracy (overall, multi-type, macro): 0.8550094517958411 2017_12_04_thomas_wildfire.2017 Information Type Precision (positive class, multi-type, macro): 0.28142013098305957 Information Type Recall (positive class, multi-type, macro): 0.4283335292065214 Information Type F1 (positive class, multi-type, macro): 0.31799595539560727 Information Type Accuracy (overall, multi-type, macro): 0.8331632991056642 2017_12_07_lilac_wildfire.2017 Information Type Precision (positive class, multi-type, macro): 0.3214191183158574 Information Type Recall (positive class, multi-type, macro): 0.45893910956330997 Information Type F1 (positive class, multi-type, macro): 0.34377294905818445 Information Type Accuracy (overall, multi-type, macro): 0.846729939603106 2018_07_23_klamathon_wildfire.2018 Information Type Precision (positive class, multi-type, macro): 0.34755115335548836 Information Type Recall (positive class, multi-type, macro): 0.46128757913890067 Information Type F1 (positive class, multi-type, macro): 0.348689384712254 Information Type Accuracy (overall, multi-type, macro): 0.8447636573374716 2018_08_05_holy_wildfire.2018 Information Type Precision (positive class, multi-type, macro): 0.15176496180192312 Information Type Recall (positive class, multi-type, macro): 0.5028524233414902 Information Type F1 (positive class, multi-type, macro): 0.18010556462457053 Information Type Accuracy (overall, multi-type, macro): 0.8920269642026964 2018_11_07_Woolsey_wildfire.2018 Information Type Precision (positive class, multi-type, macro): 0.1634253593130204 Information Type Recall (positive class, multi-type, macro): 0.33118681032365216 Information Type F1 (positive class, multi-type, macro): 0.1870395808692137 Information Type Accuracy (overall, multi-type, macro): 0.8567980414468467 2018_maryland_flood Information Type Precision (positive class, 
multi-type, macro): 0.2940747770542361 Information Type Recall (positive class, multi-type, macro): 0.4904045622118472 Information Type F1 (positive class, multi-type, macro): 0.33620375729361734 Information Type Accuracy (overall, multi-type, macro): 0.8567024824008895 2018_pittsburgh_synagogue_shooting Information Type Precision (positive class, multi-type, macro): 0.47606152375889216 Information Type Recall (positive class, multi-type, macro): 0.533813901130243 Information Type F1 (positive class, multi-type, macro): 0.4208606076413792 Information Type Accuracy (overall, multi-type, macro): 0.7521367521367521 2019_03_01_alberta_wildfire.2019.v2 Information Type Precision (positive class, multi-type, macro): 0.09588696664517639 Information Type Recall (positive class, multi-type, macro): 0.3781742129210757 Information Type F1 (positive class, multi-type, macro): 0.07771503739515583 Information Type Accuracy (overall, multi-type, macro): 0.8090595881293553
/Users/cbuntain/Development/anaconda3/lib/python3.8/site-packages/sklearn/metrics/_classification.py:1245: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 due to no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result))
2019_08_25_hurricane_dorian.2019 Information Type Precision (positive class, multi-type, macro): 0.23850813091385678 Information Type Recall (positive class, multi-type, macro): 0.3292052386036212 Information Type F1 (positive class, multi-type, macro): 0.22305240629786038 Information Type Accuracy (overall, multi-type, macro): 0.8437779618889807
/Users/cbuntain/Development/anaconda3/lib/python3.8/site-packages/sklearn/metrics/_classification.py:1245: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 due to no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result))
2019_10_10_saddleridge_wildfire.2019 Information Type Precision (positive class, multi-type, macro): 0.2548569950523884 Information Type Recall (positive class, multi-type, macro): 0.4443337776629937 Information Type F1 (positive class, multi-type, macro): 0.28466342639620096 Information Type Accuracy (overall, multi-type, macro): 0.8854134084037119 2019_10_25_kincade_wildfire.2019 Information Type Precision (positive class, multi-type, macro): 0.2816872728730423 Information Type Recall (positive class, multi-type, macro): 0.49914309551344394 Information Type F1 (positive class, multi-type, macro): 0.33512903879204087 Information Type Accuracy (overall, multi-type, macro): 0.8734559429041997 2019_durham_gas_explosion Information Type Precision (positive class, multi-type, macro): 0.23401551567907963 Information Type Recall (positive class, multi-type, macro): 0.46522526692525107 Information Type F1 (positive class, multi-type, macro): 0.2870242633216318 Information Type Accuracy (overall, multi-type, macro): 0.8580164726281893 2019_saugus_high_school_shooting Information Type Precision (positive class, multi-type, macro): 0.19995187124936853 Information Type Recall (positive class, multi-type, macro): 0.3536031150627514 Information Type F1 (positive class, multi-type, macro): 0.2297368056857633 Information Type Accuracy (overall, multi-type, macro): 0.880112068244543 2019_townsville_flood Information Type Precision (positive class, multi-type, macro): 0.3089657332795816 Information Type Recall (positive class, multi-type, macro): 0.44869635982138445 Information Type F1 (positive class, multi-type, macro): 0.3231828861692738 Information Type Accuracy (overall, multi-type, macro): 0.8674198017691569 2020_easter_tornado_outbreak Information Type Precision (positive class, multi-type, macro): 0.1501071514205996 Information Type Recall (positive class, multi-type, macro): 0.485594776367891 Information Type F1 (positive class, multi-type, macro): 0.18798202703605815 Information Type Accuracy (overall, multi-type, macro): 0.8397642015005359 2020_tornado_outbreak_of_april Information Type Precision (positive class, multi-type, macro): 0.26145533978956415 Information Type Recall (positive class, multi-type, macro): 0.45054781645477116 Information Type F1 (positive class, multi-type, macro): 0.2792072453241433 Information Type Accuracy (overall, multi-type, macro): 0.8508506284712071 2020_tornado_outbreak_of_march Information Type Precision (positive class, multi-type, macro): 0.1797162665078718 Information Type Recall (positive class, multi-type, macro): 0.4510845422577032 Information Type F1 (positive class, multi-type, macro): 0.1969310547875892 Information Type Accuracy (overall, multi-type, macro): 0.8232646794091173
/Users/cbuntain/Development/anaconda3/lib/python3.8/site-packages/sklearn/metrics/_classification.py:1245: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 due to no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result))
2020_visakhapatnam_gas_leak Information Type Precision (positive class, multi-type, macro): 0.2950390566472158 Information Type Recall (positive class, multi-type, macro): 0.2435544536319234 Information Type F1 (positive class, multi-type, macro): 0.2208546764956771 Information Type Accuracy (overall, multi-type, macro): 0.8413157061431286
tornado_outbreak_of_november_30_december_2018 Information Type Precision (positive class, multi-type, macro): 0.17693214818173578 Information Type Recall (positive class, multi-type, macro): 0.5263029817517529 Information Type F1 (positive class, multi-type, macro): 0.22346950382195596 Information Type Accuracy (overall, multi-type, macro): 0.845751633986928
# --------------------------------------------------
# TREC-IS 2021-B
# Information Type Categorization
# Per Event F1 Graph
# --------------------------------------------------
# Multi-type (1 vs All): Tweets have multiple information types, aim: predict all of them
# Macro average (categories have equal weight)
N = len(eventIdentifiers)
ind = np.arange(N)
scoresPerEventF1 = []
for eventId in eventIdentifiers:
    avgF1_ = 0.0
    for categoryId in informationTypes2Index.keys():
        avgF1_ = avgF1_ + f1_score(event2groundtruth[eventId].get(categoryId), event2prediction[eventId].get(categoryId), average='binary')
    scoresPerEventF1.append(avgF1_ / len(informationTypes2Index))
width = 0.90  # the width of the bars: can also be len(x) sequence
p1 = plt.bar(ind, scoresPerEventF1, width)
plt.ylabel('F1 Scores')
plt.title('F1 Category Scores by Event')
plt.xticks(ind, eventIdentifiers, rotation='vertical')
plt.yticks(np.arange(0, 1, 0.1))
plt.show()
/Users/cbuntain/Development/anaconda3/lib/python3.8/site-packages/sklearn/metrics/_classification.py:1492: UndefinedMetricWarning: F-score is ill-defined and being set to 0.0 due to no true nor predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(
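# The UndefinedMetricWarning messages above are emitted whenever an event has no
# predicted (or no true) positives for some information type. If that noise is
# unwanted, sklearn's zero_division parameter makes the 0.0 fallback explicit and
# silences the warning. A minimal sketch of the same per-event loop, assuming the
# event2groundtruth/event2prediction structures used above (scores are unchanged):
scoresPerEventF1_quiet = []
for eventId in eventIdentifiers:
    avgF1_ = 0.0
    for categoryId in informationTypes2Index.keys():
        avgF1_ += f1_score(event2groundtruth[eventId].get(categoryId),
                           event2prediction[eventId].get(categoryId),
                           average='binary',
                           zero_division=0)  # same 0.0 fallback the warning reports
    scoresPerEventF1_quiet.append(avgF1_ / len(informationTypes2Index))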
# --------------------------------------------------
# TREC-IS 2021-B
# Information Priority Level
# Overall Performance
# --------------------------------------------------
# How divergent is the system from the human priority labels?
# F1 performance over information types, higher is better
# Macro average (categories have equal weight)
from sklearn.metrics import mean_squared_error
priorityAvgf1 = 0.0
priorityAvgf1High = 0.0
priorityAvgf1Low = 0.0
for categoryId in informationTypes2Index.keys():
    groundTruthPriorities = category2GroundTruthPriority[categoryId]
    predictedPriorities = category2PredictedPriority[categoryId]
    f1 = f1_score(groundTruthPriorities, predictedPriorities, average='macro')
    priorityAvgf1 = priorityAvgf1 + f1
    if any(categoryId in s for s in highImportCategories):
        priorityAvgf1High = priorityAvgf1High + f1
    else:
        priorityAvgf1Low = priorityAvgf1Low + f1
print("Priority Label Prediction (F1, macro): "+str(priorityAvgf1/len(informationTypes2Index)))
resultsFile.write("--------------------------------------------------"+"\n")
resultsFile.write("EVALUATION: Information Priority Level"+"\n")
resultsFile.write("Overall Performance"+"\n")
resultsFile.write("--------------------------------------------------"+"\n")
resultsFile.write("> Priority Label Prediction (F1, macro): "+str(priorityAvgf1/len(informationTypes2Index))+"\n")
resultsFile.write("\n")
Priority Label Prediction (F1, macro): 0.26227911153135275
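# The priority-label F1 above treats the assessor labels (Critical/High/Medium/
# Low/Unknown) as unordered classes and macro-averages F1 across them, so rare
# labels count as much as common ones. A toy illustration with invented labels
# (not taken from any run), reusing the f1_score import from above:
toy_gt   = ["High", "Low", "Medium", "Critical", "Low", "Critical"]
toy_pred = ["High", "Medium", "Medium", "Critical", "Low", "Low"]
f1_score(toy_gt, toy_pred, average='macro')  # per-label F1s of 1.0, 0.5, 2/3, 2/3 -> ~0.71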
# --------------------------------------------------
# TREC-IS 2021-B
# Information Priority Level
# Overall Performance
# --------------------------------------------------
# How divergent is the system from the human priority labels?
# Use Pearson correlation here to capture parallel increases
priorityAvgCorr = 0.0
priorityAvgCorrHigh = 0.0
priorityAvgCorrLow = 0.0
for categoryId in informationTypes2Index.keys():
    if categoryId == "Other-Irrelevant":
        continue
    groundTruthPriorities = [priorityScoreMap[x] for x in category2GroundTruthPriority[categoryId]]
    predictedPriorities = category2PredictedPriorityScore[categoryId]
    # Pathological case: when the predictions have no variation, the Pearson
    # correlation is undefined, so leave this category's correlation at 0.0
    this_corr = 0.0
    if np.std(predictedPriorities) > 0.0:
        this_corr = np.corrcoef(groundTruthPriorities, predictedPriorities)[0,1]
    priorityAvgCorr = priorityAvgCorr + this_corr
    if any(categoryId in s for s in highImportCategories):
        priorityAvgCorrHigh = priorityAvgCorrHigh + this_corr
    else:
        priorityAvgCorrLow = priorityAvgCorrLow + this_corr
print("Priority Score Prediction (Pearson): "+str(priorityAvgCorr/(len(informationTypes2Index)-1)))
print("Priority Score Prediction, High (Pearson): "+str(priorityAvgCorrHigh/numHighInformationTypes))
print("Priority Score Prediction, Low (Pearson): "+str(priorityAvgCorrLow/(numLowInformationTypes-1)))
resultsFile.write("--------------------------------------------------"+"\n")
resultsFile.write("EVALUATION: Information Priority Score"+"\n")
resultsFile.write("Correlational Performance"+"\n")
resultsFile.write("--------------------------------------------------"+"\n")
resultsFile.write("> Priority Correlation (Pearson): "+str(priorityAvgCorr/(len(informationTypes2Index)-1))+"\n")
resultsFile.write("> Priority Correlation, High (Pearson): "+str(priorityAvgCorrHigh/numHighInformationTypes)+"\n")
resultsFile.write("> Priority Correlation, Low (Pearson): "+str(priorityAvgCorrLow/(numLowInformationTypes-1))+"\n")
resultsFile.write("\n")
Priority Score Prediction (Pearson): 0.331435585391442
Priority Score Prediction, High (Pearson): 0.2886311718647097
Priority Score Prediction, Low (Pearson): 0.3457037232336862
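# Pearson correlation rewards predicted priority scores that rise and fall with
# the assessor scores even when the absolute values differ, and it is undefined
# when the predictions never vary, which is the pathological case the guard in
# the loop above skips. A toy illustration with invented scores (not from any run):
toy_truth = [1.0, 0.75, 0.25, 0.5]
toy_good  = [0.9, 0.7, 0.1, 0.4]    # tracks the ground truth -> correlation near 1
toy_flat  = [0.5, 0.5, 0.5, 0.5]    # constant predictions -> correlation undefined
print(np.corrcoef(toy_truth, toy_good)[0, 1])  # ~0.996
print(np.std(toy_flat) > 0.0)                  # False, so the loop above keeps 0.0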
# --------------------------------------------------
# TREC-IS 2021-B
# Information Priority Level
# Per Information Type Performance
# --------------------------------------------------
# F1 per information type (macro averaged), higher is better
# Macro average (categories have equal weight)
N = len(informationTypes2Index)
ind = np.arange(N)
priorityCatF1Values = []
categoryLabels = []
for categoryId in informationTypes2Index.keys():
    groundTruthPriorities = category2GroundTruthPriority[categoryId]
    predictedPriorities = category2PredictedPriority[categoryId]
    priorityCatF1 = f1_score(groundTruthPriorities, predictedPriorities, average='macro')
    if math.isnan(priorityCatF1):
        priorityCatF1 = 0.0
    categoryLabels.append(categoryId)
    priorityCatF1Values.append(priorityCatF1)
width = 0.90  # the width of the bars: can also be len(x) sequence
p1 = plt.bar(ind, priorityCatF1Values, width)
plt.ylabel('Priority Label Prediction F1 (higher is better)')
plt.title('Priority Label Prediction F1 Per Information Type')
plt.xticks(ind, categoryLabels, rotation='vertical')
plt.yticks(np.arange(0, 1, 0.1))
plt.show()
resultLine = None
# Print the evaluation table row in latex
print("Run & NDCG & CF1-H & CF1-A & CAcc & PErr-H & PErr-A & PCorr-H & PCorr-A \\\\")
resultLine = (str.format('{0:.4f}', system_ndcg_micro) +
              " & " +
              str.format('{0:.4f}', avgF1High/numHighInformationTypes) +
              " & " +
              str.format('{0:.4f}', avgF1/numInformationTypes) +
              " & " +
              str.format('{0:.4f}', avgAccuracy/numInformationTypes) +
              " & " +
              str.format('{0:.4f}', priorityAvgf1High/numHighInformationTypes) +
              " & " +
              str.format('{0:.4f}', priorityAvgf1/len(informationTypes2Index)) +
              " & " +
              str.format('{0:.4f}', priorityAvgCorrHigh/numHighInformationTypes) +
              " & " +
              str.format('{0:.4f}', priorityAvgCorr/len(informationTypes2Index)) +
              " \\\\")
print(runName+" & "+resultLine)
resultsFile.write("--------------------------------------------------"+"\n")
resultsFile.write("LATEX"+"\n")
resultsFile.write("--------------------------------------------------"+"\n")
resultsFile.write(runName+" & "+resultLine + "\n")
Run & NDCG & CF1-H & CF1-A & CAcc & PErr-H & PErr-A & PCorr-H & PCorr-A \\
ens & 0.4617 & 0.2670 & 0.2923 & 0.8685 & 0.2817 & 0.2623 & 0.2886 & 0.3182 \\
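# If the row above is to be dropped into a paper, it can be wrapped in a small
# tabular environment; an illustrative sketch (not part of the official
# evaluation output), reusing runName and resultLine from the cell above:
print("\\begin{tabular}{l" + "r" * 8 + "}")
print("Run & NDCG & CF1-H & CF1-A & CAcc & PErr-H & PErr-A & PCorr-H & PCorr-A \\\\")
print("\\hline")
print(runName + " & " + resultLine)
print("\\end{tabular}")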
# Done
resultsFile.close()
perTopicFile.close()
perEventFile.close()
# header = [
# "Run",
# "date",
# "team",
# "description",
# "paper",
# "code",
# "nDCG@100",
# "Info-Type F1 [Actionable]",
# "Info-Type F1 [All]",
# "Info-Type Accuracy",
# "Priority F1 [Actionable]",
# "Priority F1 [All]",
# "Priority R [Actionable]",
# "Priority R [All]",
# ]
import csv
import json
if os.path.isfile("metadata.json"):
    this_cwd = os.getcwd()
    # Submission date is encoded as YYYYMMDD in the submissions/ directory name
    sub_date_ = this_cwd.partition("submissions/")[-1].partition("-")[0]
    sub_date = "%s/%s/%s" % (sub_date_[:4], sub_date_[4:6], sub_date_[6:])
    leaderboard_entry = None
    with open("metadata.json", "r") as in_file:
        metadata = json.load(in_file)
        # Columns follow the commented-out header list above
        leaderboard_entry = [
            runName,
            sub_date,
            metadata["organization"].lower(),
            metadata["model_description"],
            metadata["paper"] if metadata["paper"].startswith("http") else "",
            metadata["code"] if metadata["code"].startswith("http") else "",
            str.format('{0:.4f}', system_ndcg_micro),
            str.format('{0:.4f}', avgF1High/numHighInformationTypes),
            str.format('{0:.4f}', avgF1/numInformationTypes),
            str.format('{0:.4f}', avgAccuracy/numInformationTypes),
            str.format('{0:.4f}', priorityAvgf1High/numHighInformationTypes),
            str.format('{0:.4f}', priorityAvgf1/len(informationTypes2Index)),
            str.format('{0:.4f}', priorityAvgCorrHigh/numHighInformationTypes),
            str.format('{0:.4f}', priorityAvgCorr/len(informationTypes2Index)),
        ]
    with open(runName+".v"+str(version)+"."+edition+".leaderboard.csv", "w") as csvResultsFile:
        leader_writer = csv.writer(csvResultsFile)
        leader_writer.writerow(leaderboard_entry)
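# For reference, the leaderboard cell above reads only four fields from
# metadata.json. An invented example of the shape it expects (placeholder
# values, not a real submission):
example_metadata = {
    "organization": "ExampleTeam",
    "model_description": "Example description of the submitted run",
    "paper": "https://example.org/paper.pdf",
    "code": "https://github.com/example/repo",
}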