# --------------------------------------------------
# TREC IS 2021b Evaluation Script
# Configured for 2021-B Events
# Used to evaluate TREC-IS runs
# --------------------------------------------------
version = 3.0 # Notebook Version Number
edition = "2021b.all"
import os
cwd = os.getcwd()
# Configuration Information
# Do we try and normalize the run priority scores?
enablePriorityNorm = True
# Do we try and normalize the run category scores?
enableCategoryNorm = True
# Default score threshold
defaultScoreThreshold = 0.5
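# Hedged sketch (assumption; the threshold is not applied elsewhere in the
# code shown here): if a run supplied only per-category probabilities, the
# threshold could binarize them into labels, e.g.
#   info_type_labels = [1 if p >= defaultScoreThreshold else 0 for p in info_type_scores]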
taskCategories = [
"CallToAction-Donations",
"CallToAction-MovePeople",
"CallToAction-Volunteer",
"Other-Advice",
"Other-ContextualInformation",
"Other-Discussion",
"Other-Irrelevant",
"Other-Sentiment",
"Report-CleanUp",
"Report-EmergingThreats",
"Report-Factoid",
"Report-FirstPartyObservation",
"Report-Hashtags",
"Report-Location",
"Report-MultimediaShare",
"Report-News",
"Report-NewSubEvent",
"Report-Official",
"Report-OriginalEvent",
"Report-ServiceAvailable",
"Report-ThirdPartyObservation",
"Report-Weather",
"Request-GoodsServices",
"Request-InformationWanted",
"Request-SearchAndRescue",
]
# What we consider to be highly important categories of information
highImportCategories = [
"Request-GoodsServices",
"Request-SearchAndRescue",
"CallToAction-MovePeople",
"Report-EmergingThreats",
"Report-NewSubEvent",
"Report-ServiceAvailable"
]
highImportCategoriesShort = [
"GoodsServices",
"SearchAndRescue",
"MovePeople",
"EmergingThreats",
"NewSubEvent",
"ServiceAvailable"
]
# Priority map
priorityScoreMap = {
"Critical": 1.0,
"High": 0.75,
"Medium": 0.5,
"Low": 0.25,
"Unknown": 0.25,
}
# Parameters
var_lambda = 0.75 # weight to place on actionable information categories in comparison to non-actionable categories
var_alpha = 0.3 # Flat gain for providing a correct alert, regardless of the categories selected
# Events with no data, so we should skip them
# Updated from 2021a and 2021b, so we use *all* data
skipEvents = [
# '2015_09_28_hurricane_joaquin.2015',
# '2017_03_23_cyclone_debbie.2017',
# '2018_02_24_anticyclone_hartmut.2018',
# '2018_07_13_ferguson_wildfire.2018',
# '2018_07_23_cranston_wildfire.2018',
# '2018_09_07_hurricane_florence.2018',
# '2018_10_07_hurricane_michael.2018',
# '2019_09_17_tropicalstorm_imelda.2019',
# '2019_karnataka_floods',
# '2019_spring_floods_in_ontario_quebec_and_new_brunswick',
# '2020_01_28_bar_shooting_nc.2020',
# '2020_02_07_rutherford_tn_floods.2020',
# '2020_05_26_edenville_dam_failure.2020.corrected',
# '2020_08_27_hurricane_laura.2020',
# '2020_09_11_hurricane_sally.2020',
# '2020_afghanistan_flood',
# '2020_hpakant_jade_mine_disaster',
# '2020_kerala_floods',
# 'T2020_02_03_texas_university_shooting.2020',
# 'UNASSIGNED',
# 'indonesia_earthquake.2019'
"2020_05_26_edenville_dam_failure.2020.corrected",
"2018_10_07_hurricane_michael.2018",
"2020_01_28_bar_shooting_nc.2020",
"T2020_02_03_texas_university_shooting.2020",
"2020_02_07_rutherford_tn_floods.2020",
"UNASSIGNED",
"indonesia_earthquake.2019",
"2015_09_28_hurricane_joaquin.2015",
"2017_03_23_cyclone_debbie.2017",
"2018_02_24_anticyclone_hartmut.2018",
"2018_07_13_ferguson_wildfire.2018",
"2018_07_23_cranston_wildfire.2018",
"2018_09_07_hurricane_florence.2018",
"2019_09_17_tropicalstorm_imelda.2019",
"2019_karnataka_floods",
"2019_spring_floods_in_ontario_quebec_and_new_brunswick",
"2020_08_27_hurricane_laura.2020",
"2020_09_11_hurricane_sally.2020",
"2020_afghanistan_flood",
"2020_hpakant_jade_mine_disaster",
"2020_kerala_floods",
]
import glob
runFile = None
for f in glob.glob("*.gz"):
runFile = f
print("Run File:", f)
Run File: njit_eda.run.json.gz
import gzip
import json
runName = None
with gzip.open(runFile, "r") as inRunFile:
for line in inRunFile:
line = line.decode("utf8")
# runName = line.rpartition("\t")[2].strip()
runName = json.loads(line)["runtag"]
break
print("Run Name:", runName)
Run Name: njit_eda
# Do we try and normalize the run priority scores? (overrides the setting above)
enablePriorityNorm = False
dataDir = "../../data/2021b"
# The location of the topics file
topicsFile = "%s/2021a.topics" % dataDir
# The location of the ground truth data against which to compare the run
classificationLabelFiles = [
# "%s/TRECIS-2021A-crisis.labels.prelim.json" % dataDir,
# "%s/TRECIS-2021A-crisis.labels.prelim.pt2.json" % dataDir,
# "%s/TRECIS-crisis.labels.2021b.json" % dataDir,
"%s/TRECIS-crisis.labels.2021.all.json" % dataDir,
]
# The location of the ontology file
ontologyFile = "%s/TRECIS-2021A-ITypes.json" % dataDir
topicArray = []
with open(topicsFile, "r") as inTopicsFile:
topicNum = None
topicDataset = None
for line_ in inTopicsFile:
line = line_.strip()
if line == "</top>":
if topicDataset in skipEvents:
continue
topicArray.append((topicDataset, topicNum))
if line.startswith("<num>"):
topicNum = line.partition("<num>")[2].partition("</num>")[0]
if line.startswith("<dataset>"):
topicDataset = line.partition("<dataset>")[2].partition("</dataset>")[0]
for row in topicArray:
print(row)
('2020_01_27_houston_explosion.2020', 'TRECIS-CTIT-H-076')
('2020_02_10_mideast_tornadoes.day1_mississipi.2020', 'TRECIS-CTIT-H-080')
('2020_02_10_mideast_tornadoes.day2_al.2020', 'TRECIS-CTIT-H-081')
('2020_02_10_mideast_tornadoes.day3_md.2019', 'TRECIS-CTIT-H-082')
('2020_05_06_tn_derecho.2020', 'TRECIS-CTIT-H-083')
('brooklynblockparty_shooting.2019', 'TRECIS-CTIT-H-085')
('2016_puttingal_temple', 'TRECIS-CTIT-H-089')
('2017_12_04_thomas_wildfire.2017', 'TRECIS-CTIT-H-091')
('2017_12_07_lilac_wildfire.2017', 'TRECIS-CTIT-H-092')
('2018_07_23_klamathon_wildfire.2018', 'TRECIS-CTIT-H-096')
('2018_08_05_holy_wildfire.2018', 'TRECIS-CTIT-H-097')
('2018_11_07_Woolsey_wildfire.2018', 'TRECIS-CTIT-H-100')
('2018_maryland_flood', 'TRECIS-CTIT-H-101')
('2018_pittsburgh_synagogue_shooting', 'TRECIS-CTIT-H-102')
('2019_03_01_alberta_wildfire.2019.v2', 'TRECIS-CTIT-H-103')
('2019_08_25_hurricane_dorian.2019', 'TRECIS-CTIT-H-104')
('2019_10_10_saddleridge_wildfire.2019', 'TRECIS-CTIT-H-106')
('2019_10_25_kincade_wildfire.2019', 'TRECIS-CTIT-H-107')
('2019_durham_gas_explosion', 'TRECIS-CTIT-H-108')
('2019_saugus_high_school_shooting', 'TRECIS-CTIT-H-110')
('2019_townsville_flood', 'TRECIS-CTIT-H-112')
('2020_easter_tornado_outbreak', 'TRECIS-CTIT-H-116')
('2020_tornado_outbreak_of_april', 'TRECIS-CTIT-H-119')
('2020_tornado_outbreak_of_march', 'TRECIS-CTIT-H-120')
('2020_visakhapatnam_gas_leak', 'TRECIS-CTIT-H-121')
('tornado_outbreak_of_november_30_december_2018', 'TRECIS-CTIT-H-122')
# --------------------------------------------------
# Static data for the 2021 edition
# --------------------------------------------------
# Identifiers for the test events
eventidTopicidMap = dict(topicArray)
eventIdentifiers = list(eventidTopicidMap.keys())
resultsFile = open(runName+".results.v"+str(version)+"."+edition+".overall.txt","w+")
resultsFile.write("TREC-IS "+edition+" Notebook Evaluator v"+str(version)+"\n")
resultsFile.write("Run: "+runName+" ("+runFile+")"+"\n")
resultsFile.write(""+"\n")
perTopicFile = open(runName+".results.v"+str(version)+"."+edition+".pertopic.txt","w+")
perTopicFile.write("TREC-IS "+edition+" Notebook Evaluator v"+str(version)+"\n")
perTopicFile.write("Run: "+runName+" ("+runFile+")"+"\n")
perTopicFile.write(""+"\n")
perEventFile = open(runName+".results.v"+str(version)+"."+edition+".perevent.txt","w+")
perEventFile.write("TREC-IS "+edition+" Notebook Evaluator v"+str(version)+"\n")
perEventFile.write("Run: "+runName+" ("+runFile+")"+"\n")
perEventFile.write(""+"\n")
# --------------------------------------------------
# Processing Starts Here
# --------------------------------------------------
import json
import gzip
import math
import numpy as np
from pprint import pprint
import matplotlib.pyplot as plt
# --------------------------------------------------
# Stage 1: Load the ground truth dataset
# --------------------------------------------------
groundtruthJSON = []
for groundtruthFile in classificationLabelFiles:
print("Reading "+groundtruthFile)
with open(groundtruthFile, encoding='iso-8859-1') as groundtruthJSONFile:
groundtruthJSON.append(json.load(groundtruthJSONFile))
#pprint(groundtruthJSON["events"])
# --------------------------------------------------
# Stage 2: Load run file
# --------------------------------------------------
with gzip.open(runFile, "r") as openRunFile:
# runContents = [line.decode("utf8") for line in openRunFile.readlines()] # lines not yet decoded
runContents = [json.loads(line.decode("utf8")) for line in openRunFile.readlines()] # decode and parse each JSON line
#pprint(runContents[0])
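# For reference, each run line is a JSON object; the fields consumed later in
# this notebook are "runtag" (run name), "topic" (event id), "tweet_id",
# "info_type_labels" (binary flags aligned with taskCategories),
# "info_type_scores" (per-category probabilities), and "priority" (a float).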
Reading ../../data/2021b/TRECIS-crisis.labels.2021.all.json
# --------------------------------------------------
# Stage 3: Load the categories
# --------------------------------------------------
with open(ontologyFile, encoding='utf-8') as ontologyJSONFile:
ontologyJSON = json.load(ontologyJSONFile)
informationTypes2Index = {} # category -> numerical index
informationTypesShort2Index = {} # category short form (e.g. EmergingThreats instead of Report-EmergingThreats) -> numerical index
for informationTypeJSON in ontologyJSON["informationTypes"]:
informationTypeId = informationTypeJSON["id"]
informationTypeIndex = taskCategories.index(informationTypeId)
informationTypes2Index[informationTypeId] = informationTypeIndex
informationTypesShort2Index[informationTypeId.split("-")[1]] = informationTypeIndex
# -----------------------------------------------------------
# Stage 4: Produce ground truth maps between tweetIds and categories
# -----------------------------------------------------------
# Notes: Ground truth is used as the base; if a run includes tweets
# not in the ground truth, they will be ignored
# Assumptions: A tweet will not be returned for multiple events
tweetId2TRECInfoCategories = {} # tweet id -> array of categories selected by assessors
tweetId2TRECHighImportInfoCategories = {} # tweet id -> array of high-importance categories selected by assessors
tweetId2TRECLowImportInfoCategories = {} # tweet id -> array of low-importance categories selected by assessors
tweetId2TRECPriorityCategory = {} # tweet id -> priority label (Critical,High,Medium,Low)
index2TweetId = {} # ordered tweets
event2tweetIds = {} # event -> tweet ids for tweets within that event
countHighCriticalImport = 0
countLowMediumImport = 0
tweetsSeen = []
invertedPriorityScoreMap = {
v:k for k,v in priorityScoreMap.items()
}
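# Note: priorityScoreMap is not one-to-one ("Low" and "Unknown" both map to
# 0.25), so this inversion keeps the last entry seen, i.e. 0.25 -> "Unknown".
# Duplicate tweets merged below whose maximum score is 0.25 will therefore be
# relabelled "Unknown" rather than "Low".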
tweetIndex = 0
for groundtruth in groundtruthJSON:
for eventJSON in groundtruth["events"]:
eventid = eventJSON["eventid"]
print(eventid)
if eventid in skipEvents:
continue
if not event2tweetIds.get(eventid):
event2tweetIds[eventid] = []
if any(eventid in s for s in eventIdentifiers):
# iterate over tweets in the event
for tweetJSON in eventJSON["tweets"]:
tweetid = tweetJSON["postID"]
categories = tweetJSON["postCategories"]
priority = tweetJSON["postPriority"]
if priority == "High" or priority == "Critical":
countHighCriticalImport = countHighCriticalImport + 1
if priority == "Low" or priority == "Medium":
countLowMediumImport = countLowMediumImport + 1
# check categories for name issues and correct if possible
cleanedCategories = []
highImportCats = []
lowImportCats = []
for categoryId in categories:
if not any(categoryId in s for s in informationTypesShort2Index.keys()):
# print("Found unknown category in ground truth "+categoryId+", ignoring...")
pass
else:
cleanedCategories.append(categoryId)
if any(categoryId in s for s in highImportCategoriesShort):
highImportCats.append(categoryId)
else:
lowImportCats.append(categoryId)
if tweetid not in tweetsSeen:
event2tweetIds[eventid].append(tweetid)
tweetId2TRECInfoCategories[tweetid] = cleanedCategories
tweetId2TRECHighImportInfoCategories[tweetid] = highImportCats
tweetId2TRECLowImportInfoCategories[tweetid] = lowImportCats
tweetId2TRECPriorityCategory[tweetid] = priority
index2TweetId[tweetIndex] = tweetid
tweetIndex = tweetIndex + 1
tweetsSeen.append(tweetid)
else:
tweetId2TRECInfoCategories[tweetid] = list(set(
cleanedCategories + tweetId2TRECInfoCategories[tweetid]
))
prePriorityScore = priorityScoreMap[tweetId2TRECPriorityCategory[tweetid]]
thisPriorityScore = priorityScoreMap[priority]
tweetId2TRECPriorityCategory[tweetid] = invertedPriorityScoreMap[
max(prePriorityScore, thisPriorityScore)
]
else:
print("WARN: Found ground truth data for event not in the topic set "+eventid+", ignoring...")
2020_01_27_houston_explosion.2020
2020_01_28_bar_shooting_nc.2020
T2020_02_03_texas_university_shooting.2020
2020_02_07_rutherford_tn_floods.2020
2020_02_10_mideast_tornadoes.day1_mississipi.2020
2020_02_10_mideast_tornadoes.day2_al.2020
2020_02_10_mideast_tornadoes.day3_md.2019
2020_05_06_tn_derecho.2020
2020_05_26_edenville_dam_failure.2020.corrected
brooklynblockparty_shooting.2019
UNASSIGNED
indonesia_earthquake.2019
2015_09_28_hurricane_joaquin.2015
2016_puttingal_temple
2017_03_23_cyclone_debbie.2017
2017_12_04_thomas_wildfire.2017
2017_12_07_lilac_wildfire.2017
2018_02_24_anticyclone_hartmut.2018
2018_07_13_ferguson_wildfire.2018
2018_07_23_cranston_wildfire.2018
2018_07_23_klamathon_wildfire.2018
2018_08_05_holy_wildfire.2018
2018_09_07_hurricane_florence.2018
2018_10_07_hurricane_michael.2018
2018_11_07_Woolsey_wildfire.2018
2018_maryland_flood
2018_pittsburgh_synagogue_shooting
2019_03_01_alberta_wildfire.2019.v2
2019_08_25_hurricane_dorian.2019
2019_09_17_tropicalstorm_imelda.2019
2019_10_10_saddleridge_wildfire.2019
2019_10_25_kincade_wildfire.2019
2019_durham_gas_explosion
2019_karnataka_floods
2019_saugus_high_school_shooting
2019_spring_floods_in_ontario_quebec_and_new_brunswick
2019_townsville_flood
2020_08_27_hurricane_laura.2020
2020_09_11_hurricane_sally.2020
2020_afghanistan_flood
2020_easter_tornado_outbreak
2020_hpakant_jade_mine_disaster
2020_kerala_floods
2020_tornado_outbreak_of_april
2020_tornado_outbreak_of_march
2020_visakhapatnam_gas_leak
tornado_outbreak_of_november_30_december_2018
# -----------------------------------------------------------
# Stage 5: Produce run predicted maps between tweetIds and categories
# -----------------------------------------------------------
tweetId2RunInfoCategories = {} # tweet id -> categories predicted by the participant system
tweetId2RunHighImportInfoCategories = {} # tweet id -> predicted high-importance categories
tweetId2RunLowImportInfoCategories = {} # tweet id -> predicted low-importance categories
tweetId2RunInfoCategoriesProb = {} # tweet id -> predicted category probabilities by participant system
tweetId2RunInfoCategoriesProbNorm = {} # tweet id -> normalized predicted category probabilities
tweetId2RunPriorityScore = {} # tweet id -> importance score from participant system
tweetId2RunPriorityCategory = {} # tweet id -> importance category (Critical, High, Medium, Low)
tweetId2RunPriorityScoreNorm = {} # tweet id -> normalized importance score from participant system
event2TweetIdRank = {} # event -> (rank,tweetid)
maxPrediction = -999999
minPrediction = 999999
maxCategory = -999999
minCategory = 999999
for predictionParts in runContents:
#print(predictionParts)
if len(predictionParts) < 6:
print("Skipping malformed run entry:", predictionParts)
continue
else:
eventId = predictionParts["topic"]
if eventId in skipEvents:
continue
tweetId = predictionParts["tweet_id"]
rank = 0
#print(predictionParts[5])
category_scores = predictionParts["info_type_scores"]
category_labels = predictionParts["info_type_labels"]
priority = float(predictionParts["priority"])
if priority > maxPrediction:
maxPrediction = priority
if priority < minPrediction:
minPrediction = priority
cleanedCategories = []
cleanedCategoriesProbs = []
highImportCats = []
lowImportCats = []
# Handle category flags
for catIndex, categoryLabel in enumerate(category_labels):
# check if we have a binary flag for this label
if categoryLabel == 0:
# False flag, so skip
continue
categoryId = taskCategories[catIndex]
if not any(categoryId in s for s in informationTypes2Index.keys()):
print("Found unknown category in run "+categoryId+", ignoring...")
else:
cleanedCategories.append(categoryId)
if any(categoryId in s for s in highImportCategories):
highImportCats.append(categoryId)
else:
lowImportCats.append(categoryId)
# Process category probabilities
for categoryProbability in category_scores:
if categoryProbability > maxCategory:
maxCategory = categoryProbability
if categoryProbability < minCategory:
minCategory = categoryProbability
cleanedCategoriesProbs.append(categoryProbability)
tweetId2RunHighImportInfoCategories[tweetId] = highImportCats
tweetId2RunLowImportInfoCategories[tweetId] = lowImportCats
tweetId2RunInfoCategories[tweetId] = cleanedCategories
tweetId2RunInfoCategoriesProb[tweetId] = cleanedCategoriesProbs
tweetId2RunPriorityScore[tweetId] = priority
if priority > priorityScoreMap["High"]:
tweetId2RunPriorityCategory[tweetId] = "Critical"
elif priority > priorityScoreMap["Medium"]:
tweetId2RunPriorityCategory[tweetId] = "High"
elif priority > priorityScoreMap["Low"]:
tweetId2RunPriorityCategory[tweetId] = "Medium"
else:
tweetId2RunPriorityCategory[tweetId] = "Low"
if not event2TweetIdRank.get(eventId):
event2TweetIdRank[eventId] = []
rankTuple = (tweetId,rank)
event2TweetIdRank.get(eventId).append(rankTuple)
for eventId in event2TweetIdRank.keys():
tweetsSorted = sorted(event2TweetIdRank.get(eventId), key=lambda tup: tup[1])
event2TweetIdRank[eventId] = tweetsSorted
for i in range(len(index2TweetId)):
tweetId = index2TweetId[i]
if tweetId2RunPriorityScore.get(tweetId):
if enablePriorityNorm:
if (maxPrediction-minPrediction) == 0.0:
tweetId2RunPriorityScoreNorm[tweetId] = 0.0
else:
tweetId2RunPriorityScoreNorm[tweetId] = (tweetId2RunPriorityScore.get(tweetId)-minPrediction)/(maxPrediction-minPrediction)
else:
tweetId2RunPriorityScoreNorm[tweetId] = tweetId2RunPriorityScore.get(tweetId)
else:
tweetId2RunPriorityScoreNorm[tweetId] = 0.0
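# Worked example (illustration only): with minPrediction = 0.2 and
# maxPrediction = 0.9, a raw priority of 0.55 normalizes to
# (0.55 - 0.2) / (0.9 - 0.2) = 0.5; tweets absent from the run default to 0.0.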
# --------------------------------------------------
# Stage 6: Create ground truth vectors per category
# --------------------------------------------------
category2GroundTruth = {} # category -> tweet vector with binary 1 vs all ground truth category labels
for categoryId in informationTypes2Index.keys():
categoryIdShort = categoryId.split("-")[1]
categoryVector = []
for i in range(len(index2TweetId)):
tweetId = index2TweetId[i]
categories = tweetId2TRECInfoCategories.get(tweetId)
#pprint(categories)
if any(categoryIdShort in s for s in categories):
categoryVector.append(1)
else:
categoryVector.append(0)
category2GroundTruth[categoryId] = categoryVector
#pprint(category2GroundTruth)
# --------------------------------------------------
# Stage 7: Create run vectors per category
# --------------------------------------------------
# Assumptions: If run misses a tweet, we assume it has
# no categories
category2Predicted = {} # category -> tweet vector with binary 1 vs all predicted by system labels
for categoryId in informationTypes2Index.keys():
categoryIdShort = categoryId.split("-")[1]
categoryVector = []
for i in range(len(index2TweetId)):
tweetId = index2TweetId[i]
if tweetId2RunInfoCategories.get(tweetId):
categories = tweetId2RunInfoCategories.get(tweetId)
if any(categoryIdShort in s for s in categories):
categoryVector.append(1)
else:
categoryVector.append(0)
else:
categoryVector.append(0)
category2Predicted[categoryId] = categoryVector
#pprint(category2Predicted)
# --------------------------------------------------
# Stage 8: Make event category vectors
# --------------------------------------------------
event2groundtruth = {} # event -> category -> tweet vector with binary 1 vs all ground truth category labels
for eventId in eventIdentifiers:
eventCategories = {}
for categoryId in informationTypes2Index.keys():
categoryIdShort = categoryId.split("-")[1]
categoryVector = []
# print(eventId)
for tweetId in event2tweetIds.get(eventId):
# print(tweetId)
categories = tweetId2TRECInfoCategories.get(tweetId)
if any(categoryIdShort in s for s in categories):
categoryVector.append(1)
else:
categoryVector.append(0)
eventCategories[categoryId] = categoryVector
event2groundtruth[eventId] = eventCategories
event2prediction = {} # event -> category -> tweet vector with binary 1 vs all predicted by system labels
for eventId in eventIdentifiers:
print(eventId)
eventCategories = {}
for categoryId in informationTypes2Index.keys():
categoryIdShort = categoryId.split("-")[1]
categoryVector = []
# print(tweetId)
for tweetId in event2tweetIds.get(eventId):
#print(tweetId)
categories = tweetId2RunInfoCategories.get(tweetId)
if categories is None:
categories = []
tweetId2RunInfoCategories[tweetId] = categories
if any(categoryId in s for s in categories):
categoryVector.append(1)
else:
categoryVector.append(0)
eventCategories[categoryId] = categoryVector
event2prediction[eventId] = eventCategories
2020_01_27_houston_explosion.2020
2020_02_10_mideast_tornadoes.day1_mississipi.2020
2020_02_10_mideast_tornadoes.day2_al.2020
2020_02_10_mideast_tornadoes.day3_md.2019
2020_05_06_tn_derecho.2020
brooklynblockparty_shooting.2019
2016_puttingal_temple
2017_12_04_thomas_wildfire.2017
2017_12_07_lilac_wildfire.2017
2018_07_23_klamathon_wildfire.2018
2018_08_05_holy_wildfire.2018
2018_11_07_Woolsey_wildfire.2018
2018_maryland_flood
2018_pittsburgh_synagogue_shooting
2019_03_01_alberta_wildfire.2019.v2
2019_08_25_hurricane_dorian.2019
2019_10_10_saddleridge_wildfire.2019
2019_10_25_kincade_wildfire.2019
2019_durham_gas_explosion
2019_saugus_high_school_shooting
2019_townsville_flood
2020_easter_tornado_outbreak
2020_tornado_outbreak_of_april
2020_tornado_outbreak_of_march
2020_visakhapatnam_gas_leak
tornado_outbreak_of_november_30_december_2018
# -----------------------------------------------------------
# Stage 9: Make priority classification vectors
# -----------------------------------------------------------
category2GroundTruthPriority = {} # category -> tweet vector with binary 1 vs all ground truth priority labels
for categoryId in informationTypes2Index.keys():
categoryIdShort = categoryId.split("-")[1]
priorityVector = []
for i in range(len(index2TweetId)):
tweetId = index2TweetId[i]
categories = tweetId2TRECInfoCategories.get(tweetId)
if any(categoryIdShort in s for s in categories):
priority = tweetId2TRECPriorityCategory.get(tweetId)
priorityVector.append(priority)
category2GroundTruthPriority[categoryId] = priorityVector
category2PredictedPriority = {} # category -> tweet vector with binary 1 vs all predicted by system labels
category2PredictedPriorityScore = {} # Category -> tweet vector with priority scores
for categoryId in informationTypes2Index.keys():
categoryIdShort = categoryId.split("-")[1]
categoryVector = []
categoryScoreVector = []
for i in range(len(index2TweetId)):
tweetId = index2TweetId[i]
categories = tweetId2TRECInfoCategories.get(tweetId)
if any(categoryIdShort in s for s in categories):
if tweetId2RunPriorityCategory.get(tweetId):
priority = tweetId2RunPriorityCategory.get(tweetId)
priorityScore = tweetId2RunPriorityScore.get(tweetId)
categoryVector.append(priority)
categoryScoreVector.append(priorityScore)
else:
categoryVector.append("Low") # default to low priority
categoryScoreVector.append(0.25)
category2PredictedPriority[categoryId] = categoryVector
category2PredictedPriorityScore[categoryId] = categoryScoreVector
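# Note: both priority vectors above are aligned to *ground truth* category
# membership (tweetId2TRECInfoCategories), so for each category we compare the
# run's priority labels/scores against assessor priorities over the same set
# of tweets, defaulting to "Low"/0.25 when the run omits a tweet.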
# --------------------------------------------------
# Disable Warnings (comment this out when debugging!)
# --------------------------------------------------
import warnings
# warnings.filterwarnings("ignore") # ignore warnings about 0-score categories
# --------------------------------------------------
# TREC-IS 2021A
# Priority-Centric Discounted Cumulative Gain
# --------------------------------------------------
import pandas as pd
def calc_dcg(scores, at_k=100):
position = 1
accumulator = 0.0
for score in scores[:at_k]:
numerator = 2 ** score - 1
denom = np.log2(position + 1)
accumulator += numerator / denom
position += 1
return accumulator
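# Worked example (illustration only, not part of the evaluation): the gain is
# exponential, DCG@k = sum_i (2**rel_i - 1) / log2(i + 1), so for relevance
# scores [4, 1]:
#   calc_dcg([4, 1]) == (2**4 - 1)/np.log2(2) + (2**1 - 1)/np.log2(3)  # ~= 15.63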
priority_map = {
"Unknown": 1,
"Low": 1,
"Medium": 2,
"High": 3,
"Critical": 4,
}
at_k = 100
tweetId2TRECPriorityCategory_score = {
k:priority_map[v] for k,v in tweetId2TRECPriorityCategory.items()
}
tweetId2TRECPriorityCategory_scores_sorted = sorted(
tweetId2TRECPriorityCategory_score.values(),
reverse=True
)
best_dcg_per_event = {}
for event, rel_tweets in event2tweetIds.items():
print(event)
tweetId2TRECPriorityCategory_scores_sorted = sorted(
[tweetId2TRECPriorityCategory_score[x] for x in rel_tweets],
reverse=True
)
ideal_dcg = calc_dcg(tweetId2TRECPriorityCategory_scores_sorted, at_k)
print("\tBest DCG:", ideal_dcg)
best_dcg_per_event[event] = ideal_dcg
print("Mean:", np.mean(list(best_dcg_per_event.values())))
print()
# Code below calculates the DCG for a system's
# ranked priority tweets. We have to do some
# sampling here to break ties among tweets with
# the same priority scores.
# Build a dataframe from the system's predicted
# priority categories (mapped to integer scores),
# so we can identify what the top-most priorities
# are and get a count of the tweets in each bin.
priority_df = pd.DataFrame(
[(k, priority_map[v]) for k, v in tweetId2RunPriorityCategory.items()],
columns=["tweet_id", "priority"]
)
# Build metrics for each event
system_dcg_per_event = {}
for event, rel_tweets in event2tweetIds.items():
print("Event:", event)
local_priority_df = priority_df[priority_df["tweet_id"].isin(set(rel_tweets))]
unique_scores = local_priority_df["priority"].value_counts()
# Find the top priority scores that would be included
# in the necessary at_k values.
total = 0
top_keys = []
candidates = {}
for top in sorted(unique_scores.index, reverse=True):
# We store this key, so we can go back and shuffle
#. tweets with this score.
top_keys.append(top)
local_restricted_df = local_priority_df[local_priority_df["priority"] == top]
candidates[top] = list(local_restricted_df["tweet_id"])
total += local_restricted_df.shape[0]
# Once we have enough samples, stop.
if ( total > at_k ):
break
# Now we generate a distribution over the DCG for this
# system, repeating the sampling a number of times to remove
# dependence on our selection of the top k tweets
random_dcgs = []
for i in range(100):
local_tweet_ids = []
for top in top_keys:
this_top_tweets = candidates[top][:]
np.random.shuffle(this_top_tweets)
needed = at_k - len(local_tweet_ids)
local_tweet_ids.extend(this_top_tweets[:needed])
local_scores = [tweetId2TRECPriorityCategory_score[x] for x in local_tweet_ids]
random_dcgs.append(calc_dcg(local_scores))
system_dcg = np.mean(random_dcgs)
system_ndcg_ = system_dcg / best_dcg_per_event[event]
print("\tnDCG:", system_ndcg_)
system_dcg_per_event[event] = system_ndcg_
print()
system_ndcg_micro = np.mean(list(system_dcg_per_event.values()))
print("System Event-Micro nDCG:", system_ndcg_micro)
resultsFile.write("--------------------------------------------------"+"\n")
resultsFile.write("EVALUATON: nDCG and Priority"+"\n")
resultsFile.write("Overall performance"+"\n")
resultsFile.write("--------------------------------------------------"+"\n")
resultsFile.write("> nDCG:"+"\t"+str(system_ndcg_micro)+"\n")
resultsFile.write(""+"\n")
2020_01_27_houston_explosion.2020 Best DCG: 176.99559032459564
2020_02_10_mideast_tornadoes.day1_mississipi.2020 Best DCG: 268.88459894996123
2020_02_10_mideast_tornadoes.day2_al.2020 Best DCG: 270.1716952398847
2020_02_10_mideast_tornadoes.day3_md.2019 Best DCG: 135.38775246204446
2020_05_06_tn_derecho.2020 Best DCG: 167.06354661312534
brooklynblockparty_shooting.2019 Best DCG: 179.1756130795261
2016_puttingal_temple Best DCG: 314.08006311421406
2017_12_04_thomas_wildfire.2017 Best DCG: 300.71399384300895
2017_12_07_lilac_wildfire.2017 Best DCG: 314.08006311421406
2018_07_23_klamathon_wildfire.2018 Best DCG: 221.46334445469358
2018_08_05_holy_wildfire.2018 Best DCG: 153.96993418707177
2018_11_07_Woolsey_wildfire.2018 Best DCG: 175.67469323453255
2018_maryland_flood Best DCG: 285.7119531591263
2018_pittsburgh_synagogue_shooting Best DCG: 111.85075929877581
2019_03_01_alberta_wildfire.2019.v2 Best DCG: 62.88708564345522
2019_08_25_hurricane_dorian.2019 Best DCG: 146.57069611996656
2019_10_10_saddleridge_wildfire.2019 Best DCG: 173.00802656786584
2019_10_25_kincade_wildfire.2019 Best DCG: 314.08006311421406
2019_durham_gas_explosion Best DCG: 201.07148118577902
2019_saugus_high_school_shooting Best DCG: 314.08006311421406
2019_townsville_flood Best DCG: 314.08006311421406
2020_easter_tornado_outbreak Best DCG: 214.9714167256293
2020_tornado_outbreak_of_april Best DCG: 314.08006311421406
2020_tornado_outbreak_of_march Best DCG: 267.51977363880474
2020_visakhapatnam_gas_leak Best DCG: 314.08006311421406
tornado_outbreak_of_november_30_december_2018 Best DCG: 314.08006311421406
Mean: 231.7589407554446

Event: 2020_01_27_houston_explosion.2020 nDCG: 0.29005257046674615
Event: 2020_02_10_mideast_tornadoes.day1_mississipi.2020 nDCG: 0.40453786026469896
Event: 2020_02_10_mideast_tornadoes.day2_al.2020 nDCG: 0.42454008868724413
Event: 2020_02_10_mideast_tornadoes.day3_md.2019 nDCG: 0.3503809173364061
Event: 2020_05_06_tn_derecho.2020 nDCG: 0.49573100995162817
Event: brooklynblockparty_shooting.2019 nDCG: 0.20840263200494905
Event: 2016_puttingal_temple nDCG: 0.2682399215341739
Event: 2017_12_04_thomas_wildfire.2017 nDCG: 0.36642091039871755
Event: 2017_12_07_lilac_wildfire.2017 nDCG: 0.3566907001037119
Event: 2018_07_23_klamathon_wildfire.2018 nDCG: 0.5092323707809754
Event: 2018_08_05_holy_wildfire.2018 nDCG: 0.4208574484409764
Event: 2018_11_07_Woolsey_wildfire.2018 nDCG: 0.37095026927289926
Event: 2018_maryland_flood nDCG: 0.3380890635337886
Event: 2018_pittsburgh_synagogue_shooting nDCG: 0.7673198416559154
Event: 2019_03_01_alberta_wildfire.2019.v2 nDCG: 0.3390670146526324
Event: 2019_08_25_hurricane_dorian.2019 nDCG: 0.3685825665419098
Event: 2019_10_10_saddleridge_wildfire.2019 nDCG: 0.4974540161288353
Event: 2019_10_25_kincade_wildfire.2019 nDCG: 0.5157052474829267
Event: 2019_durham_gas_explosion nDCG: 0.23998528736555355
Event: 2019_saugus_high_school_shooting nDCG: 0.2534407702492516
Event: 2019_townsville_flood nDCG: 0.6558293420957289
Event: 2020_easter_tornado_outbreak nDCG: 0.471146353192228
Event: 2020_tornado_outbreak_of_april nDCG: 0.5564689795069926
Event: 2020_tornado_outbreak_of_march nDCG: 0.26613186063574096
Event: 2020_visakhapatnam_gas_leak nDCG: 0.5668152174865976
Event: tornado_outbreak_of_november_30_december_2018 nDCG: 0.8575450922259352
System Event-Micro nDCG: 0.4292160519998909
# --------------------------------------------------
# TREC-IS 2021-A
# Information Type Categorization
# Overall performance
# --------------------------------------------------
# Average performance over information types
# Macro averaged (information types have equal weight)
# Does not average across events (larger events have more impact)
# Positive class is the target class
# Precision, recall and F1 only consider the positive class
# Accuracy is an overall metric
# We report performance for all categories, high importance categories and low importance categories
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
avgPrecision = 0.0
avgRecall = 0.0
avgF1 = 0.0
avgAccuracy = 0.0
avgPrecisionHigh = 0.0
avgRecallHigh = 0.0
avgF1High = 0.0
avgAccuracyHigh = 0.0
avgPrecisionLow = 0.0
avgRecallLow = 0.0
avgF1Low = 0.0
avgAccuracyLow = 0.0
for categoryId in informationTypes2Index.keys():
categoryPrecision = precision_score(category2GroundTruth[categoryId], category2Predicted[categoryId], average='binary')
categoryRecall = recall_score(category2GroundTruth[categoryId], category2Predicted[categoryId], average='binary')
categoryF1 = f1_score(category2GroundTruth[categoryId], category2Predicted[categoryId], average='binary')
categoryAccuracy = accuracy_score(category2GroundTruth[categoryId], category2Predicted[categoryId])
avgPrecision = avgPrecision + categoryPrecision
avgRecall = avgRecall + categoryRecall
avgF1 = avgF1 + categoryF1
avgAccuracy = avgAccuracy + categoryAccuracy
if any(categoryId in s for s in highImportCategories):
avgPrecisionHigh = avgPrecisionHigh + categoryPrecision
avgRecallHigh = avgRecallHigh + categoryRecall
avgF1High = avgF1High + categoryF1
avgAccuracyHigh = avgAccuracyHigh + categoryAccuracy
else:
avgPrecisionLow = avgPrecisionLow + categoryPrecision
avgRecallLow = avgRecallLow + categoryRecall
avgF1Low = avgF1Low + categoryF1
avgAccuracyLow = avgAccuracyLow + categoryAccuracy
numInformationTypes = len(informationTypes2Index)
numHighInformationTypes = len(highImportCategories)
numLowInformationTypes = numInformationTypes - numHighInformationTypes
print("Information Type Precision (positive class, multi-type, macro): "+str(avgPrecision/numInformationTypes))
print("Information Type Recall (positive class, multi-type, macro): "+str(avgRecall/numInformationTypes))
print("Information Type F1 (positive class, multi-type, macro): "+str(avgF1/numInformationTypes))
print("Information Type Accuracy (overall, multi-type, macro): "+str(avgAccuracy/numInformationTypes))
print("High Importance Information Type Precision (positive class, multi-type, macro): "+str(avgPrecisionHigh/numHighInformationTypes))
print("High Importance Information Type Recall (positive class, multi-type, macro): "+str(avgRecallHigh/numHighInformationTypes))
print("High Importance Information Type F1 (positive class, multi-type, macro): "+str(avgF1High/numHighInformationTypes))
print("High Importance Information Type Accuracy (overall, multi-type, macro): "+str(avgAccuracyHigh/numHighInformationTypes))
print("Low Importance Information Type Precision (positive class, multi-type, macro): "+str(avgPrecisionLow/numLowInformationTypes))
print("Low Importance Information Type Recall (positive class, multi-type, macro): "+str(avgRecallLow/numLowInformationTypes))
print("Low Importance Information Type F1 (positive class, multi-type, macro): "+str(avgF1Low/numLowInformationTypes))
print("Low Importance Information Type Accuracy (overall, multi-type, macro): "+str(avgAccuracyLow/numLowInformationTypes))
resultsFile.write("--------------------------------------------------"+"\n")
resultsFile.write("EVALUATON: Information Type Categorization"+"\n")
resultsFile.write("Overall performance"+"\n")
resultsFile.write("--------------------------------------------------"+"\n")
resultsFile.write("> Information Type Precision (positive class, multi-type, macro):"+"\t"+str(avgPrecision/len(informationTypes2Index))+"\n")
resultsFile.write("> Information Type Recall (positive class, multi-type, macro):"+"\t"+str(avgRecall/len(informationTypes2Index))+"\n")
resultsFile.write("> Information Type F1 (positive class, multi-type, macro):"+"\t"+str(avgF1/len(informationTypes2Index))+"\n")
resultsFile.write("> Information Type Accuracy (overall, multi-type, macro):"+"\t"+str(avgAccuracy/len(informationTypes2Index))+"\n")
resultsFile.write("> High Importance Information Type Precision (positive class, multi-type, macro):"+"\t"+str(avgPrecisionHigh/numHighInformationTypes)+"\n")
resultsFile.write("> High Importance Information Type Recall (positive class, multi-type, macro):"+"\t"+str(avgRecallHigh/numHighInformationTypes)+"\n")
resultsFile.write("> High Importance Information Type F1 (positive class, multi-type, macro):"+"\t"+str(avgF1High/numHighInformationTypes)+"\n")
resultsFile.write("> High Importance Information Type Accuracy (overall, multi-type, macro):"+"\t"+str(avgAccuracyHigh/numHighInformationTypes)+"\n")
resultsFile.write("> Low Importance Information Type Precision (positive class, multi-type, macro):"+"\t"+str(avgPrecisionLow/numLowInformationTypes)+"\n")
resultsFile.write("> Low Importance Information Type Recall (positive class, multi-type, macro):"+"\t"+str(avgRecallLow/numLowInformationTypes)+"\n")
resultsFile.write("> Low Importance Information Type F1 (positive class, multi-type, macro):"+"\t"+str(avgF1Low/numLowInformationTypes)+"\n")
resultsFile.write("> Low Importance Information Type Accuracy (overall, multi-type, macro):"+"\t"+str(avgAccuracyLow/numLowInformationTypes)+"\n")
resultsFile.write(""+"\n")
Information Type Precision (positive class, multi-type, macro): 0.25491126850073526
Information Type Recall (positive class, multi-type, macro): 0.347624807728532
Information Type F1 (positive class, multi-type, macro): 0.27354681671717984
Information Type Accuracy (overall, multi-type, macro): 0.8891585337463961
High Importance Information Type Precision (positive class, multi-type, macro): 0.22367040281298378
High Importance Information Type Recall (positive class, multi-type, macro): 0.33538138096280545
High Importance Information Type F1 (positive class, multi-type, macro): 0.2530826420118999
High Importance Information Type Accuracy (overall, multi-type, macro): 0.9577147837090892
Low Importance Information Type Precision (positive class, multi-type, macro): 0.26477680503370943
Low Importance Information Type Recall (positive class, multi-type, macro): 0.35149115302297196
Low Importance Information Type F1 (positive class, multi-type, macro): 0.28000918767674204
Low Importance Information Type Accuracy (overall, multi-type, macro): 0.8675091916529143
# --------------------------------------------------
# TREC-IS 2021-A
# Information Type Categorization
# Per Information Type Performance
# --------------------------------------------------
# Per Category Classification Performance with confusion matrices
# Performance on the target class is what we care about here,
# primarily with respect to recall, as we want the user to
# see all of the information for a given category. A small
# amount of noise being added to the feed is an acceptable
# cost for good recall.
#
# Does not average across events (larger events have more impact)
from sklearn.metrics import classification_report
perTopicFile.write("--------------------------------------------------"+"\n")
perTopicFile.write("EVALUATON: Information Type Categorization (Multi-type)"+"\n")
perTopicFile.write("Per Information Type Performance"+"\n")
perTopicFile.write("--------------------------------------------------"+"\n")
for categoryId in informationTypes2Index.keys():
target_names = ['Other Classes', categoryId]
try:
print(categoryId)
print(classification_report(category2GroundTruth[categoryId], category2Predicted[categoryId], target_names=target_names))
perTopicFile.write(categoryId+"\n")
perTopicFile.write(classification_report(category2GroundTruth[categoryId], category2Predicted[categoryId], target_names=target_names)+"\n")
perTopicFile.write(""+"\n")
except ValueError:
print("Category "+categoryId+" score calculation failed, likely due the category not being used by the run")
perTopicFile.write(""+"\n")
CallToAction-Donations precision recall f1-score support Other Classes 1.00 0.99 0.99 55275 CallToAction-Donations 0.28 0.52 0.36 568 accuracy 0.98 55843 macro avg 0.64 0.76 0.68 55843 weighted avg 0.99 0.98 0.98 55843 CallToAction-MovePeople precision recall f1-score support Other Classes 0.99 0.98 0.98 54646 CallToAction-MovePeople 0.34 0.44 0.38 1197 accuracy 0.97 55843 macro avg 0.66 0.71 0.68 55843 weighted avg 0.97 0.97 0.97 55843 CallToAction-Volunteer precision recall f1-score support Other Classes 1.00 0.99 0.99 55543 CallToAction-Volunteer 0.17 0.29 0.22 300 accuracy 0.99 55843 macro avg 0.59 0.64 0.61 55843 weighted avg 0.99 0.99 0.99 55843 Other-Advice precision recall f1-score support Other Classes 0.96 0.96 0.96 52602 Other-Advice 0.37 0.34 0.35 3241 accuracy 0.93 55843 macro avg 0.66 0.65 0.66 55843 weighted avg 0.92 0.93 0.93 55843 Other-ContextualInformation precision recall f1-score support Other Classes 0.97 0.95 0.96 54346 Other-ContextualInformation 0.05 0.08 0.06 1497 accuracy 0.93 55843 macro avg 0.51 0.52 0.51 55843 weighted avg 0.95 0.93 0.94 55843 Other-Discussion precision recall f1-score support Other Classes 0.99 0.96 0.98 55263 Other-Discussion 0.03 0.12 0.05 580 accuracy 0.95 55843 macro avg 0.51 0.54 0.51 55843 weighted avg 0.98 0.95 0.97 55843 Other-Irrelevant precision recall f1-score support Other Classes 0.53 0.87 0.66 23267 Other-Irrelevant 0.83 0.46 0.59 32576 accuracy 0.63 55843 macro avg 0.68 0.66 0.63 55843 weighted avg 0.71 0.63 0.62 55843 Other-Sentiment precision recall f1-score support Other Classes 0.94 0.95 0.94 51270 Other-Sentiment 0.32 0.26 0.29 4573 accuracy 0.89 55843 macro avg 0.63 0.61 0.61 55843 weighted avg 0.88 0.89 0.89 55843 Report-CleanUp precision recall f1-score support Other Classes 1.00 0.98 0.99 55581 Report-CleanUp 0.07 0.35 0.12 262 accuracy 0.98 55843 macro avg 0.54 0.67 0.56 55843 weighted avg 0.99 0.98 0.98 55843 Report-EmergingThreats precision recall f1-score support Other Classes 0.96 0.92 0.94 52454 Report-EmergingThreats 0.23 0.40 0.30 3389 accuracy 0.88 55843 macro avg 0.60 0.66 0.62 55843 weighted avg 0.92 0.88 0.90 55843 Report-Factoid precision recall f1-score support Other Classes 0.93 0.95 0.94 49844 Report-Factoid 0.51 0.43 0.47 5999 accuracy 0.90 55843 macro avg 0.72 0.69 0.70 55843 weighted avg 0.89 0.90 0.89 55843 Report-FirstPartyObservation precision recall f1-score support Other Classes 0.97 0.91 0.94 54135 Report-FirstPartyObservation 0.08 0.23 0.11 1708 accuracy 0.89 55843 macro avg 0.52 0.57 0.53 55843 weighted avg 0.95 0.89 0.92 55843 Report-Hashtags precision recall f1-score support Other Classes 0.88 0.89 0.89 48407 Report-Hashtags 0.25 0.24 0.25 7436 accuracy 0.80 55843 macro avg 0.57 0.57 0.57 55843 weighted avg 0.80 0.80 0.80 55843 Report-Location precision recall f1-score support Other Classes 0.84 0.71 0.77 41325 Report-Location 0.42 0.61 0.50 14518 accuracy 0.68 55843 macro avg 0.63 0.66 0.63 55843 weighted avg 0.73 0.68 0.70 55843 Report-MultimediaShare precision recall f1-score support Other Classes 0.92 0.71 0.80 48784 Report-MultimediaShare 0.23 0.60 0.33 7059 accuracy 0.70 55843 macro avg 0.58 0.65 0.57 55843 weighted avg 0.84 0.70 0.74 55843 Report-News precision recall f1-score support Other Classes 0.94 0.80 0.86 50324 Report-News 0.22 0.53 0.31 5519 accuracy 0.77 55843 macro avg 0.58 0.66 0.59 55843 weighted avg 0.87 0.77 0.81 55843 Report-NewSubEvent precision recall f1-score support Other Classes 0.98 0.98 0.98 54728 Report-NewSubEvent 0.06 0.08 0.07 1115 accuracy 0.96 55843 macro 
avg 0.52 0.53 0.53 55843 weighted avg 0.96 0.96 0.96 55843 Report-Official precision recall f1-score support Other Classes 0.96 0.97 0.96 53203 Report-Official 0.20 0.16 0.18 2640 accuracy 0.93 55843 macro avg 0.58 0.56 0.57 55843 weighted avg 0.92 0.93 0.93 55843 Report-OriginalEvent precision recall f1-score support Other Classes 0.95 0.98 0.96 52838 Report-OriginalEvent 0.13 0.06 0.08 3005 accuracy 0.93 55843 macro avg 0.54 0.52 0.52 55843 weighted avg 0.90 0.93 0.91 55843 Report-ServiceAvailable precision recall f1-score support Other Classes 0.98 0.98 0.98 53834 Report-ServiceAvailable 0.43 0.36 0.39 2009 accuracy 0.96 55843 macro avg 0.70 0.67 0.69 55843 weighted avg 0.96 0.96 0.96 55843 Report-ThirdPartyObservation precision recall f1-score support Other Classes 0.92 0.77 0.84 50379 Report-ThirdPartyObservation 0.16 0.40 0.23 5464 accuracy 0.74 55843 macro avg 0.54 0.59 0.54 55843 weighted avg 0.85 0.74 0.78 55843 Report-Weather precision recall f1-score support Other Classes 0.97 0.90 0.93 50824 Report-Weather 0.41 0.69 0.51 5019 accuracy 0.88 55843 macro avg 0.69 0.79 0.72 55843 weighted avg 0.92 0.88 0.90 55843 Request-GoodsServices precision recall f1-score support Other Classes 1.00 0.99 0.99 55452 Request-GoodsServices 0.20 0.35 0.26 391 accuracy 0.99 55843 macro avg 0.60 0.67 0.62 55843 weighted avg 0.99 0.99 0.99 55843 Request-InformationWanted precision recall f1-score support Other Classes 0.99 0.99 0.99 55241 Request-InformationWanted 0.29 0.32 0.31 602 accuracy 0.98 55843 macro avg 0.64 0.66 0.65 55843 weighted avg 0.99 0.98 0.98 55843 Request-SearchAndRescue precision recall f1-score support Other Classes 1.00 0.99 0.99 55737 Request-SearchAndRescue 0.07 0.38 0.12 106 accuracy 0.99 55843 macro avg 0.53 0.68 0.56 55843 weighted avg 1.00 0.99 0.99 55843
# --------------------------------------------------
# TREC-IS 2021-A
# Information Type Categorization
# Per Information Type F1 Graph
# --------------------------------------------------
# Per Category Classification Performance
# F1 scores for each information type, graphed
# Does not average across events (larger events have more impact)
N = len(informationTypes2Index)
ind = np.arange(N)
scoresPerCategoryF1 = []
categoryLabels = []
for categoryId in informationTypes2Index.keys():
localF1Score = f1_score(category2GroundTruth[categoryId], category2Predicted[categoryId], average='binary')
print(categoryId, localF1Score)
scoresPerCategoryF1.append(localF1Score)
categoryLabels.append(categoryId)
width = 0.90 # the width of the bars: can also be len(x) sequence
p1 = plt.bar(ind, scoresPerCategoryF1, width)
plt.ylabel('F1 Scores')
plt.title('F1 Scores by Information Type')
plt.xticks(ind, categoryLabels, rotation='vertical')
plt.yticks(np.arange(0, 1, 0.1))
plt.show()
CallToAction-Donations 0.36341463414634145
CallToAction-MovePeople 0.383941605839416
CallToAction-Volunteer 0.21689785624211852
Other-Advice 0.35181031720602374
Other-ContextualInformation 0.05889394302130717
Other-Discussion 0.04876273653566229
Other-Irrelevant 0.5884174175366172
Other-Sentiment 0.2873219202681671
Report-CleanUp 0.12332439678284184
Report-EmergingThreats 0.29506765604539503
Report-Factoid 0.4660583941605839
Report-FirstPartyObservation 0.11324599708879186
Report-Hashtags 0.24627986348122868
Report-Location 0.5003377617653683
Report-MultimediaShare 0.33192597561456816
Report-News 0.3149162283793309
Report-NewSubEvent 0.07335154116269996
Report-Official 0.1780821917808219
Report-OriginalEvent 0.08119755046495804
Report-ServiceAvailable 0.3933982683982684
Report-ThirdPartyObservation 0.23181722909243346
Report-Weather 0.5117175912192228
Request-GoodsServices 0.25679475164011245
Request-InformationWanted 0.30575256107171006
Request-SearchAndRescue 0.11594202898550725
# --------------------------------------------------
# TREC-IS 2021-A
# Information Type Categorization
# Per Event Performance
# --------------------------------------------------
# Categorization performance for each event
# Precision, recall and F1 only consider the positive class
# Accuracy is an overall metric
# We report performance for all categories, high importance categories and low importance categories
# Macro average (categories have equal weight)
perEventFile.write("--------------------------------------------------"+"\n")
perEventFile.write("EVALUATON: Information Type Categorization (Multi-type)"+"\n")
perEventFile.write("Per Event Performance"+"\n")
perEventFile.write("--------------------------------------------------"+"\n")
for eventId in eventIdentifiers:
tavgPrecision = 0.0
tavgRecall = 0.0
tavgF1 = 0.0
tavgAccuracy = 0.0
categoryCount = 0
for categoryId in informationTypes2Index.keys():
if sum(event2groundtruth[eventId].get(categoryId)) == 0:
continue
categoryPrecision = precision_score(event2groundtruth[eventId].get(categoryId), event2prediction[eventId].get(categoryId), average='binary')
categoryRecall = recall_score(event2groundtruth[eventId].get(categoryId), event2prediction[eventId].get(categoryId), average='binary')
categoryF1 = f1_score(event2groundtruth[eventId].get(categoryId), event2prediction[eventId].get(categoryId), average='binary')
categoryAccuracy = accuracy_score(event2groundtruth[eventId].get(categoryId), event2prediction[eventId].get(categoryId))
tavgPrecision = tavgPrecision + categoryPrecision
tavgRecall = tavgRecall + categoryRecall
tavgF1 = tavgF1 + categoryF1
tavgAccuracy = tavgAccuracy + categoryAccuracy
categoryCount += 1
if categoryCount == 0:
print("No categories for event:", eventId)
continue
print(eventId)
print(" Information Type Precision (positive class, multi-type, macro): "+str(tavgPrecision/categoryCount))
print(" Information Type Recall (positive class, multi-type, macro): "+str(tavgRecall/categoryCount))
print(" Information Type F1 (positive class, multi-type, macro): "+str(tavgF1/categoryCount))
print(" Information Type Accuracy (overall, multi-type, macro): "+str(tavgAccuracy/categoryCount))
print("")
perEventFile.write(eventId+"\n")
perEventFile.write(" Information Type Precision (positive class, multi-type, macro): "+str(tavgPrecision/len(informationTypes2Index))+"\n")
perEventFile.write(" Information Type Recall (positive class, multi-type, macro): "+str(tavgRecall/len(informationTypes2Index))+"\n")
perEventFile.write(" Information Type F1 (positive class, multi-type, macro): "+str(tavgF1/len(informationTypes2Index))+"\n")
perEventFile.write(" Information Type Accuracy (overall, multi-type, macro): "+str(tavgAccuracy/len(informationTypes2Index))+"\n")
perEventFile.write("\n")
perEventFile.write("\n")
2020_01_27_houston_explosion.2020 Information Type Precision (positive class, multi-type, macro): 0.18513888294742592 Information Type Recall (positive class, multi-type, macro): 0.3834550621558499 Information Type F1 (positive class, multi-type, macro): 0.19931420353213505 Information Type Accuracy (overall, multi-type, macro): 0.8797542194913276 2020_02_10_mideast_tornadoes.day1_mississipi.2020 Information Type Precision (positive class, multi-type, macro): 0.5411891483671135 Information Type Recall (positive class, multi-type, macro): 0.5774457082329235 Information Type F1 (positive class, multi-type, macro): 0.5097415643325799 Information Type Accuracy (overall, multi-type, macro): 0.8614906832298137
/Users/cbuntain/Development/anaconda3/lib/python3.8/site-packages/sklearn/metrics/_classification.py:1245: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 due to no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result))
2020_02_10_mideast_tornadoes.day2_al.2020 Information Type Precision (positive class, multi-type, macro): 0.23500787730378483 Information Type Recall (positive class, multi-type, macro): 0.42700241683595475 Information Type F1 (positive class, multi-type, macro): 0.26449656019410483 Information Type Accuracy (overall, multi-type, macro): 0.899394856278366 2020_02_10_mideast_tornadoes.day3_md.2019 Information Type Precision (positive class, multi-type, macro): 0.13857930395895096 Information Type Recall (positive class, multi-type, macro): 0.37027114148087337 Information Type F1 (positive class, multi-type, macro): 0.15356194405355675 Information Type Accuracy (overall, multi-type, macro): 0.8473863636363638 2020_05_06_tn_derecho.2020 Information Type Precision (positive class, multi-type, macro): 0.26468811971677747 Information Type Recall (positive class, multi-type, macro): 0.343992874849518 Information Type F1 (positive class, multi-type, macro): 0.24438887950056198 Information Type Accuracy (overall, multi-type, macro): 0.879664478482859 brooklynblockparty_shooting.2019 Information Type Precision (positive class, multi-type, macro): 0.17342430073647205 Information Type Recall (positive class, multi-type, macro): 0.3997746845072779 Information Type F1 (positive class, multi-type, macro): 0.17229523123735777 Information Type Accuracy (overall, multi-type, macro): 0.8964961819917276
/Users/cbuntain/Development/anaconda3/lib/python3.8/site-packages/sklearn/metrics/_classification.py:1245: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 due to no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result))
2016_puttingal_temple Information Type Precision (positive class, multi-type, macro): 0.17244351073887385 Information Type Recall (positive class, multi-type, macro): 0.22757966811037336 Information Type F1 (positive class, multi-type, macro): 0.15502143766466292 Information Type Accuracy (overall, multi-type, macro): 0.8835538752362949 2017_12_04_thomas_wildfire.2017 Information Type Precision (positive class, multi-type, macro): 0.28254461879522913 Information Type Recall (positive class, multi-type, macro): 0.33401146563936995 Information Type F1 (positive class, multi-type, macro): 0.28920365454132496 Information Type Accuracy (overall, multi-type, macro): 0.8564027823782709 2017_12_07_lilac_wildfire.2017 Information Type Precision (positive class, multi-type, macro): 0.3131131269469757 Information Type Recall (positive class, multi-type, macro): 0.3287368309542573 Information Type F1 (positive class, multi-type, macro): 0.286313844120366 Information Type Accuracy (overall, multi-type, macro): 0.8720448662640208 2018_07_23_klamathon_wildfire.2018 Information Type Precision (positive class, multi-type, macro): 0.3600336728610447 Information Type Recall (positive class, multi-type, macro): 0.31569693756689426 Information Type F1 (positive class, multi-type, macro): 0.306401760650656 Information Type Accuracy (overall, multi-type, macro): 0.873987200966794 2018_08_05_holy_wildfire.2018 Information Type Precision (positive class, multi-type, macro): 0.15897693745112373 Information Type Recall (positive class, multi-type, macro): 0.3648911354573208 Information Type F1 (positive class, multi-type, macro): 0.1788225097560688 Information Type Accuracy (overall, multi-type, macro): 0.9237854486285448 2018_11_07_Woolsey_wildfire.2018 Information Type Precision (positive class, multi-type, macro): 0.1780649522133095 Information Type Recall (positive class, multi-type, macro): 0.24392956167847596 Information Type F1 (positive class, multi-type, macro): 0.1765581142247618 Information Type Accuracy (overall, multi-type, macro): 0.8848315575324988 2018_maryland_flood Information Type Precision (positive class, multi-type, macro): 0.2883744234690681 Information Type Recall (positive class, multi-type, macro): 0.38749693699341026 Information Type F1 (positive class, multi-type, macro): 0.2833169307628438 Information Type Accuracy (overall, multi-type, macro): 0.8661281956280104 2018_pittsburgh_synagogue_shooting Information Type Precision (positive class, multi-type, macro): 0.38843765310483575 Information Type Recall (positive class, multi-type, macro): 0.45263807490194297 Information Type F1 (positive class, multi-type, macro): 0.40133213918099037 Information Type Accuracy (overall, multi-type, macro): 0.7564102564102564 2019_03_01_alberta_wildfire.2019.v2 Information Type Precision (positive class, multi-type, macro): 0.09781124365997199 Information Type Recall (positive class, multi-type, macro): 0.276921967402279 Information Type F1 (positive class, multi-type, macro): 0.06609112097242385 Information Type Accuracy (overall, multi-type, macro): 0.8566674496907055
/Users/cbuntain/Development/anaconda3/lib/python3.8/site-packages/sklearn/metrics/_classification.py:1245: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 due to no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result))
2019_08_25_hurricane_dorian.2019
Information Type Precision (positive class, multi-type, macro): 0.2520869113052685
Information Type Recall (positive class, multi-type, macro): 0.1959163024090545
Information Type F1 (positive class, multi-type, macro): 0.17250475632043816
Information Type Accuracy (overall, multi-type, macro): 0.8666777133388566
2019_10_10_saddleridge_wildfire.2019
Information Type Precision (positive class, multi-type, macro): 0.2753040341066915
Information Type Recall (positive class, multi-type, macro): 0.2961506343398142
Information Type F1 (positive class, multi-type, macro): 0.2607276112771609
Information Type Accuracy (overall, multi-type, macro): 0.9108799916588467
2019_10_25_kincade_wildfire.2019
Information Type Precision (positive class, multi-type, macro): 0.29676100816290196
Information Type Recall (positive class, multi-type, macro): 0.38255829517173007
Information Type F1 (positive class, multi-type, macro): 0.30639589796501016
Information Type Accuracy (overall, multi-type, macro): 0.897633818281636
2019_durham_gas_explosion
Information Type Precision (positive class, multi-type, macro): 0.27751939574997614
Information Type Recall (positive class, multi-type, macro): 0.38525628115622335
Information Type F1 (positive class, multi-type, macro): 0.28478870025721814
Information Type Accuracy (overall, multi-type, macro): 0.8784679217240526
2019_saugus_high_school_shooting
Information Type Precision (positive class, multi-type, macro): 0.21055095848140706
Information Type Recall (positive class, multi-type, macro): 0.2766370106973435
Information Type F1 (positive class, multi-type, macro): 0.1978716834045378
Information Type Accuracy (overall, multi-type, macro): 0.8917161495358369
2019_townsville_flood
Information Type Precision (positive class, multi-type, macro): 0.3023158845393163
Information Type Recall (positive class, multi-type, macro): 0.29692046315656984
Information Type F1 (positive class, multi-type, macro): 0.2612763835370968
Information Type Accuracy (overall, multi-type, macro): 0.8797292976659917
2020_easter_tornado_outbreak
Information Type Precision (positive class, multi-type, macro): 0.16518400648364584
Information Type Recall (positive class, multi-type, macro): 0.4460603691434141
Information Type F1 (positive class, multi-type, macro): 0.20068320899483696
Information Type Accuracy (overall, multi-type, macro): 0.8703873832491196
2020_tornado_outbreak_of_april
Information Type Precision (positive class, multi-type, macro): 0.2946712029169531
Information Type Recall (positive class, multi-type, macro): 0.35878945780267224
Information Type F1 (positive class, multi-type, macro): 0.28663309458657116
Information Type Accuracy (overall, multi-type, macro): 0.8807600116924877
2020_tornado_outbreak_of_march
Information Type Precision (positive class, multi-type, macro): 0.19749685572088235
Information Type Recall (positive class, multi-type, macro): 0.4233489310221285
Information Type F1 (positive class, multi-type, macro): 0.19669576501466618
Information Type Accuracy (overall, multi-type, macro): 0.8338539736326575
2020_visakhapatnam_gas_leak
Information Type Precision (positive class, multi-type, macro): 0.3010416851206346
Information Type Recall (positive class, multi-type, macro): 0.19493864273812334
Information Type F1 (positive class, multi-type, macro): 0.17831735542184549
Information Type Accuracy (overall, multi-type, macro): 0.8415004222081489
tornado_outbreak_of_november_30_december_2018
Information Type Precision (positive class, multi-type, macro): 0.23841657784024803
Information Type Recall (positive class, multi-type, macro): 0.41982038341541056
Information Type F1 (positive class, multi-type, macro): 0.24122009788728138
Information Type Accuracy (overall, multi-type, macro): 0.8947923255323635
# --------------------------------------------------
# TREC-IS 2021-A
# Information Type Categorization
# Per Event F1 Graph
# --------------------------------------------------
# Multi-type (1 vs All): Tweets have multiple information types, aim: predict all of them
# Macro average (categories have equal weight)
N = len(eventIdentifiers)
ind = np.arange(N)
scoresPerEventF1 = []
for eventId in eventIdentifiers:
    avgF1_ = 0.0
    for categoryId in informationTypes2Index.keys():
        avgF1_ = avgF1_ + f1_score(event2groundtruth[eventId].get(categoryId), event2prediction[eventId].get(categoryId), average='binary')
    scoresPerEventF1.append(avgF1_/len(informationTypes2Index))
width = 0.90 # the width of the bars: can also be len(x) sequence
p1 = plt.bar(ind, scoresPerEventF1, width)
plt.ylabel('F1 Scores')
plt.title('F1 Category Scores by Event')
plt.xticks(ind, eventIdentifiers, rotation='vertical')
plt.yticks(np.arange(0, 1, 0.1))
plt.show()
/Users/cbuntain/Development/anaconda3/lib/python3.8/site-packages/sklearn/metrics/_classification.py:1492: UndefinedMetricWarning: F-score is ill-defined and being set to 0.0 due to no true nor predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(
[same warning repeated multiple times]
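The warnings above come from (event, category) pairs with no positive ground-truth and no positive predicted tweets; sklearn defines the F-score as 0.0 there and warns. A minimal sketch, assuming the same event2groundtruth / event2prediction structures used in the cell above, of computing the same per-event macro F1 while scoring those degenerate pairs explicitly via the zero_division parameter (per_event_macro_f1 is a hypothetical helper, not part of the original script):
# Sketch only: assumes event2groundtruth / event2prediction map
# eventId -> categoryId -> binary label list, as used in the cell above.
from sklearn.metrics import f1_score

def per_event_macro_f1(event_id, zero_division=0):
    # Macro-average the per-category binary F1 for one event; pairs with
    # no true and no predicted positives score zero_division instead of warning.
    scores = [
        f1_score(
            event2groundtruth[event_id].get(categoryId),
            event2prediction[event_id].get(categoryId),
            average='binary',
            zero_division=zero_division,
        )
        for categoryId in informationTypes2Index.keys()
    ]
    return sum(scores) / len(scores)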
# --------------------------------------------------
# TREC-IS 2021-A
# Information Priority Level
# Overall Performance
# --------------------------------------------------
# How divergent is the system from the human priority labels?
# F1 performance over information types, higher is better
# Macro average (categories have equal weight)
from sklearn.metrics import mean_squared_error
priorityAvgf1 = 0.0
priorityAvgf1High = 0.0
priorityAvgf1Low = 0.0
for categoryId in informationTypes2Index.keys():
    groundTruthPriorities = category2GroundTruthPriority[categoryId]
    predictedPriorities = category2PredictedPriority[categoryId]
    f1 = f1_score(groundTruthPriorities, predictedPriorities, average='macro')
    priorityAvgf1 = priorityAvgf1 + f1
    if any(categoryId in s for s in highImportCategories):
        priorityAvgf1High = priorityAvgf1High + f1
    else:
        priorityAvgf1Low = priorityAvgf1Low + f1
print("Priority Label Prediction (F1, macro): "+str(priorityAvgf1/len(informationTypes2Index)))
resultsFile.write("--------------------------------------------------"+"\n")
resultsFile.write("EVALUATON: Information Priority Level"+"\n")
resultsFile.write("Overall Performance"+"\n")
resultsFile.write("--------------------------------------------------"+"\n")
resultsFile.write("> Priority Label Prediction (F1, macro): "+str(priorityAvgf1/len(informationTypes2Index))+"\n")
resultsFile.write("\n")
Priority Label Prediction (F1, macro): 0.15305280823614692
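For reference, the macro average above weights each priority label equally, regardless of how often it occurs. A self-contained toy illustration (labels are invented for this example only):
from sklearn.metrics import f1_score

gold = ["Low", "Low", "Medium", "High", "Critical", "Low"]
pred = ["Low", "Medium", "Medium", "High", "Low", "Critical"]

# Per-class F1 over {Critical, High, Low, Medium} is (0.0, 1.0, 0.4, 0.667),
# so the macro average is their unweighted mean, ~0.517.
print(f1_score(gold, pred, average='macro'))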
# --------------------------------------------------
# TREC-IS 2021-A
# Information Priority Level
# Overall Performance
# --------------------------------------------------
# How divergent is the system from the human priority labels?
# Use Pearson correlation here to capture parallel increases
priorityAvgCorr = 0.0
priorityAvgCorrHigh = 0.0
priorityAvgCorrLow = 0.0
for categoryId in informationTypes2Index.keys():
    if categoryId == "Other-Irrelevant":
        continue
    groundTruthPriorities = [priorityScoreMap[x] for x in category2GroundTruthPriority[categoryId]]
    predictedPriorities = category2PredictedPriorityScore[categoryId]
    # Pathological case when no variation exists in the predictions needs to be handled:
    # a constant prediction vector makes the correlation undefined, so score it as 0
    this_corr = 0.0
    if np.std(predictedPriorities) > 0.0:
        this_corr = np.corrcoef(groundTruthPriorities, predictedPriorities)[0,1]
    priorityAvgCorr = priorityAvgCorr + this_corr
    if any(categoryId in s for s in highImportCategories):
        priorityAvgCorrHigh = priorityAvgCorrHigh + this_corr
    else:
        priorityAvgCorrLow = priorityAvgCorrLow + this_corr
print("Priority Score Prediction (Pearson): "+str(priorityAvgCorr/(len(informationTypes2Index)-1)))
print("Priority Score Prediction, High (Pearson): "+str(priorityAvgCorrHigh/numHighInformationTypes))
print("Priority Score Prediction, Low (Pearson): "+str(priorityAvgCorrLow/(numLowInformationTypes-1)))
resultsFile.write("--------------------------------------------------"+"\n")
resultsFile.write("EVALUATON: Information Priority Score"+"\n")
resultsFile.write("Correlational Performance"+"\n")
resultsFile.write("--------------------------------------------------"+"\n")
resultsFile.write("> Priority Correlation (Pearson): "+str(priorityAvgCorr/(len(informationTypes2Index)-1))+"\n")
resultsFile.write("> Priority Correlation, High (Pearson): "+str(priorityAvgCorrHigh/numHighInformationTypes)+"\n")
resultsFile.write("> Priority Correlation, Low (Pearson): "+str(priorityAvgCorrLow/(numLowInformationTypes-1))+"\n")
resultsFile.write("\n")
Priority Score Prediction (Pearson): 0.23525391362081105
Priority Score Prediction, High (Pearson): 0.1552800278773464
Priority Score Prediction, Low (Pearson): 0.2619118755352992
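np.corrcoef is undefined (NaN) when either score vector is constant, which is why the loop above skips such categories and counts them as 0. A small self-contained sketch of the same guard as a helper (safe_pearson is hypothetical, not part of the original script):
import numpy as np

def safe_pearson(gold_scores, predicted_scores):
    # Pearson r between two score lists; returns 0.0 when either side has
    # no variance, where np.corrcoef would otherwise produce NaN.
    gold = np.asarray(gold_scores, dtype=float)
    pred = np.asarray(predicted_scores, dtype=float)
    if gold.std() == 0.0 or pred.std() == 0.0:
        return 0.0
    return float(np.corrcoef(gold, pred)[0, 1])

print(safe_pearson([1.0, 0.75, 0.25], [0.5, 0.5, 0.5]))  # 0.0, constant predictions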
# --------------------------------------------------
# TREC-IS 2021-A
# Information Priority Level
# Per Information Type Performance
# --------------------------------------------------
# F1 per information type (macro averaged), higher is better
# Macro average (categories have equal weight)
N = len(informationTypes2Index)
ind = np.arange(N)
priorityCatF1Values = []
categoryLabels = []
for categoryId in informationTypes2Index.keys():
    groundTruthPriorities = category2GroundTruthPriority[categoryId]
    predictedPriorities = category2PredictedPriority[categoryId]
    priorityCatF1 = f1_score(groundTruthPriorities, predictedPriorities, average='macro')
    if math.isnan(priorityCatF1):
        priorityCatF1 = 0.0
    categoryLabels.append(categoryId)
    priorityCatF1Values.append(priorityCatF1)
width = 0.90 # the width of the bars: can also be len(x) sequence
p1 = plt.bar(ind, priorityCatF1Values, width)
plt.ylabel('Priority Label Prediction F1 (higher is better)')
plt.title('Priority Label Prediction F1 Per Information Type')
plt.xticks(ind, categoryLabels, rotation='vertical')
plt.yticks(np.arange(0, 1, 0.1))
plt.show()
resultLine = None
# Print the evaluation table row in latex
print("Run & NDCG & CF1-H & CF1-A & CAcc & PErr-H & PErr-A & PCorr-H & PCorr-A \\\\")
resultLine = (str.format('{0:.4f}', system_ndcg_micro)+
" & "+
str.format('{0:.4f}',avgF1High/numHighInformationTypes)+
" & "+
str.format('{0:.4f}',avgF1/numInformationTypes)+
" & "+
str.format('{0:.4f}',avgAccuracy/numInformationTypes)+
" & "+
str.format('{0:.4f}',priorityAvgf1High/numHighInformationTypes)+
" & "+
str.format('{0:.4f}',priorityAvgf1/len(informationTypes2Index))+
" & "+
str.format('{0:.4f}',priorityAvgCorrHigh/numHighInformationTypes)+
" & "+
# Note: this divides by all information types, whereas the Pearson average
# printed above excludes Other-Irrelevant from its denominator.
str.format('{0:.4f}',priorityAvgCorr/len(informationTypes2Index))+
" \\\\")
print(runName+" & "+resultLine)
resultsFile.write("--------------------------------------------------"+"\n")
resultsFile.write("LATEX"+"\n")
resultsFile.write("--------------------------------------------------"+"\n")
resultsFile.write(runName+" & "+resultLine + "\n")
Run & NDCG & CF1-H & CF1-A & CAcc & PErr-H & PErr-A & PCorr-H & PCorr-A \\ njit_eda & 0.4292 & 0.2531 & 0.2735 & 0.8892 & 0.1647 & 0.1531 & 0.1553 & 0.2258 \\
# Done
resultsFile.close()
perTopicFile.close()
perEventFile.close()
# header = [
# "Run",
# "date",
# "team",
# "description",
# "paper",
# "code",
# "nDCG@100",
# "Info-Type F1 [Actionable]",
# "Info-Type F1 [All]",
# "Info-Type Accuracy",
# "Priority F1 [Actionable]",
# "Priority F1 [All]",
# "Priority R [Actionable]",
# "Priority R [All]",
# ]
import csv
import json  # json.load is used below; imported here so this cell is self-contained
if os.path.isfile("metadata.json"):
    this_cwd = os.getcwd()
    sub_date_ = this_cwd.partition("submissions/")[-1].partition("-")[0]
    sub_date = "%s/%s/%s" % (sub_date_[:4], sub_date_[4:6], sub_date_[6:])
    leaderboard_entry = None
    with open("metadata.json", "r") as in_file:
        metadata = json.load(in_file)
        leaderboard_entry = [
            runName,
            sub_date,
            metadata["organization"].lower(),
            metadata["model_description"],
            metadata["paper"] if metadata["paper"].startswith("http") else "",
            metadata["code"] if metadata["code"].startswith("http") else "",
            str.format('{0:.4f}',system_ndcg_micro),
            str.format('{0:.4f}',avgF1High/numHighInformationTypes),
            str.format('{0:.4f}',avgF1/numInformationTypes),
            str.format('{0:.4f}',avgAccuracy/numInformationTypes),
            str.format('{0:.4f}',priorityAvgf1High/numHighInformationTypes),
            str.format('{0:.4f}',priorityAvgf1/len(informationTypes2Index)),
            str.format('{0:.4f}',priorityAvgCorrHigh/numHighInformationTypes),
            str.format('{0:.4f}',priorityAvgCorr/len(informationTypes2Index)),
        ]
    with open(runName+".v"+str(version)+"."+edition+".leaderboard.csv","w") as csvResultsFile:
        leader_writer = csv.writer(csvResultsFile)
        leader_writer.writerow(leaderboard_entry)
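The CSV written above contains a single run row and no header. A hedged sketch of how a downstream script might collect these per-run files into one leaderboard using the commented-out header fields above (the output filename and directory layout are assumptions):
# Hypothetical aggregation sketch: merge single-row per-run leaderboard CSVs
# (as written above) into one file. Filenames/paths here are assumptions.
import csv
import glob

header = [
    "Run", "date", "team", "description", "paper", "code",
    "nDCG@100",
    "Info-Type F1 [Actionable]", "Info-Type F1 [All]", "Info-Type Accuracy",
    "Priority F1 [Actionable]", "Priority F1 [All]",
    "Priority R [Actionable]", "Priority R [All]",
]

with open("leaderboard.2021b.all.csv", "w", newline="") as out_file:
    writer = csv.writer(out_file)
    writer.writerow(header)
    for path in sorted(glob.glob("*.leaderboard.csv")):
        with open(path, newline="") as in_file:
            for row in csv.reader(in_file):
                writer.writerow(row)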