# --------------------------------------------------
# TREC IS 2021b Evaluation Script
# Configured for 2021-B Events
# Used to evaluate TREC-IS runs
# --------------------------------------------------
version = 3.0 # Notebook Version Number
edition = "2021b.all"
import os
cwd = os.getcwd()
# Configuration Information
# Do we try and normalize the run priority scores?
# NOTE: this flag is assigned again (to False) further down, right before
# the data paths are configured, so the value set here does not survive.
enablePriorityNorm = True
# NOTE(review): presumably the category-score counterpart of
# enablePriorityNorm -- it is not read in this part of the script; confirm.
enableCategoryNorm = True
# Score threshold
defaultScoreThreshold = 0.5
# Information type identifiers, written as "<Group>-<Type>".
# The ORDER of this list is significant: the position of each flag in a
# run's "info_type_labels" array is mapped to a category through this list,
# and ontology ids are converted to numeric indices with
# taskCategories.index(). Do not reorder or insert entries casually.
taskCategories = [
"CallToAction-Donations",
"CallToAction-MovePeople",
"CallToAction-Volunteer",
"Other-Advice",
"Other-ContextualInformation",
"Other-Discussion",
"Other-Irrelevant",
"Other-Sentiment",
"Report-CleanUp",
"Report-EmergingThreats",
"Report-Factoid",
"Report-FirstPartyObservation",
"Report-Hashtags",
"Report-Location",
"Report-MultimediaShare",
"Report-News",
"Report-NewSubEvent",
"Report-Official",
"Report-OriginalEvent",
"Report-ServiceAvailable",
"Report-ThirdPartyObservation",
"Report-Weather",
"Request-GoodsServices",
"Request-InformationWanted",
"Request-SearchAndRescue",
]
# What we consider to be highly important categories of information
# (full "<Group>-<Type>" identifiers)
highImportCategories = [
    "Request-GoodsServices",
    "Request-SearchAndRescue",
    "CallToAction-MovePeople",
    "Report-EmergingThreats",
    "Report-NewSubEvent",
    "Report-ServiceAvailable",
]
# Short forms of the identifiers above (the part after the "-"), kept in the
# same order; derived so that the two lists cannot drift apart.
highImportCategoriesShort = [
    fullId.split("-")[1] for fullId in highImportCategories
]
# Priority map: numeric score for each priority label.
# The run's numeric priority is bucketed into labels using the "High",
# "Medium" and "Low" values of this map as thresholds.
# NOTE: "Low" and "Unknown" share the score 0.25, so a score -> label
# inversion of this map cannot tell the two labels apart.
priorityScoreMap = {
"Critical": 1.0,
"High": 0.75,
"Medium": 0.5,
"Low": 0.25,
"Unknown": 0.25,
}
# Parameters
var_lambda = 0.75 # weight to place on actionable information categories in comparison to non-actionable categories
var_alpha = 0.3 # Flat gain for providing a correct alert, regardless of the categories selected
# Events with no data, so we should skip them.
# Topics, ground-truth events and run entries whose event id appears in this
# list are excluded from the evaluation.
# NOTE(review): the previous comment here read "Updated from 2021a and 2021b,
# so we use *all* data", yet the list below is active and non-empty --
# confirm which behaviour is intended for this edition.
# (A commented-out, alphabetically sorted copy of exactly these entries used
# to sit at the top of the literal; it was redundant and has been dropped.)
skipEvents = [
"2020_05_26_edenville_dam_failure.2020.corrected",
"2018_10_07_hurricane_michael.2018",
"2020_01_28_bar_shooting_nc.2020",
"T2020_02_03_texas_university_shooting.2020",
"2020_02_07_rutherford_tn_floods.2020",
"UNASSIGNED",
"indonesia_earthquake.2019",
"2015_09_28_hurricane_joaquin.2015",
"2017_03_23_cyclone_debbie.2017",
"2018_02_24_anticyclone_hartmut.2018",
"2018_07_13_ferguson_wildfire.2018",
"2018_07_23_cranston_wildfire.2018",
"2018_09_07_hurricane_florence.2018",
"2019_09_17_tropicalstorm_imelda.2019",
"2019_karnataka_floods",
"2019_spring_floods_in_ontario_quebec_and_new_brunswick",
"2020_08_27_hurricane_laura.2020",
"2020_09_11_hurricane_sally.2020",
"2020_afghanistan_flood",
"2020_hpakant_jade_mine_disaster",
"2020_kerala_floods",
]
import glob
# Locate the run to evaluate: a gzipped file (*.gz) in the current working
# directory. If several archives are present, the last one returned by glob
# is used.
runFile = None
for f in glob.glob("*.gz"):
    runFile = f
if runFile is None:
    # Fail here with a clear message instead of a NameError/TypeError later.
    raise FileNotFoundError("No *.gz run file found in " + os.getcwd())
print("Run File:", runFile)
# Output: Run File: run.json.gz
import gzip
import json
# Read the run tag from the first record of the run file. Each line of the
# run is a UTF-8 encoded JSON object; only the first one is inspected.
# runName stays None when the file is empty.
runName = None
with gzip.open(runFile, "r") as inRunFile:
    firstRecord = next(iter(inRunFile), None)
    if firstRecord is not None:
        runName = json.loads(firstRecord.decode("utf8"))["runtag"]
print("Run Name:", runName)
# Output: Run Name: STrans-GaussianNB
# Do we try and normalize the run priority scores?
# NOTE: this overrides the value assigned in the configuration section at
# the top of the script.
enablePriorityNorm = False
# Directory holding the topics, ground truth labels and ontology
# (relative to the current working directory)
dataDir = "../../data/2021b"
# The location of the topics file
# NOTE(review): the file is named "2021a.topics" although it lives in the
# 2021b data directory -- confirm this is the intended topic set.
topicsFile = "%s/2021a.topics" % dataDir
# The location of the ground truth data against which to compare the run.
# Every file in this list is loaded; only the combined "2021.all" label file
# is currently active, the others are kept for reference.
classificationLabelFiles = [
# "%s/TRECIS-2021A-crisis.labels.prelim.json" % dataDir,
# "%s/TRECIS-2021A-crisis.labels.prelim.pt2.json" % dataDir,
# "%s/TRECIS-crisis.labels.2021b.json" % dataDir,
"%s/TRECIS-crisis.labels.2021.all.json" % dataDir,
]
# The location of the ontology file
ontologyFile = "%s/TRECIS-2021A-ITypes.json" % dataDir
# Parse the topics file into a list of (dataset, topic number) pairs.
# A pair is emitted at every closing </top> tag, using the most recent
# <num> and <dataset> values seen; topics whose dataset is listed in
# skipEvents are dropped.
topicArray = []
with open(topicsFile, "r") as inTopicsFile:
    topicNum = None
    topicDataset = None
    for rawLine in inTopicsFile:
        line = rawLine.strip()
        if line == "</top>":
            if topicDataset not in skipEvents:
                topicArray.append((topicDataset, topicNum))
        elif line.startswith("<num>"):
            topicNum = line.partition("<num>")[2].partition("</num>")[0]
        elif line.startswith("<dataset>"):
            topicDataset = line.partition("<dataset>")[2].partition("</dataset>")[0]
for topicEntry in topicArray:
    print(topicEntry)
('2020_01_27_houston_explosion.2020', 'TRECIS-CTIT-H-076') ('2020_02_10_mideast_tornadoes.day1_mississipi.2020', 'TRECIS-CTIT-H-080') ('2020_02_10_mideast_tornadoes.day2_al.2020', 'TRECIS-CTIT-H-081') ('2020_02_10_mideast_tornadoes.day3_md.2019', 'TRECIS-CTIT-H-082') ('2020_05_06_tn_derecho.2020', 'TRECIS-CTIT-H-083') ('brooklynblockparty_shooting.2019', 'TRECIS-CTIT-H-085') ('2016_puttingal_temple', 'TRECIS-CTIT-H-089') ('2017_12_04_thomas_wildfire.2017', 'TRECIS-CTIT-H-091') ('2017_12_07_lilac_wildfire.2017', 'TRECIS-CTIT-H-092') ('2018_07_23_klamathon_wildfire.2018', 'TRECIS-CTIT-H-096') ('2018_08_05_holy_wildfire.2018', 'TRECIS-CTIT-H-097') ('2018_11_07_Woolsey_wildfire.2018', 'TRECIS-CTIT-H-100') ('2018_maryland_flood', 'TRECIS-CTIT-H-101') ('2018_pittsburgh_synagogue_shooting', 'TRECIS-CTIT-H-102') ('2019_03_01_alberta_wildfire.2019.v2', 'TRECIS-CTIT-H-103') ('2019_08_25_hurricane_dorian.2019', 'TRECIS-CTIT-H-104') ('2019_10_10_saddleridge_wildfire.2019', 'TRECIS-CTIT-H-106') ('2019_10_25_kincade_wildfire.2019', 'TRECIS-CTIT-H-107') ('2019_durham_gas_explosion', 'TRECIS-CTIT-H-108') ('2019_saugus_high_school_shooting', 'TRECIS-CTIT-H-110') ('2019_townsville_flood', 'TRECIS-CTIT-H-112') ('2020_easter_tornado_outbreak', 'TRECIS-CTIT-H-116') ('2020_tornado_outbreak_of_april', 'TRECIS-CTIT-H-119') ('2020_tornado_outbreak_of_march', 'TRECIS-CTIT-H-120') ('2020_visakhapatnam_gas_leak', 'TRECIS-CTIT-H-121') ('tornado_outbreak_of_november_30_december_2018', 'TRECIS-CTIT-H-122')
# --------------------------------------------------
# Static data for the 2021 edition
# --------------------------------------------------
# Identifiers for the test events
eventidTopicidMap = dict(topicArray)  # event id -> topic id
eventIdentifiers = list(eventidTopicidMap)


def _openResultsFile(suffix):
    """Create "<run>.results.v<version>.<edition>.<suffix>.txt" and write its header.

    The file is returned still open; later stages keep writing to it.
    """
    outFile = open(runName+".results.v"+str(version)+"."+edition+"."+suffix+".txt","w+")
    outFile.write("TREC-IS "+edition+" Notebook Evaluator v"+str(version)+"\n")
    outFile.write("Run: "+runName+" ("+runFile+")"+"\n")
    outFile.write(""+"\n")
    return outFile


# One output file per reporting granularity
resultsFile = _openResultsFile("overall")
perTopicFile = _openResultsFile("pertopic")
perEventFile = _openResultsFile("perevent")
1
# --------------------------------------------------
# Processing Starts Here
# --------------------------------------------------
import json
import gzip
import math
import numpy as np
from pprint import pprint
import matplotlib.pyplot as plt
# --------------------------------------------------
# Stage 1: Load the ground truth dataset
# --------------------------------------------------
# One parsed JSON document per configured label file, in list order
groundtruthJSON = []
for groundtruthFile in classificationLabelFiles:
    print("Reading "+groundtruthFile)
    with open(groundtruthFile, encoding='iso-8859-1') as groundtruthJSONFile:
        groundtruthJSON.append(json.loads(groundtruthJSONFile.read()))
#pprint(groundtruthJSON["events"])
# --------------------------------------------------
# Stage 2: Load run file
# --------------------------------------------------
# The run is gzipped JSON-lines: every line is decoded from UTF-8 and parsed
# into one prediction record.
with gzip.open(runFile, "r") as openRunFile:
    runContents = [json.loads(rawLine.decode("utf8")) for rawLine in openRunFile]
#pprint(runContents[0])
# Output: Reading ../../data/2021b/TRECIS-crisis.labels.2021.all.json
# --------------------------------------------------
# Stage 3: Load the categories
# --------------------------------------------------
with open(ontologyFile, encoding='utf-8') as ontologyJSONFile:
    ontologyJSON = json.load(ontologyJSONFile)
informationTypes2Index = {} # category -> numerical index
informationTypesShort2Index = {} # category short form (e.g. Report-EmergingThreats vs. EmergingThreats) -> numerical index
for ontologyEntry in ontologyJSON["informationTypes"]:
    fullId = ontologyEntry["id"]
    # the index is the category's position in taskCategories; an id that is
    # not listed there raises ValueError
    position = taskCategories.index(fullId)
    informationTypes2Index[fullId] = position
    informationTypesShort2Index[fullId.split("-")[1]] = position
# -----------------------------------------------------------
# Stage 4: Produce ground truth maps between tweetIds and categories
# -----------------------------------------------------------
# Notes: Ground truth is used as a base, if a run includes tweets
# not in the ground truth they will be ignored
# Assumptions: A tweet will not be returned for multiple events
tweetId2TRECInfoCategories = {} # tweet id -> Array of categories selected by assessors
tweetId2TRECHighImportInfoCategories = {} # tweet id -> Array of high importance categories selected by assessors
tweetId2TRECLowImportInfoCategories = {} # tweet id -> Array of low importance categories selected by assessors
tweetId2TRECPriorityCategory = {} # tweet id -> priority label (Critical,High,Medium,Low)
index2TweetId = {} # ordered tweets
event2tweetIds = {} # event -> tweet ids for tweets within that event
countHighCriticalImport = 0
countLowMediumImport = 0
tweetsSeen = [] # tweet ids in first-seen order (kept as a list for compatibility)
# Kept for compatibility with code that may read it. It is NOT used for
# merging below: "Low" and "Unknown" share a score, so inverting the map
# collapses both onto "Unknown".
invertedPriorityScoreMap = {
    v:k for k,v in priorityScoreMap.items()
}
tweetIndex = 0
for groundtruth in groundtruthJSON:
    for eventJSON in groundtruth["events"]:
        eventid = eventJSON["eventid"]
        print(eventid)
        if eventid in skipEvents:
            continue
        # NOTE(review): this is a substring match against the topic event ids,
        # not an exact comparison -- confirm that is intended.
        if not any(eventid in s for s in eventIdentifiers):
            print("WARN: Found ground truth data for event not in the topic set "+eventid+", ignoring...")
            continue
        # Only events that are actually evaluated get an entry. Registering
        # ignored events with an empty tweet list would give them an ideal
        # DCG of 0 in the later ranking evaluation.
        eventTweetIds = event2tweetIds.setdefault(eventid, [])
        # iterate over tweets in the event
        for tweetJSON in eventJSON["tweets"]:
            tweetid = tweetJSON["postID"]
            categories = tweetJSON["postCategories"]
            priority = tweetJSON["postPriority"]
            if priority == "High" or priority == "Critical":
                countHighCriticalImport = countHighCriticalImport + 1
            if priority == "Low" or priority == "Medium":
                countLowMediumImport = countLowMediumImport + 1
            # check categories for name issues and correct if possible
            cleanedCategories = []
            highImportCats = []
            lowImportCats = []
            for categoryId in categories:
                if not any(categoryId in s for s in informationTypesShort2Index.keys()):
                    # unknown category in the ground truth, ignore it
                    continue
                cleanedCategories.append(categoryId)
                if any(categoryId in s for s in highImportCategoriesShort):
                    highImportCats.append(categoryId)
                else:
                    lowImportCats.append(categoryId)
            # Dict membership is O(1); the maps below and tweetsSeen are
            # always updated together, so this is equivalent to scanning
            # the tweetsSeen list.
            if tweetid not in tweetId2TRECInfoCategories:
                eventTweetIds.append(tweetid)
                tweetId2TRECInfoCategories[tweetid] = cleanedCategories
                tweetId2TRECHighImportInfoCategories[tweetid] = highImportCats
                tweetId2TRECLowImportInfoCategories[tweetid] = lowImportCats
                tweetId2TRECPriorityCategory[tweetid] = priority
                index2TweetId[tweetIndex] = tweetid
                tweetIndex = tweetIndex + 1
                tweetsSeen.append(tweetid)
            else:
                # Tweet labelled more than once: take the union of the
                # categories and keep the higher of the two priorities.
                # NOTE(review): the high/low importance maps are not merged
                # here, only the combined category map -- confirm.
                tweetId2TRECInfoCategories[tweetid] = list(set(
                    cleanedCategories + tweetId2TRECInfoCategories[tweetid]
                ))
                prePriorityScore = priorityScoreMap[tweetId2TRECPriorityCategory[tweetid]]
                thisPriorityScore = priorityScoreMap[priority]
                # Compare labels directly; on a tie the existing label is
                # kept, so two "Low" judgements no longer become "Unknown".
                if thisPriorityScore > prePriorityScore:
                    tweetId2TRECPriorityCategory[tweetid] = priority
2020_01_27_houston_explosion.2020 2020_01_28_bar_shooting_nc.2020 T2020_02_03_texas_university_shooting.2020 2020_02_07_rutherford_tn_floods.2020 2020_02_10_mideast_tornadoes.day1_mississipi.2020 2020_02_10_mideast_tornadoes.day2_al.2020 2020_02_10_mideast_tornadoes.day3_md.2019 2020_05_06_tn_derecho.2020 2020_05_26_edenville_dam_failure.2020.corrected brooklynblockparty_shooting.2019 UNASSIGNED indonesia_earthquake.2019 2015_09_28_hurricane_joaquin.2015 2016_puttingal_temple 2017_03_23_cyclone_debbie.2017 2017_12_04_thomas_wildfire.2017 2017_12_07_lilac_wildfire.2017 2018_02_24_anticyclone_hartmut.2018 2018_07_13_ferguson_wildfire.2018 2018_07_23_cranston_wildfire.2018 2018_07_23_klamathon_wildfire.2018 2018_08_05_holy_wildfire.2018 2018_09_07_hurricane_florence.2018 2018_10_07_hurricane_michael.2018 2018_11_07_Woolsey_wildfire.2018 2018_maryland_flood 2018_pittsburgh_synagogue_shooting 2019_03_01_alberta_wildfire.2019.v2 2019_08_25_hurricane_dorian.2019 2019_09_17_tropicalstorm_imelda.2019 2019_10_10_saddleridge_wildfire.2019 2019_10_25_kincade_wildfire.2019 2019_durham_gas_explosion 2019_karnataka_floods 2019_saugus_high_school_shooting 2019_spring_floods_in_ontario_quebec_and_new_brunswick 2019_townsville_flood 2020_08_27_hurricane_laura.2020 2020_09_11_hurricane_sally.2020 2020_afghanistan_flood 2020_easter_tornado_outbreak 2020_hpakant_jade_mine_disaster 2020_kerala_floods 2020_tornado_outbreak_of_april 2020_tornado_outbreak_of_march 2020_visakhapatnam_gas_leak tornado_outbreak_of_november_30_december_2018
# -----------------------------------------------------------
# Stage 5: Produce run predicted maps between tweetIds and categories
# -----------------------------------------------------------
tweetId2RunInfoCategories = {} # tweet id -> predicted category by participant system
tweetId2RunHighImportInfoCategories = {} # tweet id -> predicted high importance categories
tweetId2RunLowImportInfoCategories = {} # tweet id -> predicted low importance categories
tweetId2RunInfoCategoriesProb = {} # tweet id -> predicted category probability by participant system
tweetId2RunInfoCategoriesProbNorm = {} # tweet id -> predicted category probability by participant system
tweetId2RunPriorityScore = {} # tweet id -> importance score from participant system
tweetId2RunPriorityCategory = {} # tweet id -> importance category (Critical, High, Medium Low)
tweetId2RunPriorityScoreNorm = {} # tweet id -> importance score from participant system
event2TweetIdRank = {} # event -> (tweetid,rank)
# Running extremes of the priority and category scores seen in the run
maxPrediction = -999999
minPrediction = 999999
maxCategory = -999999
minCategory = 999999
for predictionParts in runContents:
    if len(predictionParts) < 6:
        # Record with too few fields: report it and move on. (This used to
        # print an undefined name and abort with a NameError.)
        print(predictionParts)
        continue
    eventId = predictionParts["topic"]
    if eventId in skipEvents:
        continue
    tweetId = predictionParts["tweet_id"]
    rank = 0 # every entry gets the same rank, so the sort below keeps run order
    category_scores = predictionParts["info_type_scores"]
    category_labels = predictionParts["info_type_labels"]
    priority = float(predictionParts["priority"])
    if priority > maxPrediction:
        maxPrediction = priority
    if priority < minPrediction:
        minPrediction = priority
    cleanedCategories = []
    cleanedCategoriesProbs = []
    highImportCats = []
    lowImportCats = []
    # Handle category flags: position i of the label array refers to
    # taskCategories[i]; a 0 means the category was not selected.
    for catIndex, categoryLabel in enumerate(category_labels):
        if categoryLabel == 0:
            continue
        categoryId = taskCategories[catIndex]
        if not any(categoryId in s for s in informationTypes2Index.keys()):
            print("Found unknown category in run "+categoryId+", ignoring...")
        else:
            cleanedCategories.append(categoryId)
            if any(categoryId in s for s in highImportCategories):
                highImportCats.append(categoryId)
            else:
                lowImportCats.append(categoryId)
    # Process category probabilities
    for categoryProbability in category_scores:
        if categoryProbability > maxCategory:
            maxCategory = categoryProbability
        if categoryProbability < minCategory:
            minCategory = categoryProbability
        cleanedCategoriesProbs.append(categoryProbability)
    tweetId2RunHighImportInfoCategories[tweetId] = highImportCats
    tweetId2RunLowImportInfoCategories[tweetId] = lowImportCats
    tweetId2RunInfoCategories[tweetId] = cleanedCategories
    tweetId2RunInfoCategoriesProb[tweetId] = cleanedCategoriesProbs
    tweetId2RunPriorityScore[tweetId] = priority
    # Bucket the numeric priority into a label using the score map thresholds
    if priority > priorityScoreMap["High"]:
        tweetId2RunPriorityCategory[tweetId] = "Critical"
    elif priority > priorityScoreMap["Medium"]:
        tweetId2RunPriorityCategory[tweetId] = "High"
    elif priority > priorityScoreMap["Low"]:
        tweetId2RunPriorityCategory[tweetId] = "Medium"
    else:
        tweetId2RunPriorityCategory[tweetId] = "Low"
    event2TweetIdRank.setdefault(eventId, []).append((tweetId, rank))
for eventId in event2TweetIdRank.keys():
    event2TweetIdRank[eventId] = sorted(event2TweetIdRank[eventId], key=lambda tup: tup[1])
# Normalised priority score for every ground truth tweet. Tweets the run did
# not return get 0.0.
priorityRange = maxPrediction - minPrediction
for i in range(len(index2TweetId)):
    tweetId = index2TweetId[i]
    runScore = tweetId2RunPriorityScore.get(tweetId)
    if runScore is None:
        # explicit None test: a legitimate score of 0.0 is not "missing"
        tweetId2RunPriorityScoreNorm[tweetId] = 0.0
    elif not enablePriorityNorm:
        tweetId2RunPriorityScoreNorm[tweetId] = runScore
    elif priorityRange == 0.0:
        # all scores identical: min-max scaling is undefined, use 0.0
        # (the previous guard compared min with itself and always took this
        # branch)
        tweetId2RunPriorityScoreNorm[tweetId] = 0.0
    else:
        tweetId2RunPriorityScoreNorm[tweetId] = (runScore-minPrediction)/priorityRange
# --------------------------------------------------
# Stage 6: Create ground truth vectors per category
# --------------------------------------------------
# For every category, one 0/1 entry per ground truth tweet (in index2TweetId
# order): 1 when the category's short name occurs within one of the tweet's
# assessor categories.
category2GroundTruth = {} # category -> tweet vector with binary 1 vs all ground truth category labels
for categoryId in informationTypes2Index.keys():
    categoryIdShort = categoryId.split("-")[1]
    category2GroundTruth[categoryId] = [
        1 if any(
            categoryIdShort in s
            for s in tweetId2TRECInfoCategories.get(index2TweetId[i])
        ) else 0
        for i in range(len(index2TweetId))
    ]
#pprint(category2GroundTruth)
# --------------------------------------------------
# Stage 7: Create run vectors per category
# --------------------------------------------------
# Assumptions: If run misses a tweet, we assume it has
# no categories
category2Predicted = {} # category -> tweet vector with binary 1 vs all predicted by system labels
for categoryId in informationTypes2Index.keys():
    categoryIdShort = categoryId.split("-")[1]
    predictedVector = []
    for i in range(len(index2TweetId)):
        # a missing or empty prediction list both yield 0
        predictedCategories = tweetId2RunInfoCategories.get(index2TweetId[i]) or []
        matched = any(categoryIdShort in s for s in predictedCategories)
        predictedVector.append(1 if matched else 0)
    category2Predicted[categoryId] = predictedVector
#pprint(category2Predicted)
# --------------------------------------------------
# Stage 8: Make event category vectors
# --------------------------------------------------
# Same 0/1 vectors as the per-category vectors, but built separately for
# each event over that event's ground truth tweets.
event2groundtruth = {} # event -> category -> tweet vector with binary 1 vs all ground truth category labels
for eventId in eventIdentifiers:
    # A topic event without any ground truth tweets yields empty vectors
    # instead of failing on a None lookup.
    eventTweetIds = event2tweetIds.get(eventId) or []
    eventCategories = {}
    for categoryId in informationTypes2Index.keys():
        categoryIdShort = categoryId.split("-")[1]
        categoryVector = []
        for tweetId in eventTweetIds:
            categories = tweetId2TRECInfoCategories.get(tweetId)
            if any(categoryIdShort in s for s in categories):
                categoryVector.append(1)
            else:
                categoryVector.append(0)
        eventCategories[categoryId] = categoryVector
    event2groundtruth[eventId] = eventCategories
event2prediction = {} # event -> category -> tweet vector with binary 1 vs all predicted by system labels
for eventId in eventIdentifiers:
    print(eventId)
    eventTweetIds = event2tweetIds.get(eventId) or []
    eventCategories = {}
    for categoryId in informationTypes2Index.keys():
        categoryVector = []
        for tweetId in eventTweetIds:
            categories = tweetId2RunInfoCategories.get(tweetId)
            if categories is None:
                # The run did not return this tweet: record an empty
                # prediction so later lookups find an entry.
                categories = []
                tweetId2RunInfoCategories[tweetId] = categories
            # run categories are full ids, so the full id is matched here
            if any(categoryId in s for s in categories):
                categoryVector.append(1)
            else:
                categoryVector.append(0)
        eventCategories[categoryId] = categoryVector
    event2prediction[eventId] = eventCategories
2020_01_27_houston_explosion.2020 2020_02_10_mideast_tornadoes.day1_mississipi.2020 2020_02_10_mideast_tornadoes.day2_al.2020 2020_02_10_mideast_tornadoes.day3_md.2019 2020_05_06_tn_derecho.2020 brooklynblockparty_shooting.2019 2016_puttingal_temple 2017_12_04_thomas_wildfire.2017 2017_12_07_lilac_wildfire.2017 2018_07_23_klamathon_wildfire.2018 2018_08_05_holy_wildfire.2018 2018_11_07_Woolsey_wildfire.2018 2018_maryland_flood 2018_pittsburgh_synagogue_shooting 2019_03_01_alberta_wildfire.2019.v2 2019_08_25_hurricane_dorian.2019 2019_10_10_saddleridge_wildfire.2019 2019_10_25_kincade_wildfire.2019 2019_durham_gas_explosion 2019_saugus_high_school_shooting 2019_townsville_flood 2020_easter_tornado_outbreak 2020_tornado_outbreak_of_april 2020_tornado_outbreak_of_march 2020_visakhapatnam_gas_leak tornado_outbreak_of_november_30_december_2018
# -----------------------------------------------------------
# Stage 9: Make priority classification vectors
# -----------------------------------------------------------
# For every category the vectors below cover only the tweets whose GROUND
# TRUTH categories contain that category (in index2TweetId order), so the
# ground truth and predicted vectors line up entry by entry.
category2GroundTruthPriority = {} # category -> vector of ground truth priority labels
category2PredictedPriority = {} # category -> vector of priority labels predicted by the system
category2PredictedPriorityScore = {} # Category -> tweet vector with priority scores
for categoryId in informationTypes2Index.keys():
    categoryIdShort = categoryId.split("-")[1]
    truthLabels = []
    predictedLabels = []
    predictedScores = []
    for i in range(len(index2TweetId)):
        tweetId = index2TweetId[i]
        if not any(categoryIdShort in s for s in tweetId2TRECInfoCategories.get(tweetId)):
            continue
        truthLabels.append(tweetId2TRECPriorityCategory.get(tweetId))
        runLabel = tweetId2RunPriorityCategory.get(tweetId)
        if runLabel:
            predictedLabels.append(runLabel)
            predictedScores.append(tweetId2RunPriorityScore.get(tweetId))
        else:
            # tweet missing from the run: default to low priority
            # (0.25 is the "Low" value of priorityScoreMap)
            predictedLabels.append("Low")
            predictedScores.append(0.25)
    category2GroundTruthPriority[categoryId] = truthLabels
    category2PredictedPriority[categoryId] = predictedLabels
    category2PredictedPriorityScore[categoryId] = predictedScores
# --------------------------------------------------
# Disable Warnings (comment this out when debugging!)
# --------------------------------------------------
import warnings
# warnings.filterwarnings("ignore") # ignore warnings about 0-score categories
# --------------------------------------------------
# TREC-IS 2021A
# Priority-Centric Discounted Cumulative Gain
# --------------------------------------------------
import pandas as pd
def calc_dcg(scores, at_k=100):
    """Return the discounted cumulative gain of a ranked list of scores.

    Only the first ``at_k`` entries of ``scores`` are considered. The entry
    at 1-based position ``p`` contributes ``(2 ** score - 1) / log2(p + 1)``.

    Args:
        scores: sliceable sequence of numeric relevance scores, best-ranked
            first.
        at_k: cut-off depth of the ranking.

    Returns:
        The accumulated gain; ``0.0`` for an empty input.
    """
    accumulator = 0.0
    for position, score in enumerate(scores[:at_k], start=1):
        accumulator += (2 ** score - 1) / np.log2(position + 1)
    return accumulator
# Gain assigned to each priority label when computing DCG
priority_map = {
    "Unknown": 1,
    "Low": 1,
    "Medium": 2,
    "High": 3,
    "Critical": 4,
}
# Ranking cut-off used for every DCG computation
at_k = 100
# tweet id -> gain of its ground truth priority label
tweetId2TRECPriorityCategory_score = {}
for gtTweetId, gtLabel in tweetId2TRECPriorityCategory.items():
    tweetId2TRECPriorityCategory_score[gtTweetId] = priority_map[gtLabel]
# All ground truth gains, best first. This name is reassigned for every
# event inside the loop below.
tweetId2TRECPriorityCategory_scores_sorted = sorted(
    tweetId2TRECPriorityCategory_score.values(),
    reverse=True
)
# Ideal DCG per event: the event's ground truth gains in descending order
best_dcg_per_event = {}
for event, rel_tweets in event2tweetIds.items():
    print(event)
    eventGains = [tweetId2TRECPriorityCategory_score[x] for x in rel_tweets]
    eventGains.sort(reverse=True)
    tweetId2TRECPriorityCategory_scores_sorted = eventGains
    ideal_dcg = calc_dcg(tweetId2TRECPriorityCategory_scores_sorted, at_k)
    print("\tBest DCG:", ideal_dcg)
    best_dcg_per_event[event] = ideal_dcg
print("Mean:", np.mean(list(best_dcg_per_event.values())))
print()
# Code below calculates the DCG for a system's
# ranked priority tweets. We have to do some
# sampling here to break ties among tweets with
# the same priority scores.
# Build a dataframe from the system's provided
# priority labels, so we can identify what the
# top-most priorities are and get a count of
# the number of tweets in each priority bin.
priority_df = pd.DataFrame(
    [(k, priority_map[v]) for k, v in tweetId2RunPriorityCategory.items()],
    columns=["tweet_id", "priority"]
)
# Build metrics for each event
system_dcg_per_event = {} # event -> nDCG of the system for that event
for event, rel_tweets in event2tweetIds.items():
    print("Event:", event)
    best_dcg = best_dcg_per_event[event]
    if best_dcg == 0:
        # No achievable gain for this event (no ground truth tweets), so
        # nDCG is undefined. Skip it instead of dividing by zero and
        # turning the overall mean into NaN.
        print("\tnDCG: undefined (ideal DCG is 0), skipping event")
        continue
    # Restrict the system's predictions to this event's ground truth tweets
    local_priority_df = priority_df[priority_df["tweet_id"].isin(set(rel_tweets))]
    unique_scores = local_priority_df["priority"].value_counts()
    # Find the top priority scores that would be included
    # in the necessary at_k values.
    total = 0
    top_keys = []
    candidates = {}
    for top in sorted(unique_scores.index, reverse=True):
        # We store this key, so we can go back and shuffle
        # tweets with this score.
        top_keys.append(top)
        local_restricted_df = local_priority_df[local_priority_df["priority"] == top]
        candidates[top] = list(local_restricted_df["tweet_id"])
        total += local_restricted_df.shape[0]
        # Once we have enough samples, stop.
        if total > at_k:
            break
    # Now we generate distribution over the DCG for this
    # system and do this a number of times to remove
    # dependence on our selection of the top k tweets.
    # NOTE: the shuffle uses NumPy's global, unseeded RNG, so results vary
    # slightly from one execution to the next.
    random_dcgs = []
    for i in range(100):
        local_tweet_ids = []
        for top in top_keys:
            this_top_tweets = candidates[top][:]
            np.random.shuffle(this_top_tweets)
            needed = at_k - len(local_tweet_ids)
            local_tweet_ids.extend(this_top_tweets[:needed])
        # The sampled ranking is scored with the GROUND TRUTH gains
        local_scores = [tweetId2TRECPriorityCategory_score[x] for x in local_tweet_ids]
        # pass the cut-off explicitly so it always matches the ideal DCG's
        random_dcgs.append(calc_dcg(local_scores, at_k))
    system_dcg = np.mean(random_dcgs)
    system_ndcg_ = system_dcg / best_dcg
    print("\tnDCG:", system_ndcg_)
    system_dcg_per_event[event] = system_ndcg_
print()
# Unweighted mean of the per-event nDCG values
system_ndcg_micro = np.mean(list(system_dcg_per_event.values()))
print("System Event-Micro nDCG:", system_ndcg_micro)
resultsFile.write("--------------------------------------------------"+"\n")
resultsFile.write("EVALUATON: nDCG and Priority"+"\n")
resultsFile.write("Overall performance"+"\n")
resultsFile.write("--------------------------------------------------"+"\n")
resultsFile.write("> nDCG:"+"\t"+str(system_ndcg_micro)+"\n")
resultsFile.write(""+"\n")
2020_01_27_houston_explosion.2020 Best DCG: 176.99559032459564 2020_02_10_mideast_tornadoes.day1_mississipi.2020 Best DCG: 268.88459894996123 2020_02_10_mideast_tornadoes.day2_al.2020 Best DCG: 270.1716952398847 2020_02_10_mideast_tornadoes.day3_md.2019 Best DCG: 135.38775246204446 2020_05_06_tn_derecho.2020 Best DCG: 167.06354661312534 brooklynblockparty_shooting.2019 Best DCG: 179.1756130795261 2016_puttingal_temple Best DCG: 314.08006311421406 2017_12_04_thomas_wildfire.2017 Best DCG: 300.71399384300895 2017_12_07_lilac_wildfire.2017 Best DCG: 314.08006311421406 2018_07_23_klamathon_wildfire.2018 Best DCG: 221.46334445469358 2018_08_05_holy_wildfire.2018 Best DCG: 153.96993418707177 2018_11_07_Woolsey_wildfire.2018 Best DCG: 175.67469323453255 2018_maryland_flood Best DCG: 285.7119531591263 2018_pittsburgh_synagogue_shooting Best DCG: 111.85075929877581 2019_03_01_alberta_wildfire.2019.v2 Best DCG: 62.88708564345522 2019_08_25_hurricane_dorian.2019 Best DCG: 146.57069611996656 2019_10_10_saddleridge_wildfire.2019 Best DCG: 173.00802656786584 2019_10_25_kincade_wildfire.2019 Best DCG: 314.08006311421406 2019_durham_gas_explosion Best DCG: 201.07148118577902 2019_saugus_high_school_shooting Best DCG: 314.08006311421406 2019_townsville_flood Best DCG: 314.08006311421406 2020_easter_tornado_outbreak Best DCG: 214.9714167256293 2020_tornado_outbreak_of_april Best DCG: 314.08006311421406 2020_tornado_outbreak_of_march Best DCG: 267.51977363880474 2020_visakhapatnam_gas_leak Best DCG: 314.08006311421406 tornado_outbreak_of_november_30_december_2018 Best DCG: 314.08006311421406 Mean: 231.7589407554446 Event: 2020_01_27_houston_explosion.2020 nDCG: 0.2521290663344169 Event: 2020_02_10_mideast_tornadoes.day1_mississipi.2020 nDCG: 0.39187105517000154 Event: 2020_02_10_mideast_tornadoes.day2_al.2020 nDCG: 0.36325143319422737 Event: 2020_02_10_mideast_tornadoes.day3_md.2019 nDCG: 0.3072441473270432 Event: 2020_05_06_tn_derecho.2020 nDCG: 0.3648905016173573 Event: 
brooklynblockparty_shooting.2019 nDCG: 0.12473828382151113 Event: 2016_puttingal_temple nDCG: 0.23008515121464357 Event: 2017_12_04_thomas_wildfire.2017 nDCG: 0.2394525689309033 Event: 2017_12_07_lilac_wildfire.2017 nDCG: 0.2817468066690672 Event: 2018_07_23_klamathon_wildfire.2018 nDCG: 0.3870334000141965 Event: 2018_08_05_holy_wildfire.2018 nDCG: 0.3164778245171642 Event: 2018_11_07_Woolsey_wildfire.2018 nDCG: 0.26544580765844905 Event: 2018_maryland_flood nDCG: 0.23952976256868794 Event: 2018_pittsburgh_synagogue_shooting nDCG: 0.85195518472338 Event: 2019_03_01_alberta_wildfire.2019.v2 nDCG: 0.33713666033721296 Event: 2019_08_25_hurricane_dorian.2019 nDCG: 0.38495712225970063 Event: 2019_10_10_saddleridge_wildfire.2019 nDCG: 0.37578812483342056 Event: 2019_10_25_kincade_wildfire.2019 nDCG: 0.31032533865009204 Event: 2019_durham_gas_explosion nDCG: 0.2461146602232315 Event: 2019_saugus_high_school_shooting nDCG: 0.20273427200876964 Event: 2019_townsville_flood nDCG: 0.433700903054961 Event: 2020_easter_tornado_outbreak nDCG: 0.29376269794729887 Event: 2020_tornado_outbreak_of_april nDCG: 0.3496272662692206 Event: 2020_tornado_outbreak_of_march nDCG: 0.19671763406883652 Event: 2020_visakhapatnam_gas_leak nDCG: 0.42679551379873104 Event: tornado_outbreak_of_november_30_december_2018 nDCG: 0.584068211892998 System Event-Micro nDCG: 0.3368299768886739
1
# --------------------------------------------------
# TREC-IS 2021-A
# Information Type Categorization
# Overall performance
# --------------------------------------------------
# Average performance over information types
# Macro averaged (information types have equal weight)
# Does not average across events (larger events have more impact)
# Positive class is the target class
# Precision, recall and F1 only consider the positive class
# Accuracy is an overall metric
# We report performance for all categories, high importance categories and low importance categories
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
# Running totals of the per-category scores. These hold SUMS, not means:
# each total is only divided by its category count at reporting time.
avgPrecision = avgRecall = avgF1 = avgAccuracy = 0.0
avgPrecisionHigh = avgRecallHigh = avgF1High = avgAccuracyHigh = 0.0
avgPrecisionLow = avgRecallLow = avgF1Low = avgAccuracyLow = 0.0

for categoryId in informationTypes2Index.keys():
    truthLabels = category2GroundTruth[categoryId]
    runLabels = category2Predicted[categoryId]

    # Precision/recall/F1 are scored on the positive (target) class only;
    # accuracy is computed over all labels.
    categoryPrecision = precision_score(truthLabels, runLabels, average='binary')
    categoryRecall = recall_score(truthLabels, runLabels, average='binary')
    categoryF1 = f1_score(truthLabels, runLabels, average='binary')
    categoryAccuracy = accuracy_score(truthLabels, runLabels)

    avgPrecision += categoryPrecision
    avgRecall += categoryRecall
    avgF1 += categoryF1
    avgAccuracy += categoryAccuracy

    # A category counts as high importance when its id occurs inside one of
    # the configured highImportCategories names (substring test).
    isHighImportance = any(categoryId in name for name in highImportCategories)
    if isHighImportance:
        avgPrecisionHigh += categoryPrecision
        avgRecallHigh += categoryRecall
        avgF1High += categoryF1
        avgAccuracyHigh += categoryAccuracy
    else:
        avgPrecisionLow += categoryPrecision
        avgRecallLow += categoryRecall
        avgF1Low += categoryF1
        avgAccuracyLow += categoryAccuracy

# Denominators for the macro averages. The low-importance count is derived by
# subtraction, so it relies on every high-importance category being a key of
# informationTypes2Index.
numInformationTypes = len(informationTypes2Index)
numHighInformationTypes = len(highImportCategories)
numLowInformationTypes = numInformationTypes - numHighInformationTypes

# (label, running total, number of categories averaged over), in report order.
reportRows = [
    ("Information Type Precision (positive class, multi-type, macro)", avgPrecision, numInformationTypes),
    ("Information Type Recall (positive class, multi-type, macro)", avgRecall, numInformationTypes),
    ("Information Type F1 (positive class, multi-type, macro)", avgF1, numInformationTypes),
    ("Information Type Accuracy (overall, multi-type, macro)", avgAccuracy, numInformationTypes),
    ("High Importance Information Type Precision (positive class, multi-type, macro)", avgPrecisionHigh, numHighInformationTypes),
    ("High Importance Information Type Recall (positive class, multi-type, macro)", avgRecallHigh, numHighInformationTypes),
    ("High Importance Information Type F1 (positive class, multi-type, macro)", avgF1High, numHighInformationTypes),
    ("High Importance Information Type Accuracy (overall, multi-type, macro)", avgAccuracyHigh, numHighInformationTypes),
    ("Low Importance Information Type Precision (positive class, multi-type, macro)", avgPrecisionLow, numLowInformationTypes),
    ("Low Importance Information Type Recall (positive class, multi-type, macro)", avgRecallLow, numLowInformationTypes),
    ("Low Importance Information Type F1 (positive class, multi-type, macro)", avgF1Low, numLowInformationTypes),
    ("Low Importance Information Type Accuracy (overall, multi-type, macro)", avgAccuracyLow, numLowInformationTypes),
]

# Console summary first ...
for rowLabel, rowTotal, rowCount in reportRows:
    print(rowLabel + ": " + str(rowTotal / rowCount))

# ... then the same figures, tab separated, in the results file.
resultsFile.write("--------------------------------------------------" + "\n")
resultsFile.write("EVALUATON: Information Type Categorization" + "\n")
resultsFile.write("Overall performance" + "\n")
resultsFile.write("--------------------------------------------------" + "\n")
for rowLabel, rowTotal, rowCount in reportRows:
    resultsFile.write("> " + rowLabel + ":" + "\t" + str(rowTotal / rowCount) + "\n")
resultsFile.write("" + "\n")
Information Type Precision (positive class, multi-type, macro): 0.20055299282746117 Information Type Recall (positive class, multi-type, macro): 0.49657537195941076 Information Type F1 (positive class, multi-type, macro): 0.2575175230528391 Information Type Accuracy (overall, multi-type, macro): 0.8473964507637484 High Importance Information Type Precision (positive class, multi-type, macro): 0.13847359059256995 High Importance Information Type Recall (positive class, multi-type, macro): 0.5151469334598603 High Importance Information Type F1 (positive class, multi-type, macro): 0.2082992920427631 High Importance Information Type Accuracy (overall, multi-type, macro): 0.9117913913412007 Low Importance Information Type Precision (positive class, multi-type, macro): 0.22015701458584785 Low Importance Information Type Recall (positive class, multi-type, macro): 0.4907106683276899 Low Importance Information Type F1 (positive class, multi-type, macro): 0.273060122319179 Low Importance Information Type Accuracy (overall, multi-type, macro): 0.8270612063708688
1
# --------------------------------------------------
# TREC-IS 2021-A
# Information Type Categorization
# Per Information Type Performance
# --------------------------------------------------
# Per Category Classification Performance with confusion matrices
# Performance on the target class is what we care about here,
# primarily with respect to recall, as we want the user to
# see all of the information for a given category. A small
# amount of noise being added to the feed is an acceptable
# cost for good recall.
#
# Does not average across events (larger events have more impact)
from sklearn.metrics import classification_report
# Section banner for the per-information-type results file.
perTopicFile.write("--------------------------------------------------"+"\n")
perTopicFile.write("EVALUATON: Information Type Categorization (Multi-type)"+"\n")
perTopicFile.write("Per Information Type Performance"+"\n")
perTopicFile.write("--------------------------------------------------"+"\n")

for categoryId in informationTypes2Index.keys():
    # Row names for the report: the negative class first, then the target
    # category.
    target_names = ['Other Classes', categoryId]
    print(categoryId)

    # Build the report once and reuse it for both the console and the file.
    # Only the scoring call is guarded, so a ValueError raised by anything
    # else (e.g. writing to the file) is not reported as a category failure.
    try:
        categoryReport = classification_report(
            category2GroundTruth[categoryId],
            category2Predicted[categoryId],
            target_names=target_names,
        )
    except ValueError:
        print("Category "+categoryId+" score calculation failed, likely due the category not being used by the run")
        continue

    print(categoryReport)
    perTopicFile.write(categoryId+"\n")
    perTopicFile.write(categoryReport+"\n")
    perTopicFile.write(""+"\n")

# Blank line terminating the section.
perTopicFile.write(""+"\n")
CallToAction-Donations precision recall f1-score support Other Classes 1.00 0.98 0.99 55275 CallToAction-Donations 0.26 0.62 0.36 568 accuracy 0.98 55843 macro avg 0.63 0.80 0.68 55843 weighted avg 0.99 0.98 0.98 55843 CallToAction-MovePeople precision recall f1-score support Other Classes 0.99 0.95 0.97 54646 CallToAction-MovePeople 0.21 0.56 0.31 1197 accuracy 0.95 55843 macro avg 0.60 0.76 0.64 55843 weighted avg 0.97 0.95 0.96 55843 CallToAction-Volunteer precision recall f1-score support Other Classes 1.00 0.96 0.98 55543 CallToAction-Volunteer 0.07 0.57 0.13 300 accuracy 0.96 55843 macro avg 0.54 0.76 0.55 55843 weighted avg 0.99 0.96 0.97 55843 Other-Advice precision recall f1-score support Other Classes 0.97 0.81 0.89 52602 Other-Advice 0.17 0.63 0.27 3241 accuracy 0.80 55843 macro avg 0.57 0.72 0.58 55843 weighted avg 0.93 0.80 0.85 55843 Other-ContextualInformation precision recall f1-score support Other Classes 0.97 0.97 0.97 54346 Other-ContextualInformation 0.02 0.03 0.03 1497 accuracy 0.94 55843 macro avg 0.50 0.50 0.50 55843 weighted avg 0.95 0.94 0.94 55843 Other-Discussion precision recall f1-score support Other Classes 0.99 0.95 0.97 55263 Other-Discussion 0.02 0.11 0.04 580 accuracy 0.94 55843 macro avg 0.51 0.53 0.50 55843 weighted avg 0.98 0.94 0.96 55843 Other-Irrelevant precision recall f1-score support Other Classes 0.63 0.79 0.70 23267 Other-Irrelevant 0.82 0.66 0.73 32576 accuracy 0.72 55843 macro avg 0.72 0.73 0.71 55843 weighted avg 0.74 0.72 0.72 55843 Other-Sentiment precision recall f1-score support Other Classes 0.93 0.96 0.94 51270 Other-Sentiment 0.33 0.24 0.27 4573 accuracy 0.90 55843 macro avg 0.63 0.60 0.61 55843 weighted avg 0.88 0.90 0.89 55843 Report-CleanUp precision recall f1-score support Other Classes 1.00 0.80 0.89 55581 Report-CleanUp 0.01 0.65 0.03 262 accuracy 0.80 55843 macro avg 0.51 0.72 0.46 55843 weighted avg 0.99 0.80 0.88 55843 Report-EmergingThreats precision recall f1-score support Other Classes 0.98 0.80 
0.88 52454 Report-EmergingThreats 0.18 0.71 0.29 3389 accuracy 0.79 55843 macro avg 0.58 0.75 0.59 55843 weighted avg 0.93 0.79 0.84 55843 Report-Factoid precision recall f1-score support Other Classes 0.95 0.90 0.93 49844 Report-Factoid 0.42 0.59 0.49 5999 accuracy 0.87 55843 macro avg 0.69 0.75 0.71 55843 weighted avg 0.89 0.87 0.88 55843 Report-FirstPartyObservation precision recall f1-score support Other Classes 0.97 0.92 0.95 54135 Report-FirstPartyObservation 0.07 0.18 0.10 1708 accuracy 0.90 55843 macro avg 0.52 0.55 0.52 55843 weighted avg 0.95 0.90 0.92 55843 Report-Hashtags precision recall f1-score support Other Classes 0.89 0.66 0.76 48407 Report-Hashtags 0.18 0.49 0.27 7436 accuracy 0.64 55843 macro avg 0.54 0.58 0.51 55843 weighted avg 0.80 0.64 0.70 55843 Report-Location precision recall f1-score support Other Classes 0.84 0.73 0.78 41325 Report-Location 0.44 0.61 0.51 14518 accuracy 0.70 55843 macro avg 0.64 0.67 0.64 55843 weighted avg 0.74 0.70 0.71 55843 Report-MultimediaShare precision recall f1-score support Other Classes 0.93 0.70 0.80 48784 Report-MultimediaShare 0.24 0.63 0.34 7059 accuracy 0.70 55843 macro avg 0.58 0.67 0.57 55843 weighted avg 0.84 0.70 0.74 55843 Report-News precision recall f1-score support Other Classes 0.96 0.71 0.82 50324 Report-News 0.22 0.72 0.33 5519 accuracy 0.71 55843 macro avg 0.59 0.72 0.57 55843 weighted avg 0.89 0.71 0.77 55843 Report-NewSubEvent precision recall f1-score support Other Classes 0.99 0.88 0.93 54728 Report-NewSubEvent 0.07 0.44 0.12 1115 accuracy 0.87 55843 macro avg 0.53 0.66 0.52 55843 weighted avg 0.97 0.87 0.91 55843 Report-Official precision recall f1-score support Other Classes 0.98 0.75 0.85 53203 Report-Official 0.11 0.65 0.19 2640 accuracy 0.74 55843 macro avg 0.54 0.70 0.52 55843 weighted avg 0.94 0.74 0.81 55843 Report-OriginalEvent precision recall f1-score support Other Classes 0.95 0.92 0.94 52838 Report-OriginalEvent 0.14 0.23 0.17 3005 accuracy 0.88 55843 macro avg 0.55 0.57 0.56 
55843 weighted avg 0.91 0.88 0.90 55843 Report-ServiceAvailable precision recall f1-score support Other Classes 0.98 0.94 0.96 53834 Report-ServiceAvailable 0.26 0.52 0.35 2009 accuracy 0.93 55843 macro avg 0.62 0.73 0.65 55843 weighted avg 0.96 0.93 0.94 55843 Report-ThirdPartyObservation precision recall f1-score support Other Classes 0.94 0.78 0.85 50379 Report-ThirdPartyObservation 0.21 0.52 0.30 5464 accuracy 0.76 55843 macro avg 0.57 0.65 0.58 55843 weighted avg 0.87 0.76 0.80 55843 Report-Weather precision recall f1-score support Other Classes 0.97 0.88 0.93 50824 Report-Weather 0.39 0.76 0.52 5019 accuracy 0.87 55843 macro avg 0.68 0.82 0.72 55843 weighted avg 0.92 0.87 0.89 55843 Request-GoodsServices precision recall f1-score support Other Classes 1.00 0.96 0.98 55452 Request-GoodsServices 0.07 0.38 0.12 391 accuracy 0.96 55843 macro avg 0.53 0.67 0.55 55843 weighted avg 0.99 0.96 0.97 55843 Request-InformationWanted precision recall f1-score support Other Classes 0.99 0.92 0.96 55241 Request-InformationWanted 0.06 0.43 0.10 602 accuracy 0.92 55843 macro avg 0.53 0.68 0.53 55843 weighted avg 0.98 0.92 0.95 55843 Request-SearchAndRescue precision recall f1-score support Other Classes 1.00 0.98 0.99 55737 Request-SearchAndRescue 0.04 0.49 0.07 106 accuracy 0.97 55843 macro avg 0.52 0.73 0.53 55843 weighted avg 1.00 0.97 0.99 55843
1
# --------------------------------------------------
# TREC-IS 2021-A
# Information Type Categorization
# Per Information Type F1 Graph
# --------------------------------------------------
# Per Category Classification Performance
# F1 scores for each information type, graphed
# Does not average across events (larger events have more impact)
# Bar labels are the information type ids, in informationTypes2Index order.
categoryLabels = list(informationTypes2Index.keys())

# Positive-class F1 for each information type, echoed as it is computed.
scoresPerCategoryF1 = []
for categoryId in categoryLabels:
    localF1Score = f1_score(
        category2GroundTruth[categoryId],
        category2Predicted[categoryId],
        average='binary',
    )
    print(categoryId, localF1Score)
    scoresPerCategoryF1.append(localF1Score)

# One bar position per information type.
N = len(categoryLabels)
ind = np.arange(N)
width = 0.90  # the width of the bars: can also be len(x) sequence

p1 = plt.bar(ind, scoresPerCategoryF1, width)
plt.title('F1 Scores by Information Type')
plt.ylabel('F1 Scores')
plt.xticks(ind, categoryLabels, rotation='vertical')
plt.yticks(np.arange(0, 1, 0.1))
plt.show()
CallToAction-Donations 0.3620422898401238 CallToAction-MovePeople 0.3063810181985718 CallToAction-Volunteer 0.1287878787878788 Other-Advice 0.27147107438016527 Other-ContextualInformation 0.027243115190997928 Other-Discussion 0.03633314700950251 Other-Irrelevant 0.7304507195221286 Other-Sentiment 0.27492715063980744 Report-CleanUp 0.028903711304942705 Report-EmergingThreats 0.29268888075186134 Report-Factoid 0.49172081324669886 Report-FirstPartyObservation 0.09974667511082963 Report-Hashtags 0.2669488433189812 Report-Location 0.5106542271366044 Report-MultimediaShare 0.3442376603860825 Report-News 0.3314475873544093 Report-NewSubEvent 0.11747919143876337 Report-Official 0.19145490086567998 Report-OriginalEvent 0.17373634747269492 Report-ServiceAvailable 0.3461922181576322 Report-ThirdPartyObservation 0.2958276691414828 Report-Weather 0.519815418023887 Request-GoodsServices 0.1184528605962933 Request-InformationWanted 0.10239309533150255 Request-SearchAndRescue 0.06860158311345647
# --------------------------------------------------
# TREC-IS 2021-A
# Information Type Categorization
# Per Event Performance
# --------------------------------------------------
# Categorization performance for each event
# Precision, recall and F1 only consider the positive class
# Accuracy is an overall metric
# We report performance for all categories, high importance categories and low importance categories
# Macro average (categories have equal weight)
# Section banner for the per-event results file.
perEventFile.write("--------------------------------------------------"+"\n")
perEventFile.write("EVALUATON: Information Type Categorization (Multi-type)"+"\n")
perEventFile.write("Per Event Performance"+"\n")
perEventFile.write("--------------------------------------------------"+"\n")

for eventId in eventIdentifiers:
    # Per-event running totals over the categories that are scored.
    tavgPrecision = 0.0
    tavgRecall = 0.0
    tavgF1 = 0.0
    tavgAccuracy = 0.0
    categoryCount = 0

    for categoryId in informationTypes2Index.keys():
        eventTruth = event2groundtruth[eventId].get(categoryId)
        # Skip categories with no positive ground-truth label for this event.
        if sum(eventTruth) == 0:
            continue
        eventPredicted = event2prediction[eventId].get(categoryId)

        # zero_division=0 keeps the score sklearn already falls back to when
        # the run predicts no positives, without the UndefinedMetricWarning.
        categoryPrecision = precision_score(eventTruth, eventPredicted, average='binary', zero_division=0)
        categoryRecall = recall_score(eventTruth, eventPredicted, average='binary')
        categoryF1 = f1_score(eventTruth, eventPredicted, average='binary')
        categoryAccuracy = accuracy_score(eventTruth, eventPredicted)

        tavgPrecision = tavgPrecision + categoryPrecision
        tavgRecall = tavgRecall + categoryRecall
        tavgF1 = tavgF1 + categoryF1
        tavgAccuracy = tavgAccuracy + categoryAccuracy
        categoryCount += 1

    if categoryCount == 0:
        print("No categories for event:", eventId)
        continue

    # Macro averages over the categories actually scored. The totals above
    # only include those categories, so categoryCount is the matching
    # denominator; the same values go to the console and to the file.
    eventPrecision = tavgPrecision/categoryCount
    eventRecall = tavgRecall/categoryCount
    eventF1 = tavgF1/categoryCount
    eventAccuracy = tavgAccuracy/categoryCount

    print(eventId)
    print(" Information Type Precision (positive class, multi-type, macro): "+str(eventPrecision))
    print(" Information Type Recall (positive class, multi-type, macro): "+str(eventRecall))
    print(" Information Type F1 (positive class, multi-type, macro): "+str(eventF1))
    print(" Information Type Accuracy (overall, multi-type, macro): "+str(eventAccuracy))
    print("")

    perEventFile.write(eventId+"\n")
    perEventFile.write(" Information Type Precision (positive class, multi-type, macro): "+str(eventPrecision)+"\n")
    perEventFile.write(" Information Type Recall (positive class, multi-type, macro): "+str(eventRecall)+"\n")
    perEventFile.write(" Information Type F1 (positive class, multi-type, macro): "+str(eventF1)+"\n")
    perEventFile.write(" Information Type Accuracy (overall, multi-type, macro): "+str(eventAccuracy)+"\n")
    perEventFile.write("\n")

# Blank line terminating the section.
perEventFile.write("\n")
2020_01_27_houston_explosion.2020 Information Type Precision (positive class, multi-type, macro): 0.17087550157900305 Information Type Recall (positive class, multi-type, macro): 0.46878749672493863 Information Type F1 (positive class, multi-type, macro): 0.2133156212400501 Information Type Accuracy (overall, multi-type, macro): 0.8485027611417904 2020_02_10_mideast_tornadoes.day1_mississipi.2020 Information Type Precision (positive class, multi-type, macro): 0.3841664427918161 Information Type Recall (positive class, multi-type, macro): 0.617272080382134 Information Type F1 (positive class, multi-type, macro): 0.4121225378184968 Information Type Accuracy (overall, multi-type, macro): 0.7091097308488613
/Users/cbuntain/Development/anaconda3/lib/python3.8/site-packages/sklearn/metrics/_classification.py:1245: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 due to no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result))
2020_02_10_mideast_tornadoes.day2_al.2020 Information Type Precision (positive class, multi-type, macro): 0.2124431276170895 Information Type Recall (positive class, multi-type, macro): 0.5355697086643229 Information Type F1 (positive class, multi-type, macro): 0.26385366038592495 Information Type Accuracy (overall, multi-type, macro): 0.8607381492687846 2020_02_10_mideast_tornadoes.day3_md.2019 Information Type Precision (positive class, multi-type, macro): 0.13035178508785672 Information Type Recall (positive class, multi-type, macro): 0.6683293138271014 Information Type F1 (positive class, multi-type, macro): 0.17081506589760942 Information Type Accuracy (overall, multi-type, macro): 0.8149431818181818 2020_05_06_tn_derecho.2020 Information Type Precision (positive class, multi-type, macro): 0.22328019510892208 Information Type Recall (positive class, multi-type, macro): 0.5057326419844612 Information Type F1 (positive class, multi-type, macro): 0.25845227677040944 Information Type Accuracy (overall, multi-type, macro): 0.8469584245076587 brooklynblockparty_shooting.2019 Information Type Precision (positive class, multi-type, macro): 0.21104361101684735 Information Type Recall (positive class, multi-type, macro): 0.42316478342334707 Information Type F1 (positive class, multi-type, macro): 0.2165237184907205 Information Type Accuracy (overall, multi-type, macro): 0.9090041361756284
/Users/cbuntain/Development/anaconda3/lib/python3.8/site-packages/sklearn/metrics/_classification.py:1245: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 due to no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result))
2016_puttingal_temple Information Type Precision (positive class, multi-type, macro): 0.1648629016679452 Information Type Recall (positive class, multi-type, macro): 0.32077840313908146 Information Type F1 (positive class, multi-type, macro): 0.17599446535043303 Information Type Accuracy (overall, multi-type, macro): 0.8637778100916098 2017_12_04_thomas_wildfire.2017 Information Type Precision (positive class, multi-type, macro): 0.2142825559478267 Information Type Recall (positive class, multi-type, macro): 0.49101063121936206 Information Type F1 (positive class, multi-type, macro): 0.2664328926428016 Information Type Accuracy (overall, multi-type, macro): 0.7911096389532958 2017_12_07_lilac_wildfire.2017 Information Type Precision (positive class, multi-type, macro): 0.22968116279426837 Information Type Recall (positive class, multi-type, macro): 0.5106167943287692 Information Type F1 (positive class, multi-type, macro): 0.2773451422242404 Information Type Accuracy (overall, multi-type, macro): 0.8079723899913718 2018_07_23_klamathon_wildfire.2018 Information Type Precision (positive class, multi-type, macro): 0.26766106246131244 Information Type Recall (positive class, multi-type, macro): 0.5082315563647082 Information Type F1 (positive class, multi-type, macro): 0.2953607799708363 Information Type Accuracy (overall, multi-type, macro): 0.8045812848471532 2018_08_05_holy_wildfire.2018 Information Type Precision (positive class, multi-type, macro): 0.16333947181521682 Information Type Recall (positive class, multi-type, macro): 0.6387355674914105 Information Type F1 (positive class, multi-type, macro): 0.2040389476619656 Information Type Accuracy (overall, multi-type, macro): 0.9143421664342165 2018_11_07_Woolsey_wildfire.2018 Information Type Precision (positive class, multi-type, macro): 0.14997437526837767 Information Type Recall (positive class, multi-type, macro): 0.43843167705392005 Information Type F1 (positive class, multi-type, macro): 
0.18310243084696703 Information Type Accuracy (overall, multi-type, macro): 0.8113168511430641 2018_maryland_flood Information Type Precision (positive class, multi-type, macro): 0.2373032633421701 Information Type Recall (positive class, multi-type, macro): 0.5422747940206171 Information Type F1 (positive class, multi-type, macro): 0.262179401117709 Information Type Accuracy (overall, multi-type, macro): 0.8064468321600592 2018_pittsburgh_synagogue_shooting Information Type Precision (positive class, multi-type, macro): 0.36244294013901857 Information Type Recall (positive class, multi-type, macro): 0.42030651340996167 Information Type F1 (positive class, multi-type, macro): 0.3692284666036876 Information Type Accuracy (overall, multi-type, macro): 0.7542735042735044 2019_03_01_alberta_wildfire.2019.v2 Information Type Precision (positive class, multi-type, macro): 0.09543739016762737 Information Type Recall (positive class, multi-type, macro): 0.465186263325597 Information Type F1 (positive class, multi-type, macro): 0.07830051051177364 Information Type Accuracy (overall, multi-type, macro): 0.834233810977997
/Users/cbuntain/Development/anaconda3/lib/python3.8/site-packages/sklearn/metrics/_classification.py:1245: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 due to no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result)) /Users/cbuntain/Development/anaconda3/lib/python3.8/site-packages/sklearn/metrics/_classification.py:1245: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 due to no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result))
2019_08_25_hurricane_dorian.2019 Information Type Precision (positive class, multi-type, macro): 0.17686498410344328 Information Type Recall (positive class, multi-type, macro): 0.46117097501206034 Information Type F1 (positive class, multi-type, macro): 0.19742631714561665 Information Type Accuracy (overall, multi-type, macro): 0.7831648715824361 2019_10_10_saddleridge_wildfire.2019 Information Type Precision (positive class, multi-type, macro): 0.24756125699426482 Information Type Recall (positive class, multi-type, macro): 0.5770035825357456 Information Type F1 (positive class, multi-type, macro): 0.2771912999875405 Information Type Accuracy (overall, multi-type, macro): 0.86521217808362 2019_10_25_kincade_wildfire.2019 Information Type Precision (positive class, multi-type, macro): 0.21151709779177946 Information Type Recall (positive class, multi-type, macro): 0.5622836913273612 Information Type F1 (positive class, multi-type, macro): 0.2634782608271622 Information Type Accuracy (overall, multi-type, macro): 0.8355860554488059 2019_durham_gas_explosion Information Type Precision (positive class, multi-type, macro): 0.22687807218142872 Information Type Recall (positive class, multi-type, macro): 0.4673077197690041 Information Type F1 (positive class, multi-type, macro): 0.2656073318349859 Information Type Accuracy (overall, multi-type, macro): 0.8395776566757491 2019_saugus_high_school_shooting Information Type Precision (positive class, multi-type, macro): 0.2182932122142716 Information Type Recall (positive class, multi-type, macro): 0.23034182842435993 Information Type F1 (positive class, multi-type, macro): 0.19072532844146953 Information Type Accuracy (overall, multi-type, macro): 0.8992222129296649 2019_townsville_flood Information Type Precision (positive class, multi-type, macro): 0.2297671507548774 Information Type Recall (positive class, multi-type, macro): 0.551911862262854 Information Type F1 (positive class, multi-type, macro): 0.2678095728416979 
Information Type Accuracy (overall, multi-type, macro): 0.7838111478205266 2020_easter_tornado_outbreak Information Type Precision (positive class, multi-type, macro): 0.14593035703644255 Information Type Recall (positive class, multi-type, macro): 0.5668682728409737 Information Type F1 (positive class, multi-type, macro): 0.17415554387368212 Information Type Accuracy (overall, multi-type, macro): 0.7972553973357833 2020_tornado_outbreak_of_april Information Type Precision (positive class, multi-type, macro): 0.2101013247748719 Information Type Recall (positive class, multi-type, macro): 0.5188508659075042 Information Type F1 (positive class, multi-type, macro): 0.24615600845455557 Information Type Accuracy (overall, multi-type, macro): 0.809424144986846 2020_tornado_outbreak_of_march Information Type Precision (positive class, multi-type, macro): 0.17140154086538936 Information Type Recall (positive class, multi-type, macro): 0.6018108816420129 Information Type F1 (positive class, multi-type, macro): 0.21637268023200062 Information Type Accuracy (overall, multi-type, macro): 0.7965002382591201 2020_visakhapatnam_gas_leak Information Type Precision (positive class, multi-type, macro): 0.2426888137748399 Information Type Recall (positive class, multi-type, macro): 0.25247231341418735 Information Type F1 (positive class, multi-type, macro): 0.16773433121987766 Information Type Accuracy (overall, multi-type, macro): 0.8147957568081062 tornado_outbreak_of_november_30_december_2018 Information Type Precision (positive class, multi-type, macro): 0.21481443662908753 Information Type Recall (positive class, multi-type, macro): 0.6002203568019902 Information Type F1 (positive class, multi-type, macro): 0.2595363885974751 Information Type Accuracy (overall, multi-type, macro): 0.8713050811722539
1
# --------------------------------------------------
# TREC-IS 2021-A
# Information Type Categorization
# Per Event F1 Graph
# --------------------------------------------------
# Multi-type (1 vs All): Tweets have multiple information types, aim: predict all of them
# Macro average (categories have equal weight)
# One bar per event, in eventIdentifiers order.
N = len(eventIdentifiers)
ind = np.arange(N)

scoresPerEventF1 = []
for eventId in eventIdentifiers:
    avgF1_ = 0.0
    eventCategoryCount = 0
    for categoryId in informationTypes2Index.keys():
        eventTruth = event2groundtruth[eventId].get(categoryId)
        # Only score categories with at least one positive ground-truth
        # label in this event. For the others F1 is undefined (sklearn
        # substitutes 0.0 and warns), which would drag the event's macro
        # average down.
        if sum(eventTruth) == 0:
            continue
        avgF1_ = avgF1_ + f1_score(eventTruth, event2prediction[eventId].get(categoryId), average='binary')
        eventCategoryCount += 1
    # Average over the categories actually scored. An event with none still
    # needs a bar so the scores stay aligned with the x tick labels.
    scoresPerEventF1.append(avgF1_/eventCategoryCount if eventCategoryCount > 0 else 0.0)

width = 0.90  # the width of the bars: can also be len(x) sequence
p1 = plt.bar(ind, scoresPerEventF1, width)
plt.ylabel('F1 Scores')
plt.title('F1 Category Scores by Event')
plt.xticks(ind, eventIdentifiers, rotation='vertical')
plt.yticks(np.arange(0, 1, 0.1))
plt.show()
/Users/cbuntain/Development/anaconda3/lib/python3.8/site-packages/sklearn/metrics/_classification.py:1492: UndefinedMetricWarning: F-score is ill-defined and being set to 0.0 due to no true nor predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf( /Users/cbuntain/Development/anaconda3/lib/python3.8/site-packages/sklearn/metrics/_classification.py:1492: UndefinedMetricWarning: F-score is ill-defined and being set to 0.0 due to no true nor predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf( /Users/cbuntain/Development/anaconda3/lib/python3.8/site-packages/sklearn/metrics/_classification.py:1492: UndefinedMetricWarning: F-score is ill-defined and being set to 0.0 due to no true nor predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf( /Users/cbuntain/Development/anaconda3/lib/python3.8/site-packages/sklearn/metrics/_classification.py:1492: UndefinedMetricWarning: F-score is ill-defined and being set to 0.0 due to no true nor predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf( /Users/cbuntain/Development/anaconda3/lib/python3.8/site-packages/sklearn/metrics/_classification.py:1492: UndefinedMetricWarning: F-score is ill-defined and being set to 0.0 due to no true nor predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf( /Users/cbuntain/Development/anaconda3/lib/python3.8/site-packages/sklearn/metrics/_classification.py:1492: UndefinedMetricWarning: F-score is ill-defined and being set to 0.0 due to no true nor predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf( /Users/cbuntain/Development/anaconda3/lib/python3.8/site-packages/sklearn/metrics/_classification.py:1492: UndefinedMetricWarning: F-score is ill-defined and being set to 0.0 due to no true nor predicted samples. Use `zero_division` parameter to control this behavior. 
_warn_prf( /Users/cbuntain/Development/anaconda3/lib/python3.8/site-packages/sklearn/metrics/_classification.py:1492: UndefinedMetricWarning: F-score is ill-defined and being set to 0.0 due to no true nor predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf( /Users/cbuntain/Development/anaconda3/lib/python3.8/site-packages/sklearn/metrics/_classification.py:1492: UndefinedMetricWarning: F-score is ill-defined and being set to 0.0 due to no true nor predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf( /Users/cbuntain/Development/anaconda3/lib/python3.8/site-packages/sklearn/metrics/_classification.py:1492: UndefinedMetricWarning: F-score is ill-defined and being set to 0.0 due to no true nor predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf( /Users/cbuntain/Development/anaconda3/lib/python3.8/site-packages/sklearn/metrics/_classification.py:1492: UndefinedMetricWarning: F-score is ill-defined and being set to 0.0 due to no true nor predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf( /Users/cbuntain/Development/anaconda3/lib/python3.8/site-packages/sklearn/metrics/_classification.py:1492: UndefinedMetricWarning: F-score is ill-defined and being set to 0.0 due to no true nor predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf( /Users/cbuntain/Development/anaconda3/lib/python3.8/site-packages/sklearn/metrics/_classification.py:1492: UndefinedMetricWarning: F-score is ill-defined and being set to 0.0 due to no true nor predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf( /Users/cbuntain/Development/anaconda3/lib/python3.8/site-packages/sklearn/metrics/_classification.py:1492: UndefinedMetricWarning: F-score is ill-defined and being set to 0.0 due to no true nor predicted samples. Use `zero_division` parameter to control this behavior. 
_warn_prf( /Users/cbuntain/Development/anaconda3/lib/python3.8/site-packages/sklearn/metrics/_classification.py:1492: UndefinedMetricWarning: F-score is ill-defined and being set to 0.0 due to no true nor predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf( /Users/cbuntain/Development/anaconda3/lib/python3.8/site-packages/sklearn/metrics/_classification.py:1492: UndefinedMetricWarning: F-score is ill-defined and being set to 0.0 due to no true nor predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(
# --------------------------------------------------
# TREC-IS 2021b
# Information Priority Level
# Overall Performance
# --------------------------------------------------
# Agreement between the run's priority labels and the human priority labels.
# A macro-averaged F1 is computed for every information type, and the
# per-type scores are then averaged with equal weight (higher is better).
from sklearn.metrics import mean_squared_error

# Running sums of the per-type F1 scores (overall / high-importance / other).
# They are divided by the number of information types when reported.
priorityAvgf1 = 0.0
priorityAvgf1High = 0.0
priorityAvgf1Low = 0.0

for categoryId in informationTypes2Index.keys():
    groundTruthPriorities = category2GroundTruthPriority[categoryId]
    predictedPriorities = category2PredictedPriority[categoryId]
    f1 = f1_score(groundTruthPriorities, predictedPriorities, average='macro')

    priorityAvgf1 += f1

    # Substring test of the category id against each high-importance name.
    isHighImportance = any(categoryId in s for s in highImportCategories)
    if isHighImportance:
        priorityAvgf1High += f1
    else:
        priorityAvgf1Low += f1

# Average over all information types, rendered once for console and file.
priorityF1MacroText = str(priorityAvgf1/len(informationTypes2Index))
print("Priority Label Prediction (F1, macro): "+priorityF1MacroText)

priorityReportRule = "--------------------------------------------------"
for priorityReportLine in (
    priorityReportRule,
    "EVALUATON: Information Priority Level",
    "Overall Performance",
    priorityReportRule,
    "> Priority Label Prediction (F1, macro): "+priorityF1MacroText,
    "",
):
    resultsFile.write(priorityReportLine+"\n")
Priority Label Prediction (F1, macro): 0.17120951398373774
1
# --------------------------------------------------
# TREC-IS 2021b
# Information Priority Score
# Correlational Performance
# --------------------------------------------------
# How closely do the system's priority scores track the human priority labels?
# Pearson correlation is used so that parallel increases are rewarded; the
# per-information-type correlations are summed and then averaged.
priorityAvgCorr = 0.0
priorityAvgCorrHigh = 0.0
priorityAvgCorrLow = 0.0

for categoryId in informationTypes2Index.keys():
    # "Other-Irrelevant" is left out of the correlation analysis, which is
    # why the overall and low-importance averages below divide by (count - 1).
    if categoryId == "Other-Irrelevant":
        continue

    # Human labels are mapped to numeric scores through priorityScoreMap.
    groundTruthPriorities = [priorityScoreMap[x] for x in category2GroundTruthPriority[categoryId]]
    predictedPriorities = category2PredictedPriorityScore[categoryId]

    # Pearson's r is undefined (NaN) when either series is constant or has
    # fewer than two points; such categories contribute 0.0 instead.
    # The value range (max - min) is tested rather than the mean of the
    # deviations from the mean: the latter is zero up to rounding for ANY
    # data, so it can discard genuinely varying predictions and can let
    # constant ones through.
    hasVariation = (
        len(predictedPriorities) > 1
        and np.ptp(predictedPriorities) > 0.0
        and np.ptp(groundTruthPriorities) > 0.0
    )

    this_corr = 0.0
    if hasVariation:
        this_corr = np.corrcoef(groundTruthPriorities, predictedPriorities)[0,1]

    priorityAvgCorr = priorityAvgCorr + this_corr

    # Substring test of the category id against each high-importance name.
    if any(categoryId in s for s in highImportCategories):
        priorityAvgCorrHigh = priorityAvgCorrHigh + this_corr
    else:
        priorityAvgCorrLow = priorityAvgCorrLow + this_corr

# Averages over the categories that were actually scored.
# NOTE(review): the low-importance denominator assumes "Other-Irrelevant" is
# counted in numLowInformationTypes - confirm against where it is defined.
priorityCorrAllText = str(priorityAvgCorr/(len(informationTypes2Index)-1))
priorityCorrHighText = str(priorityAvgCorrHigh/numHighInformationTypes)
priorityCorrLowText = str(priorityAvgCorrLow/(numLowInformationTypes-1))

print("Priority Score Prediction (Pearson): "+priorityCorrAllText)
print("Priority Score Prediction, High (Pearson): "+priorityCorrHighText)
print("Priority Score Prediction, Low (Pearson): "+priorityCorrLowText)

resultsFile.write("--------------------------------------------------"+"\n")
resultsFile.write("EVALUATON: Information Priority Score"+"\n")
resultsFile.write("Correlational Performance"+"\n")
resultsFile.write("--------------------------------------------------"+"\n")
resultsFile.write("> Priority Correlation (Pearson): "+priorityCorrAllText+"\n")
resultsFile.write("> Priority Correlation, High (Pearson): "+priorityCorrHighText+"\n")
resultsFile.write("> Priority Correlation, Low (Pearson): "+priorityCorrLowText+"\n")
resultsFile.write("\n")
Priority Score Prediction (Pearson): 0.1475708306012338 Priority Score Prediction, High (Pearson): 0.10963466884896583 Priority Score Prediction, Low (Pearson): 0.16021621785198978
1
# --------------------------------------------------
# TREC-IS 2021b
# Information Priority Level
# Per Information Type Performance
# --------------------------------------------------
# Bar chart of the macro-averaged priority-label F1 for each information
# type (higher is better); every information type gets one bar.
N = len(informationTypes2Index)
ind = np.arange(N)

categoryLabels = []
priorityCatF1Values = []

for categoryId in informationTypes2Index.keys():
    groundTruthPriorities = category2GroundTruthPriority[categoryId]
    predictedPriorities = category2PredictedPriority[categoryId]

    priorityCatF1 = f1_score(groundTruthPriorities, predictedPriorities, average='macro')
    # A NaN score is plotted as zero.
    priorityCatF1 = 0.0 if math.isnan(priorityCatF1) else priorityCatF1

    categoryLabels.append(categoryId)
    priorityCatF1Values.append(priorityCatF1)

width = 0.90  # bar width
p1 = plt.bar(ind, priorityCatF1Values, width=width)

plt.title('Priorty Label Prediction F1 Per Information Type')
plt.ylabel('Priorty Label Prediction F1 (higher is better)')
plt.xticks(ind, categoryLabels, rotation='vertical')
plt.yticks(np.arange(0, 1, 0.1))
plt.show()
# Print the evaluation table row in LaTeX and append it to the results file.
# NOTE(review): the "PErr-H"/"PErr-A" header labels are kept as-is, but the
# values placed under them are the priority-label F1 averages, not an error.
print("Run & NDCG & CF1-H & CF1-A & CAcc & PErr-H & PErr-A & PCorr-H & PCorr-A \\\\")

# One value per table column, in header order (after the run name).
latexRowValues = [
    system_ndcg_micro,
    avgF1High/numHighInformationTypes,
    avgF1/numInformationTypes,
    avgAccuracy/numInformationTypes,
    priorityAvgf1High/numHighInformationTypes,
    priorityAvgf1/len(informationTypes2Index),
    priorityAvgCorrHigh/numHighInformationTypes,
    # The correlation sum skips "Other-Irrelevant", so it is averaged over
    # one category fewer - the same denominator used when the overall
    # Pearson score is printed and written to the results file.
    priorityAvgCorr/(len(informationTypes2Index)-1),
]

# Four decimal places per cell, " & " between cells, LaTeX row terminator.
resultLine = " & ".join('{0:.4f}'.format(value) for value in latexRowValues) + " \\\\"

print(runName+" & "+resultLine)

resultsFile.write("--------------------------------------------------"+"\n")
resultsFile.write("LATEX"+"\n")
resultsFile.write("--------------------------------------------------"+"\n")
resultsFile.write(runName+" & "+resultLine + "\n")
Run & NDCG & CF1-H & CF1-A & CAcc & PErr-H & PErr-A & PCorr-H & PCorr-A \\ STrans-GaussianNB & 0.3368 & 0.2083 & 0.2575 & 0.8474 & 0.1959 & 0.1712 & 0.1096 & 0.1417 \\
93
# Done: close the results, per-topic and per-event output files, in that order.
for outputHandle in (resultsFile, perTopicFile, perEventFile):
    outputHandle.close()
# Leaderboard export: when a metadata.json file sits in the working directory,
# write a single-row CSV describing this run. No header row is written; the
# columns of the row are, in order:
# header = [
#     "Run",
#     "date",
#     "team",
#     "description",
#     "paper",
#     "code",
#     "nDCG@100",
#     "Info-Type F1 [Actionable]",
#     "Info-Type F1 [All]",
#     "Info-Type Accuracy",
#     "Priority F1 [Actionable]",
#     "Priority F1 [All]",
#     "Priority R [Actionable]",
#     "Priority R [All]",
# ]
import csv

if os.path.isfile("metadata.json"):

    # The submission date is read from the working-directory path: the text
    # between "submissions/" and the next "-" is sliced as YYYY, MM, DD.
    # NOTE(review): if the path has no "submissions/" component the date
    # comes out as "//" - confirm that is acceptable for such runs.
    this_cwd = os.getcwd()
    sub_date_ = this_cwd.partition("submissions/")[-1].partition("-")[0]
    sub_date = "%s/%s/%s" % (sub_date_[:4], sub_date_[4:6], sub_date_[6:])

    # Read as UTF-8 explicitly rather than relying on the platform default.
    with open("metadata.json", "r", encoding="utf-8") as in_file:
        metadata = json.load(in_file)

    leaderboard_entry = [
        runName,
        sub_date,
        metadata["organization"].lower(),
        metadata["model_description"],
        # Paper/code entries are only kept when they look like links.
        metadata["paper"] if metadata["paper"].startswith("http") else "",
        metadata["code"] if metadata["code"].startswith("http") else "",
        str.format('{0:.4f}',system_ndcg_micro),
        str.format('{0:.4f}',avgF1High/numHighInformationTypes),
        str.format('{0:.4f}',avgF1/numInformationTypes),
        str.format('{0:.4f}',avgAccuracy/numInformationTypes),
        str.format('{0:.4f}',priorityAvgf1High/numHighInformationTypes),
        str.format('{0:.4f}',priorityAvgf1/len(informationTypes2Index)),
        str.format('{0:.4f}',priorityAvgCorrHigh/numHighInformationTypes),
        # The correlation sum skips "Other-Irrelevant", so it is averaged
        # over one category fewer - the same denominator used when the
        # overall Pearson score is printed and written to the results file.
        str.format('{0:.4f}',priorityAvgCorr/(len(informationTypes2Index)-1)),
    ]

    leaderboardPath = runName+".v"+str(version)+"."+edition+".leaderboard.csv"
    # newline="" lets the csv module control line endings (avoids blank rows
    # on Windows); UTF-8 keeps non-ASCII descriptions writable everywhere.
    with open(leaderboardPath, "w", newline="", encoding="utf-8") as csvResultsFile:
        leader_writer = csv.writer(csvResultsFile)
        leader_writer.writerow(leaderboard_entry)