# --------------------------------------------------
# TREC IS 2021b Evaluation Script
# Configured for 2021-B Events
# Used to evaluate TREC-IS runs
# --------------------------------------------------
version = 3.0 # Notebook Version Number; embedded in the results file names and headers
edition = "2021b.all" # Edition label; also embedded in the results file names and headers

import os
cwd = os.getcwd() # working directory at the time this cell runs


# Configuration Information

# Do we try and normalize the run priority scores?
# NOTE: this flag is reassigned to False further down in this file, before
#       the point where it is read, so the value set here is not the one used.
enablePriorityNorm = True

# Category score normalisation flag and default score threshold.
# NOTE(review): neither name is read in the visible part of this file;
#               presumably used by later stages -- confirm.
enableCategoryNorm = True
defaultScoreThreshold = 0.5

# Full list of information type identifiers ("<Group>-<Name>").
# The ORDER of this list is significant: positions in a run's
# "info_type_labels" array are mapped back to identifiers by indexing into
# this list, and the ontology indices are taken from it via .index().
taskCategories = [
    "CallToAction-Donations",
    "CallToAction-MovePeople",
    "CallToAction-Volunteer",
    "Other-Advice",
    "Other-ContextualInformation",
    "Other-Discussion",
    "Other-Irrelevant",
    "Other-Sentiment",
    "Report-CleanUp",
    "Report-EmergingThreats",
    "Report-Factoid",
    "Report-FirstPartyObservation",
    "Report-Hashtags",
    "Report-Location",
    "Report-MultimediaShare",
    "Report-News",
    "Report-NewSubEvent",
    "Report-Official",
    "Report-OriginalEvent",
    "Report-ServiceAvailable",
    "Report-ThirdPartyObservation",
    "Report-Weather",
    "Request-GoodsServices",
    "Request-InformationWanted",
    "Request-SearchAndRescue",
]

# Information types we treat as highly important; every other type is
# handled as low importance by the stages below.
highImportCategories = [
    "Request-GoodsServices",
    "Request-SearchAndRescue",
    "CallToAction-MovePeople",
    "Report-EmergingThreats",
    "Report-NewSubEvent",
    "Report-ServiceAvailable",
]

# Short forms of the same types (the part after the "-"), in the same order.
highImportCategoriesShort = [
    fullName.split("-")[1] for fullName in highImportCategories
]

# Priority map: priority label -> numeric score.
# "Low" and "Unknown" share the score 0.25, so this mapping cannot be
# inverted unambiguously (a score of 0.25 does not identify a single label).
# The "High", "Medium" and "Low" scores are also used as the cut-offs when a
# run's numeric priority is converted into a label.
priorityScoreMap = {
    "Critical": 1.0,
    "High": 0.75,
    "Medium": 0.5,
    "Low": 0.25,
    "Unknown": 0.25,
}

# Parameters
var_lambda = 0.75 # weight to place on actionable information categories in comparison to non-actionable categories
var_alpha = 0.3 # Flat gain for providing a correct alert, regardless of the categories selected


# Events with no data, so we should skip them.
# (Original note: updated from 2021a and 2021b, so we use *all* data.)
# Topics, ground truth events and run predictions whose event id appears
# here are ignored by the stages below.
skipEvents = [
    # 2020 events
    "2020_05_26_edenville_dam_failure.2020.corrected",
    "2018_10_07_hurricane_michael.2018",
    "2020_01_28_bar_shooting_nc.2020",
    "T2020_02_03_texas_university_shooting.2020",
    "2020_02_07_rutherford_tn_floods.2020",
    # catch-all / miscellaneous
    "UNASSIGNED",
    "indonesia_earthquake.2019",
    # remaining events, in date order
    "2015_09_28_hurricane_joaquin.2015",
    "2017_03_23_cyclone_debbie.2017",
    "2018_02_24_anticyclone_hartmut.2018",
    "2018_07_13_ferguson_wildfire.2018",
    "2018_07_23_cranston_wildfire.2018",
    "2018_09_07_hurricane_florence.2018",
    "2019_09_17_tropicalstorm_imelda.2019",
    "2019_karnataka_floods",
    "2019_spring_floods_in_ontario_quebec_and_new_brunswick",
    "2020_08_27_hurricane_laura.2020",
    "2020_09_11_hurricane_sally.2020",
    "2020_afghanistan_flood",
    "2020_hpakant_jade_mine_disaster",
    "2020_kerala_floods",
]


import glob

# Locate the run file to evaluate: the last "*.gz" match in the working
# directory (glob order is not guaranteed when several files match).
runFile = None
for f in glob.glob("*.gz"):
    runFile = f

# Report the selected file itself rather than the loop variable, which is
# undefined (NameError) when the directory contains no "*.gz" file.
if runFile is None:
    print("WARN: no *.gz run file found in the working directory")
print("Run File:", runFile)

Run File: run.json.gz


import gzip
import json


# The run tag is taken from the first JSON record of the run file; it stays
# None when the file has no lines at all.
runName = None

with gzip.open(runFile, "r") as inRunFile:
    firstRecord = inRunFile.readline()
    if firstRecord:
        runName = json.loads(firstRecord.decode("utf8"))["runtag"]

print("Run Name:", runName)

Run Name: STrans-GaussianNB


# Do we try and normalize the run priority scores?
# This assignment replaces the value given in the configuration section at
# the top of the file; normalisation is therefore off for this evaluation.
enablePriorityNorm = False

# Directory holding the topics, label and ontology files (relative path)
dataDir = "../../data/2021b"

# The location of the topics file
# NOTE(review): the file is named "2021a.topics" although it lives in the
#               2021b data directory -- presumably intentional; confirm.
topicsFile = "%s/2021a.topics" % dataDir

# The location of the ground truth data against which to compare the run.
# Only the combined "2021.all" label file is active; the earlier label files
# are kept below as commented-out alternatives.
classificationLabelFiles = [
#     "%s/TRECIS-2021A-crisis.labels.prelim.json" % dataDir,
#     "%s/TRECIS-2021A-crisis.labels.prelim.pt2.json" % dataDir,
#     "%s/TRECIS-crisis.labels.2021b.json" % dataDir,
    "%s/TRECIS-crisis.labels.2021.all.json" % dataDir,
]

# The location of the ontology file
ontologyFile = "%s/TRECIS-2021A-ITypes.json" % dataDir


# Parse the topics file into (dataset, topic number) pairs. A pair is emitted
# at each closing </top> tag using the most recently seen <num> and <dataset>
# values; topics whose dataset is listed in skipEvents are left out.
topicArray = []

with open(topicsFile, "r") as inTopicsFile:

    topicNum = None
    topicDataset = None

    for rawLine in inTopicsFile:
        line = rawLine.strip()

        if line.startswith("<num>"):
            topicNum = line.partition("<num>")[2].partition("</num>")[0]
        elif line.startswith("<dataset>"):
            topicDataset = line.partition("<dataset>")[2].partition("</dataset>")[0]
        elif line == "</top>" and topicDataset not in skipEvents:
            topicArray.append((topicDataset, topicNum))

for row in topicArray:
    print(row)

('2020_01_27_houston_explosion.2020', 'TRECIS-CTIT-H-076')
('2020_02_10_mideast_tornadoes.day1_mississipi.2020', 'TRECIS-CTIT-H-080')
('2020_02_10_mideast_tornadoes.day2_al.2020', 'TRECIS-CTIT-H-081')
('2020_02_10_mideast_tornadoes.day3_md.2019', 'TRECIS-CTIT-H-082')
('2020_05_06_tn_derecho.2020', 'TRECIS-CTIT-H-083')
('brooklynblockparty_shooting.2019', 'TRECIS-CTIT-H-085')
('2016_puttingal_temple', 'TRECIS-CTIT-H-089')
('2017_12_04_thomas_wildfire.2017', 'TRECIS-CTIT-H-091')
('2017_12_07_lilac_wildfire.2017', 'TRECIS-CTIT-H-092')
('2018_07_23_klamathon_wildfire.2018', 'TRECIS-CTIT-H-096')
('2018_08_05_holy_wildfire.2018', 'TRECIS-CTIT-H-097')
('2018_11_07_Woolsey_wildfire.2018', 'TRECIS-CTIT-H-100')
('2018_maryland_flood', 'TRECIS-CTIT-H-101')
('2018_pittsburgh_synagogue_shooting', 'TRECIS-CTIT-H-102')
('2019_03_01_alberta_wildfire.2019.v2', 'TRECIS-CTIT-H-103')
('2019_08_25_hurricane_dorian.2019', 'TRECIS-CTIT-H-104')
('2019_10_10_saddleridge_wildfire.2019', 'TRECIS-CTIT-H-106')
('2019_10_25_kincade_wildfire.2019', 'TRECIS-CTIT-H-107')
('2019_durham_gas_explosion', 'TRECIS-CTIT-H-108')
('2019_saugus_high_school_shooting', 'TRECIS-CTIT-H-110')
('2019_townsville_flood', 'TRECIS-CTIT-H-112')
('2020_easter_tornado_outbreak', 'TRECIS-CTIT-H-116')
('2020_tornado_outbreak_of_april', 'TRECIS-CTIT-H-119')
('2020_tornado_outbreak_of_march', 'TRECIS-CTIT-H-120')
('2020_visakhapatnam_gas_leak', 'TRECIS-CTIT-H-121')
('tornado_outbreak_of_november_30_december_2018', 'TRECIS-CTIT-H-122')


# --------------------------------------------------
# Static data for the 2021 edition
# --------------------------------------------------
# Identifiers for the test events
# event id -> topic id, plus the list of event ids under evaluation
eventidTopicidMap = dict(topicArray)
eventIdentifiers = list(eventidTopicidMap)

# All three report files share a name prefix and a three-line header.
_reportPrefix = runName+".results.v"+str(version)+"."+edition
_reportHeaderLines = (
    "TREC-IS "+edition+" Notebook Evaluator v"+str(version),
    "Run: "+runName+" ("+runFile+")",
    "",
)

resultsFile = open(_reportPrefix+".overall.txt","w+")
perTopicFile = open(_reportPrefix+".pertopic.txt","w+")
perEventFile = open(_reportPrefix+".perevent.txt","w+")

for _reportFile in (resultsFile, perTopicFile, perEventFile):
    for _headerLine in _reportHeaderLines:
        _reportFile.write(_headerLine+"\n")

1


# --------------------------------------------------
# Processing Starts Here
# --------------------------------------------------
import json
import gzip
import math
import numpy as np
from pprint import pprint
import matplotlib.pyplot as plt

# --------------------------------------------------
# Stage 1: Load the ground truth dataset 
# --------------------------------------------------

# One parsed JSON document per ground truth label file, in file order
groundtruthJSON = []
for groundtruthFile in classificationLabelFiles:
    print("Reading "+groundtruthFile)
    with open(groundtruthFile, encoding='iso-8859-1') as groundtruthJSONFile:
        groundtruthDocument = json.loads(groundtruthJSONFile.read())
    groundtruthJSON.append(groundtruthDocument)

# --------------------------------------------------
# Stage 2: Load run file 
# --------------------------------------------------
# Every line of the gzipped run file is one JSON record; each line is decoded
# as UTF-8 and parsed, preserving file order.
runContents = []
with gzip.open(runFile, "r") as openRunFile:
    for rawRecord in openRunFile:
        runContents.append(json.loads(rawRecord.decode("utf8")))

Reading ../../data/2021b/TRECIS-crisis.labels.2021.all.json


# --------------------------------------------------
# Stage 3: Load the categories 
# --------------------------------------------------
with open(ontologyFile, encoding='utf-8') as ontologyJSONFile:
    ontologyJSON = json.load(ontologyJSONFile)

# Both maps point at the position of the type within taskCategories:
#   informationTypes2Index      keyed by the full id   (e.g. Report-EmergingThreats)
#   informationTypesShort2Index keyed by the short id  (e.g. EmergingThreats)
# An ontology id missing from taskCategories raises ValueError.
informationTypes2Index = {}
informationTypesShort2Index = {}

ontologyTypeIds = [entry["id"] for entry in ontologyJSON["informationTypes"]]
for informationTypeId in ontologyTypeIds:
    informationTypeIndex = taskCategories.index(informationTypeId)
    shortTypeId = informationTypeId.split("-")[1]

    informationTypes2Index[informationTypeId] = informationTypeIndex
    informationTypesShort2Index[shortTypeId] = informationTypeIndex


# -----------------------------------------------------------
# Stage 4: Produce ground truth maps between tweetIds and categories
# -----------------------------------------------------------
# Notes: Ground truth is used as a base, if a run includes tweets
#        not in the ground truth they will be ignored
# Assumptions: A tweet will not be returned for multiple events

tweetId2TRECInfoCategories = {} # tweet id -> Array of categories selected by assessors
tweetId2TRECHighImportInfoCategories = {} # tweet id -> high-importance subset of those categories
tweetId2TRECLowImportInfoCategories = {} # tweet id -> remaining (low-importance) categories
tweetId2TRECPriorityCategory = {} # tweet id -> priority label (Critical,High,Medium,Low)
index2TweetId = {} # ordered tweets
event2tweetIds = {} # event -> tweet ids for tweets within that event
countHighCriticalImport = 0
countLowMediumImport = 0
tweetsSeen = [] # tweet ids in first-seen order (membership is tested on the dict above)


# Kept for any later code that reads it. It is NOT used for merging below:
# "Low" and "Unknown" share the same score, so the inverse is ambiguous.
invertedPriorityScoreMap = {
    v:k for k,v in priorityScoreMap.items()
}


def _mergeUnique(existing, extra):
    """Return existing + extra without duplicates, keeping first occurrences."""
    return list(dict.fromkeys(existing + extra))


tweetIndex = 0
for groundtruth in groundtruthJSON:
    for eventJSON in groundtruth["events"]:
        eventid = eventJSON["eventid"]
        print(eventid)

        if eventid in skipEvents:
            continue

        if not any(eventid in s for s in eventIdentifiers):
            print("WARN: Found ground truth data for event not in the topic set "+eventid+", ignoring...")
            continue

        # Only events that are actually evaluated get an entry, so later
        # per-event loops never see an empty, unscorable event.
        eventTweetIds = event2tweetIds.setdefault(eventid, [])

        # iterate over tweets in the event
        for tweetJSON in eventJSON["tweets"]:
            tweetid = tweetJSON["postID"]
            categories = tweetJSON["postCategories"]
            priority = tweetJSON["postPriority"]

            if priority == "High" or priority == "Critical":
                countHighCriticalImport = countHighCriticalImport + 1

            if priority == "Low" or priority == "Medium":
                countLowMediumImport = countLowMediumImport + 1

            # check categories for name issues; unknown names are dropped
            cleanedCategories = []
            highImportCats = []
            lowImportCats = []
            for categoryId in categories:
                if not any(categoryId in s for s in informationTypesShort2Index.keys()):
                    continue
                cleanedCategories.append(categoryId)
                if any(categoryId in s for s in highImportCategoriesShort):
                    highImportCats.append(categoryId)
                else:
                    lowImportCats.append(categoryId)

            # Dict membership replaces the linear scan of tweetsSeen.
            if tweetid not in tweetId2TRECInfoCategories:
                eventTweetIds.append(tweetid)
                tweetId2TRECInfoCategories[tweetid] = cleanedCategories
                tweetId2TRECHighImportInfoCategories[tweetid] = highImportCats
                tweetId2TRECLowImportInfoCategories[tweetid] = lowImportCats
                tweetId2TRECPriorityCategory[tweetid] = priority
                index2TweetId[tweetIndex] = tweetid
                tweetIndex = tweetIndex + 1
                tweetsSeen.append(tweetid)

            else:
                # Tweet labelled more than once: take the union of the
                # categories, keeping the high/low splits in step with it.
                tweetId2TRECInfoCategories[tweetid] = _mergeUnique(
                    tweetId2TRECInfoCategories[tweetid], cleanedCategories
                )
                tweetId2TRECHighImportInfoCategories[tweetid] = _mergeUnique(
                    tweetId2TRECHighImportInfoCategories[tweetid], highImportCats
                )
                tweetId2TRECLowImportInfoCategories[tweetid] = _mergeUnique(
                    tweetId2TRECLowImportInfoCategories[tweetid], lowImportCats
                )

                # Keep the higher of the two priorities. Comparing scores and
                # storing the label directly avoids turning "Low" into
                # "Unknown" through the ambiguous inverted score map.
                prePriorityScore = priorityScoreMap[tweetId2TRECPriorityCategory[tweetid]]
                thisPriorityScore = priorityScoreMap[priority]
                if thisPriorityScore > prePriorityScore:
                    tweetId2TRECPriorityCategory[tweetid] = priority

2020_01_27_houston_explosion.2020
2020_01_28_bar_shooting_nc.2020
T2020_02_03_texas_university_shooting.2020
2020_02_07_rutherford_tn_floods.2020
2020_02_10_mideast_tornadoes.day1_mississipi.2020
2020_02_10_mideast_tornadoes.day2_al.2020
2020_02_10_mideast_tornadoes.day3_md.2019
2020_05_06_tn_derecho.2020
2020_05_26_edenville_dam_failure.2020.corrected
brooklynblockparty_shooting.2019
UNASSIGNED
indonesia_earthquake.2019
2015_09_28_hurricane_joaquin.2015
2016_puttingal_temple
2017_03_23_cyclone_debbie.2017
2017_12_04_thomas_wildfire.2017
2017_12_07_lilac_wildfire.2017
2018_02_24_anticyclone_hartmut.2018
2018_07_13_ferguson_wildfire.2018
2018_07_23_cranston_wildfire.2018
2018_07_23_klamathon_wildfire.2018
2018_08_05_holy_wildfire.2018
2018_09_07_hurricane_florence.2018
2018_10_07_hurricane_michael.2018
2018_11_07_Woolsey_wildfire.2018
2018_maryland_flood
2018_pittsburgh_synagogue_shooting
2019_03_01_alberta_wildfire.2019.v2
2019_08_25_hurricane_dorian.2019
2019_09_17_tropicalstorm_imelda.2019
2019_10_10_saddleridge_wildfire.2019
2019_10_25_kincade_wildfire.2019
2019_durham_gas_explosion
2019_karnataka_floods
2019_saugus_high_school_shooting
2019_spring_floods_in_ontario_quebec_and_new_brunswick
2019_townsville_flood
2020_08_27_hurricane_laura.2020
2020_09_11_hurricane_sally.2020
2020_afghanistan_flood
2020_easter_tornado_outbreak
2020_hpakant_jade_mine_disaster
2020_kerala_floods
2020_tornado_outbreak_of_april
2020_tornado_outbreak_of_march
2020_visakhapatnam_gas_leak
tornado_outbreak_of_november_30_december_2018


# -----------------------------------------------------------
# Stage 5: Produce run predicted maps between tweetIds and categories
# -----------------------------------------------------------
tweetId2RunInfoCategories = {} # tweet id -> predicted category by participant system
tweetId2RunHighImportInfoCategories = {} # tweet id -> predicted high-importance categories
tweetId2RunLowImportInfoCategories = {} # tweet id -> predicted low-importance categories
tweetId2RunInfoCategoriesProb = {} # tweet id -> predicted category probability by participant system
tweetId2RunInfoCategoriesProbNorm = {} # tweet id -> normalised category probabilities (not filled in here)
tweetId2RunPriorityScore = {} # tweet id -> importance score from participant system
tweetId2RunPriorityCategory = {} # tweet id -> importance category (Critical, High, Medium Low)
tweetId2RunPriorityScoreNorm = {} # tweet id -> importance score after optional normalisation
event2TweetIdRank = {} # event -> list of (tweetid, rank)

# Running extremes over every accepted prediction
maxPrediction = -999999
minPrediction = 999999
maxCategory = -999999
minCategory = 999999

for predictionParts in runContents:

    # Records with fewer than six fields are reported and skipped. The
    # record itself is printed (the name used before was never defined, so
    # a short record raised NameError).
    if len(predictionParts) < 6:
        print(predictionParts)
        continue

    eventId = predictionParts["topic"]

    if eventId in skipEvents:
        continue

    tweetId = predictionParts["tweet_id"]
    rank = 0 # no rank is read from the record, so every tweet gets rank 0

    category_scores = predictionParts["info_type_scores"]
    category_labels = predictionParts["info_type_labels"]

    priority = float(predictionParts["priority"])

    maxPrediction = max(maxPrediction, priority)
    minPrediction = min(minPrediction, priority)

    cleanedCategories = []
    cleanedCategoriesProbs = []
    highImportCats = []
    lowImportCats = []

    # Handle category flags: position i of the label array refers to
    # taskCategories[i]; a 0 means the type was not selected.
    for catIndex, categoryLabel in enumerate(category_labels):
        if categoryLabel == 0:
            continue

        categoryId = taskCategories[catIndex]

        if categoryId not in informationTypes2Index:
            print("Found unknown category in run "+categoryId+", ignoring...")
            continue

        cleanedCategories.append(categoryId)
        if categoryId in highImportCategories:
            highImportCats.append(categoryId)
        else:
            lowImportCats.append(categoryId)

    # Process category probabilities, tracking the global extremes
    for categoryProbability in category_scores:
        maxCategory = max(maxCategory, categoryProbability)
        minCategory = min(minCategory, categoryProbability)
        cleanedCategoriesProbs.append(categoryProbability)

    tweetId2RunHighImportInfoCategories[tweetId] = highImportCats
    tweetId2RunLowImportInfoCategories[tweetId] = lowImportCats
    tweetId2RunInfoCategories[tweetId] = cleanedCategories
    tweetId2RunInfoCategoriesProb[tweetId] = cleanedCategoriesProbs
    tweetId2RunPriorityScore[tweetId] = priority

    # Bucket the numeric priority using the label scores as cut-offs
    if priority > priorityScoreMap["High"]:
        tweetId2RunPriorityCategory[tweetId] = "Critical"
    elif priority > priorityScoreMap["Medium"]:
        tweetId2RunPriorityCategory[tweetId] = "High"
    elif priority > priorityScoreMap["Low"]:
        tweetId2RunPriorityCategory[tweetId] = "Medium"
    else:
        tweetId2RunPriorityCategory[tweetId] = "Low"

    event2TweetIdRank.setdefault(eventId, []).append((tweetId, rank))


# Order each event's (tweetid, rank) tuples by rank (stable sort)
for eventId in event2TweetIdRank.keys():
    tweetsSorted = sorted(event2TweetIdRank.get(eventId), key=lambda tup: tup[1])
    event2TweetIdRank[eventId] = tweetsSorted

# Fill in a (optionally min-max normalised) priority score for every ground
# truth tweet; tweets the run did not return get 0.0.
predictionRange = maxPrediction - minPrediction
for i in range(len(index2TweetId)):
    tweetId = index2TweetId[i]
    runScore = tweetId2RunPriorityScore.get(tweetId)

    if runScore is None:
        # An explicit None test, so a submitted score of 0.0 is not
        # mistaken for a missing prediction.
        tweetId2RunPriorityScoreNorm[tweetId] = 0.0
    elif not enablePriorityNorm:
        tweetId2RunPriorityScoreNorm[tweetId] = runScore
    elif predictionRange == 0.0:
        # All scores identical: nothing to scale. (The old test subtracted
        # minPrediction from itself, so it was always true and every
        # normalised score came out as 0.0.)
        tweetId2RunPriorityScoreNorm[tweetId] = 0.0
    else:
        tweetId2RunPriorityScoreNorm[tweetId] = (runScore-minPrediction)/predictionRange


# --------------------------------------------------
# Stage 6: Create ground truth vectors per category
# --------------------------------------------------

category2GroundTruth = {} # category -> tweet vector with binary 1 vs all ground truth category labels

# One 0/1 entry per ground truth tweet, in index2TweetId order. A tweet
# counts for a category when the category's short name occurs inside any of
# the tweet's assessor labels.
for categoryId in informationTypes2Index:
    categoryIdShort = categoryId.split("-")[1]
    categoryVector = []
    for position in range(len(index2TweetId)):
        assessorLabels = tweetId2TRECInfoCategories.get(index2TweetId[position])
        isMember = any(categoryIdShort in label for label in assessorLabels)
        categoryVector.append(1 if isMember else 0)
    category2GroundTruth[categoryId] = categoryVector

#pprint(category2GroundTruth)
            
#pprint(category2GroundTruth)


# --------------------------------------------------
# Stage 7: Create run vectors per category 
# --------------------------------------------------
# Assumptions: If run misses a tweet, we assume it has
#              no categories
category2Predicted = {} # category -> tweet vector with binary 1 vs all predicted by system labels

# Same layout as the ground truth vectors; a tweet with no (or empty) run
# prediction contributes 0 for every category.
for categoryId in informationTypes2Index:
    categoryIdShort = categoryId.split("-")[1]
    categoryVector = []
    for position in range(len(index2TweetId)):
        runLabels = tweetId2RunInfoCategories.get(index2TweetId[position]) or []
        isPredicted = any(categoryIdShort in label for label in runLabels)
        categoryVector.append(1 if isPredicted else 0)

    category2Predicted[categoryId] = categoryVector

#pprint(category2Predicted)


# --------------------------------------------------
# Stage 8: Make event category vectors 
# --------------------------------------------------

event2groundtruth = {} # event -> category -> tweet vector with binary 1 vs all ground truth category labels

# Per-event version of the ground truth vectors: entries follow the order of
# the event's tweet id list.
for eventId in eventIdentifiers:
    eventCategories = {}
    for categoryId in informationTypes2Index:
        categoryIdShort = categoryId.split("-")[1]
        eventCategories[categoryId] = [
            1 if any(categoryIdShort in label
                     for label in tweetId2TRECInfoCategories.get(tweetId)) else 0
            for tweetId in event2tweetIds.get(eventId)
        ]
    event2groundtruth[eventId] = eventCategories
    

event2prediction = {} # event -> category -> tweet vector with binary 1 vs all predicted by system labels

# Per-event version of the predicted vectors. Side effect: any ground truth
# tweet the run did not return is given an empty category list in
# tweetId2RunInfoCategories the first time it is encountered here.
for eventId in eventIdentifiers:
    print(eventId)
    eventCategories = {}
    for categoryId in informationTypes2Index:
        categoryIdShort = categoryId.split("-")[1]
        categoryVector = []
        for tweetId in event2tweetIds.get(eventId):
            predictedLabels = tweetId2RunInfoCategories.get(tweetId)

            if predictedLabels is None:
                predictedLabels = []
                tweetId2RunInfoCategories[tweetId] = predictedLabels

            # Run labels are full ids, so the full category id is matched
            isPredicted = any(categoryId in label for label in predictedLabels)
            categoryVector.append(1 if isPredicted else 0)

        eventCategories[categoryId] = categoryVector
    event2prediction[eventId] = eventCategories

2020_01_27_houston_explosion.2020
2020_02_10_mideast_tornadoes.day1_mississipi.2020
2020_02_10_mideast_tornadoes.day2_al.2020
2020_02_10_mideast_tornadoes.day3_md.2019
2020_05_06_tn_derecho.2020
brooklynblockparty_shooting.2019
2016_puttingal_temple
2017_12_04_thomas_wildfire.2017
2017_12_07_lilac_wildfire.2017
2018_07_23_klamathon_wildfire.2018
2018_08_05_holy_wildfire.2018
2018_11_07_Woolsey_wildfire.2018
2018_maryland_flood
2018_pittsburgh_synagogue_shooting
2019_03_01_alberta_wildfire.2019.v2
2019_08_25_hurricane_dorian.2019
2019_10_10_saddleridge_wildfire.2019
2019_10_25_kincade_wildfire.2019
2019_durham_gas_explosion
2019_saugus_high_school_shooting
2019_townsville_flood
2020_easter_tornado_outbreak
2020_tornado_outbreak_of_april
2020_tornado_outbreak_of_march
2020_visakhapatnam_gas_leak
tornado_outbreak_of_november_30_december_2018


# -----------------------------------------------------------
# Stage 9: Make priority classification vectors
# -----------------------------------------------------------

category2GroundTruthPriority = {} # category -> assessor priority labels of the tweets in that category

# For each category, collect the assessor priority label of every ground
# truth tweet carrying that category (tweets outside the category are
# omitted, so the vectors differ in length between categories).
for categoryId in informationTypes2Index:
    categoryIdShort = categoryId.split("-")[1]
    priorityVector = []
    for position in range(len(index2TweetId)):
        tweetId = index2TweetId[position]
        assessorLabels = tweetId2TRECInfoCategories.get(tweetId)
        if not any(categoryIdShort in label for label in assessorLabels):
            continue
        priorityVector.append(tweetId2TRECPriorityCategory.get(tweetId))
    category2GroundTruthPriority[categoryId] = priorityVector

category2PredictedPriority = {} # category -> run priority labels for the same tweets
category2PredictedPriorityScore = {} # Category -> run priority scores for the same tweets

# Same tweet selection as above (driven by the ground truth categories), but
# recording what the run said. Tweets without a run priority default to
# "Low" with a score of 0.25.
for categoryId in informationTypes2Index:
    categoryIdShort = categoryId.split("-")[1]
    categoryVector = []
    categoryScoreVector = []

    for position in range(len(index2TweetId)):
        tweetId = index2TweetId[position]
        assessorLabels = tweetId2TRECInfoCategories.get(tweetId)
        if not any(categoryIdShort in label for label in assessorLabels):
            continue

        runLabel = tweetId2RunPriorityCategory.get(tweetId)
        if runLabel:
            categoryVector.append(runLabel)
            categoryScoreVector.append(tweetId2RunPriorityScore.get(tweetId))
        else:
            categoryVector.append("Low") # default to low priority
            categoryScoreVector.append(0.25)

    category2PredictedPriority[categoryId] = categoryVector
    category2PredictedPriorityScore[categoryId] = categoryScoreVector


# --------------------------------------------------
# Disable Warnings (currently inactive: the filter call below is commented out; re-enable it to silence 0-score warnings)
# --------------------------------------------------
import warnings
# warnings.filterwarnings("ignore") # ignore warnings about 0-score categories


# --------------------------------------------------
# TREC-IS 2021A
# Priority-Centric Discounted Cumulative Gain
# --------------------------------------------------

import pandas as pd

def calc_dcg(scores, at_k=100):
    """Return the discounted cumulative gain of the first ``at_k`` scores.

    The score at 1-based rank ``r`` contributes ``(2 ** score - 1) / log2(r + 1)``.
    Scores beyond ``at_k`` are ignored; an empty list yields 0.0.
    """
    dcg = 0.0
    for rank, gain in enumerate(scores[:at_k], start=1):
        dcg += (2 ** gain - 1) / np.log2(rank + 1)
    return dcg

# Integer gain assigned to each priority label ("Unknown" is scored as "Low")
priority_map = dict(Unknown=1, Low=1, Medium=2, High=3, Critical=4)

# Cut-off depth for the DCG computation
at_k = 100

# tweet id -> integer gain of its assessor priority label
tweetId2TRECPriorityCategory_score = {}
for gtTweetId, gtLabel in tweetId2TRECPriorityCategory.items():
    tweetId2TRECPriorityCategory_score[gtTweetId] = priority_map[gtLabel]

# Gains over all tweets, best first (replaced per event in the loop below)
tweetId2TRECPriorityCategory_scores_sorted = sorted(
    tweetId2TRECPriorityCategory_score.values(),
    reverse=True
)

# Ideal DCG per event: the event's own gains, best first, cut at at_k
best_dcg_per_event = {}
for event, rel_tweets in event2tweetIds.items():
    print(event)

    eventGains = [tweetId2TRECPriorityCategory_score[x] for x in rel_tweets]
    tweetId2TRECPriorityCategory_scores_sorted = sorted(eventGains, reverse=True)

    ideal_dcg = calc_dcg(tweetId2TRECPriorityCategory_scores_sorted, at_k)
    print("\tBest DCG:", ideal_dcg)
    best_dcg_per_event[event] = ideal_dcg

print("Mean:", np.mean(list(best_dcg_per_event.values())))
print()

# Code below calculates the DCG for a system's
#  ranked priority tweets. We have to do some
#  sampling here to break ties among tweets with
#  the same priority scores.
# NOTE: the shuffling uses NumPy's global random state without a seed, so
#  repeated evaluations can differ slightly.

# Build a dataframe from the system's provided
#  priority scores, so we can identify what the
#  top-most priorities are and get a count of
#  the number of tweets in each priority bin.
priority_df = pd.DataFrame(
    [(k, priority_map[v]) for k, v in tweetId2RunPriorityCategory.items()],
    columns=["tweet_id", "priority"]
)

# Build metrics for each event
system_dcg_per_event = {}
for event, rel_tweets in event2tweetIds.items():
    print("Event:", event)

    # An event whose ideal DCG is 0 (no ground truth tweets) cannot be
    # normalised; skip it instead of dividing by zero and averaging a NaN.
    ideal_dcg = best_dcg_per_event[event]
    if ideal_dcg == 0:
        print("\tnDCG: skipped (ideal DCG is 0)")
        continue

    local_priority_df = priority_df[priority_df["tweet_id"].isin(set(rel_tweets))]

    unique_scores = local_priority_df["priority"].value_counts()

    # Find the top priority scores that would be included
    #  in the necessary at_k values.
    total = 0
    top_keys = []
    candidates = {}
    for top in sorted(unique_scores.index, reverse=True):

        # We store this key, so we can go back and shuffle
        #  tweets with this score.
        top_keys.append(top)
        local_restricted_df = local_priority_df[local_priority_df["priority"] == top]
        candidates[top] = list(local_restricted_df["tweet_id"])

        total += local_restricted_df.shape[0]

        # Once we have enough samples, stop. (>= rather than >: exactly
        #  at_k tweets is already enough, a further bin would contribute
        #  nothing.)
        if total >= at_k:
            break

    # Now we generate distribution over the DCG for this
    #  system and do this a number of times to remove
    #  dependence on our selection of the top k tweets
    random_dcgs = []
    for i in range(100):

        local_tweet_ids = []
        for top in top_keys:
            this_top_tweets = candidates[top][:]
            np.random.shuffle(this_top_tweets)

            needed = at_k - len(local_tweet_ids)
            local_tweet_ids.extend(this_top_tweets[:needed])

        local_scores = [tweetId2TRECPriorityCategory_score[x] for x in local_tweet_ids]

        # Use the same cut-off as the ideal DCG rather than calc_dcg's default
        random_dcgs.append(calc_dcg(local_scores, at_k))

    system_dcg = np.mean(random_dcgs)

    system_ndcg_ = system_dcg / ideal_dcg
    print("\tnDCG:", system_ndcg_)
    system_dcg_per_event[event] = system_ndcg_

print()
# Unweighted mean of the per-event nDCG values
system_ndcg_micro = np.mean(list(system_dcg_per_event.values()))
print("System Event-Micro nDCG:", system_ndcg_micro)

resultsFile.write("--------------------------------------------------"+"\n")
resultsFile.write("EVALUATON: nDCG and Priority"+"\n")
resultsFile.write("Overall performance"+"\n")
resultsFile.write("--------------------------------------------------"+"\n")
resultsFile.write("> nDCG:"+"\t"+str(system_ndcg_micro)+"\n")
resultsFile.write(""+"\n")

2020_01_27_houston_explosion.2020
	Best DCG: 176.99559032459564
2020_02_10_mideast_tornadoes.day1_mississipi.2020
	Best DCG: 268.88459894996123
2020_02_10_mideast_tornadoes.day2_al.2020
	Best DCG: 270.1716952398847
2020_02_10_mideast_tornadoes.day3_md.2019
	Best DCG: 135.38775246204446
2020_05_06_tn_derecho.2020
	Best DCG: 167.06354661312534
brooklynblockparty_shooting.2019
	Best DCG: 179.1756130795261
2016_puttingal_temple
	Best DCG: 314.08006311421406
2017_12_04_thomas_wildfire.2017
	Best DCG: 300.71399384300895
2017_12_07_lilac_wildfire.2017
	Best DCG: 314.08006311421406
2018_07_23_klamathon_wildfire.2018
	Best DCG: 221.46334445469358
2018_08_05_holy_wildfire.2018
	Best DCG: 153.96993418707177
2018_11_07_Woolsey_wildfire.2018
	Best DCG: 175.67469323453255
2018_maryland_flood
	Best DCG: 285.7119531591263
2018_pittsburgh_synagogue_shooting
	Best DCG: 111.85075929877581
2019_03_01_alberta_wildfire.2019.v2
	Best DCG: 62.88708564345522
2019_08_25_hurricane_dorian.2019
	Best DCG: 146.57069611996656
2019_10_10_saddleridge_wildfire.2019
	Best DCG: 173.00802656786584
2019_10_25_kincade_wildfire.2019
	Best DCG: 314.08006311421406
2019_durham_gas_explosion
	Best DCG: 201.07148118577902
2019_saugus_high_school_shooting
	Best DCG: 314.08006311421406
2019_townsville_flood
	Best DCG: 314.08006311421406
2020_easter_tornado_outbreak
	Best DCG: 214.9714167256293
2020_tornado_outbreak_of_april
	Best DCG: 314.08006311421406
2020_tornado_outbreak_of_march
	Best DCG: 267.51977363880474
2020_visakhapatnam_gas_leak
	Best DCG: 314.08006311421406
tornado_outbreak_of_november_30_december_2018
	Best DCG: 314.08006311421406
Mean: 231.7589407554446

Event: 2020_01_27_houston_explosion.2020
	nDCG: 0.2521290663344169
Event: 2020_02_10_mideast_tornadoes.day1_mississipi.2020
	nDCG: 0.39187105517000154
Event: 2020_02_10_mideast_tornadoes.day2_al.2020
	nDCG: 0.36325143319422737
Event: 2020_02_10_mideast_tornadoes.day3_md.2019
	nDCG: 0.3072441473270432
Event: 2020_05_06_tn_derecho.2020
	nDCG: 0.3648905016173573
Event: brooklynblockparty_shooting.2019
	nDCG: 0.12473828382151113
Event: 2016_puttingal_temple
	nDCG: 0.23008515121464357
Event: 2017_12_04_thomas_wildfire.2017
	nDCG: 0.2394525689309033
Event: 2017_12_07_lilac_wildfire.2017
	nDCG: 0.2817468066690672
Event: 2018_07_23_klamathon_wildfire.2018
	nDCG: 0.3870334000141965
Event: 2018_08_05_holy_wildfire.2018
	nDCG: 0.3164778245171642
Event: 2018_11_07_Woolsey_wildfire.2018
	nDCG: 0.26544580765844905
Event: 2018_maryland_flood
	nDCG: 0.23952976256868794
Event: 2018_pittsburgh_synagogue_shooting
	nDCG: 0.85195518472338
Event: 2019_03_01_alberta_wildfire.2019.v2
	nDCG: 0.33713666033721296
Event: 2019_08_25_hurricane_dorian.2019
	nDCG: 0.38495712225970063
Event: 2019_10_10_saddleridge_wildfire.2019
	nDCG: 0.37578812483342056
Event: 2019_10_25_kincade_wildfire.2019
	nDCG: 0.31032533865009204
Event: 2019_durham_gas_explosion
	nDCG: 0.2461146602232315
Event: 2019_saugus_high_school_shooting
	nDCG: 0.20273427200876964
Event: 2019_townsville_flood
	nDCG: 0.433700903054961
Event: 2020_easter_tornado_outbreak
	nDCG: 0.29376269794729887
Event: 2020_tornado_outbreak_of_april
	nDCG: 0.3496272662692206
Event: 2020_tornado_outbreak_of_march
	nDCG: 0.19671763406883652
Event: 2020_visakhapatnam_gas_leak
	nDCG: 0.42679551379873104
Event: tornado_outbreak_of_november_30_december_2018
	nDCG: 0.584068211892998

System Event-Micro nDCG: 0.3368299768886739

1


# --------------------------------------------------
# TREC-IS 2021-A
# Information Type Categorization
# Overall performance
# --------------------------------------------------
# Average performance over information types
# Macro averaged (information types have equal weight)
# Does not average across events (larger events have more impact)
# Positive class is the target class
# Precision, recall and F1 only consider the positive class
# Accuracy is an overall metric
# We report performance for all categories, high importance categories and low importance categories

from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score

# Running sums of the per-category scores. Three families are kept:
# all categories, the high-importance categories, and everything else.
avgPrecision = avgRecall = avgF1 = avgAccuracy = 0.0
avgPrecisionHigh = avgRecallHigh = avgF1High = avgAccuracyHigh = 0.0
avgPrecisionLow = avgRecallLow = avgF1Low = avgAccuracyLow = 0.0

for categoryId in informationTypes2Index.keys():
    categoryTruth = category2GroundTruth[categoryId]
    categoryPredicted = category2Predicted[categoryId]

    # Precision/recall/F1 are scored on the positive class only;
    # accuracy is computed over all labels.
    categoryPrecision = precision_score(categoryTruth, categoryPredicted, average='binary')
    categoryRecall = recall_score(categoryTruth, categoryPredicted, average='binary')
    categoryF1 = f1_score(categoryTruth, categoryPredicted, average='binary')
    categoryAccuracy = accuracy_score(categoryTruth, categoryPredicted)

    avgPrecision += categoryPrecision
    avgRecall += categoryRecall
    avgF1 += categoryF1
    avgAccuracy += categoryAccuracy

    # A category counts as high importance when its identifier occurs
    # inside any entry of highImportCategories (substring match).
    isHighImportance = any(categoryId in s for s in highImportCategories)
    if isHighImportance:
        avgPrecisionHigh += categoryPrecision
        avgRecallHigh += categoryRecall
        avgF1High += categoryF1
        avgAccuracyHigh += categoryAccuracy
    else:
        avgPrecisionLow += categoryPrecision
        avgRecallLow += categoryRecall
        avgF1Low += categoryF1
        avgAccuracyLow += categoryAccuracy

# Denominators for the macro averages. The low-importance count is derived
# by subtraction rather than counted in the loop above.
numInformationTypes = len(informationTypes2Index)
numHighInformationTypes = len(highImportCategories)
numLowInformationTypes = numInformationTypes - numHighInformationTypes

# Metric labels shared by the console and results-file output.
overallMetricLabels = [
    "Information Type Precision (positive class, multi-type, macro):",
    "Information Type Recall (positive class, multi-type, macro):",
    "Information Type F1 (positive class, multi-type, macro):",
    "Information Type Accuracy (overall, multi-type, macro):",
]

# (label prefix, summed scores in label order, number of categories averaged over)
overallMetricGroups = [
    ("", (avgPrecision, avgRecall, avgF1, avgAccuracy), numInformationTypes),
    ("High Importance ", (avgPrecisionHigh, avgRecallHigh, avgF1High, avgAccuracyHigh), numHighInformationTypes),
    ("Low Importance ", (avgPrecisionLow, avgRecallLow, avgF1Low, avgAccuracyLow), numLowInformationTypes),
]

# Console report: one line per metric, grouped as all / high / low.
for groupPrefix, groupSums, groupSize in overallMetricGroups:
    for metricLabel, metricSum in zip(overallMetricLabels, groupSums):
        print(groupPrefix + metricLabel + " " + str(metricSum/groupSize))
    if groupPrefix != "Low Importance ":
        print("")

# Results file: section banner followed by the same metrics, tab separated.
resultsFile.write(
    "--------------------------------------------------"+"\n"
    "EVALUATON: Information Type Categorization"+"\n"
    "Overall performance"+"\n"
    "--------------------------------------------------"+"\n"
)
for groupPrefix, groupSums, groupSize in overallMetricGroups:
    for metricLabel, metricSum in zip(overallMetricLabels, groupSums):
        resultsFile.write("> " + groupPrefix + metricLabel + "\t" + str(metricSum/groupSize) + "\n")
resultsFile.write(""+"\n")

Information Type Precision (positive class, multi-type, macro): 0.20055299282746117
Information Type Recall (positive class, multi-type, macro): 0.49657537195941076
Information Type F1 (positive class, multi-type, macro): 0.2575175230528391
Information Type Accuracy (overall, multi-type, macro): 0.8473964507637484
High Importance Information Type Precision (positive class, multi-type, macro): 0.13847359059256995
High Importance Information Type Recall (positive class, multi-type, macro): 0.5151469334598603
High Importance Information Type F1 (positive class, multi-type, macro): 0.2082992920427631
High Importance Information Type Accuracy (overall, multi-type, macro): 0.9117913913412007
Low Importance Information Type Precision (positive class, multi-type, macro): 0.22015701458584785
Low Importance Information Type Recall (positive class, multi-type, macro): 0.4907106683276899
Low Importance Information Type F1 (positive class, multi-type, macro): 0.273060122319179
Low Importance Information Type Accuracy (overall, multi-type, macro): 0.8270612063708688

1


# --------------------------------------------------
# TREC-IS 2021-B
# Information Type Categorization
# Per Information Type Performance
# --------------------------------------------------
# Per Category Classification Performance with confusion matrices
# Performance on the target class is what we care about here, 
# primarily with respect to recall, as we want the user to 
# see all of the information for a given category. A small
# amount of noise being added to the feed is an acceptable
# cost for good recall.
#
# Does not average across events (larger events have more impact)

from sklearn.metrics import classification_report

# Section banner for the per-information-type results file.
perTopicFile.write("--------------------------------------------------"+"\n")
perTopicFile.write("EVALUATON: Information Type Categorization (Multi-type)"+"\n")
perTopicFile.write("Per Information Type Performance"+"\n")
perTopicFile.write("--------------------------------------------------"+"\n")

# Emit one classification report per information type, both to the console
# and to perTopicFile.
for categoryId in informationTypes2Index.keys():
    # Row names used in the report: the negative class, then the target category.
    target_names = ['Other Classes', categoryId]
    print(categoryId)

    # Build the report once and reuse it for both outputs. Only the scoring
    # call is guarded, so a ValueError raised by the file write (for example
    # writing to a closed file) is no longer misreported as a scoring failure.
    try:
        categoryReport = classification_report(
            category2GroundTruth[categoryId],
            category2Predicted[categoryId],
            target_names=target_names,
        )
    except ValueError:
        print("Category "+categoryId+" score calculation failed, likely due the category not being used by the run")
        continue

    print(categoryReport)

    perTopicFile.write(categoryId+"\n")
    perTopicFile.write(categoryReport+"\n")
    perTopicFile.write(""+"\n")

perTopicFile.write(""+"\n")

CallToAction-Donations
                        precision    recall  f1-score   support

         Other Classes       1.00      0.98      0.99     55275
CallToAction-Donations       0.26      0.62      0.36       568

              accuracy                           0.98     55843
             macro avg       0.63      0.80      0.68     55843
          weighted avg       0.99      0.98      0.98     55843

CallToAction-MovePeople
                         precision    recall  f1-score   support

          Other Classes       0.99      0.95      0.97     54646
CallToAction-MovePeople       0.21      0.56      0.31      1197

               accuracy                           0.95     55843
              macro avg       0.60      0.76      0.64     55843
           weighted avg       0.97      0.95      0.96     55843

CallToAction-Volunteer
                        precision    recall  f1-score   support

         Other Classes       1.00      0.96      0.98     55543
CallToAction-Volunteer       0.07      0.57      0.13       300

              accuracy                           0.96     55843
             macro avg       0.54      0.76      0.55     55843
          weighted avg       0.99      0.96      0.97     55843

Other-Advice
               precision    recall  f1-score   support

Other Classes       0.97      0.81      0.89     52602
 Other-Advice       0.17      0.63      0.27      3241

     accuracy                           0.80     55843
    macro avg       0.57      0.72      0.58     55843
 weighted avg       0.93      0.80      0.85     55843

Other-ContextualInformation
                             precision    recall  f1-score   support

              Other Classes       0.97      0.97      0.97     54346
Other-ContextualInformation       0.02      0.03      0.03      1497

                   accuracy                           0.94     55843
                  macro avg       0.50      0.50      0.50     55843
               weighted avg       0.95      0.94      0.94     55843

Other-Discussion
                  precision    recall  f1-score   support

   Other Classes       0.99      0.95      0.97     55263
Other-Discussion       0.02      0.11      0.04       580

        accuracy                           0.94     55843
       macro avg       0.51      0.53      0.50     55843
    weighted avg       0.98      0.94      0.96     55843

Other-Irrelevant
                  precision    recall  f1-score   support

   Other Classes       0.63      0.79      0.70     23267
Other-Irrelevant       0.82      0.66      0.73     32576

        accuracy                           0.72     55843
       macro avg       0.72      0.73      0.71     55843
    weighted avg       0.74      0.72      0.72     55843

Other-Sentiment
                 precision    recall  f1-score   support

  Other Classes       0.93      0.96      0.94     51270
Other-Sentiment       0.33      0.24      0.27      4573

       accuracy                           0.90     55843
      macro avg       0.63      0.60      0.61     55843
   weighted avg       0.88      0.90      0.89     55843

Report-CleanUp
                precision    recall  f1-score   support

 Other Classes       1.00      0.80      0.89     55581
Report-CleanUp       0.01      0.65      0.03       262

      accuracy                           0.80     55843
     macro avg       0.51      0.72      0.46     55843
  weighted avg       0.99      0.80      0.88     55843

Report-EmergingThreats
                        precision    recall  f1-score   support

         Other Classes       0.98      0.80      0.88     52454
Report-EmergingThreats       0.18      0.71      0.29      3389

              accuracy                           0.79     55843
             macro avg       0.58      0.75      0.59     55843
          weighted avg       0.93      0.79      0.84     55843

Report-Factoid
                precision    recall  f1-score   support

 Other Classes       0.95      0.90      0.93     49844
Report-Factoid       0.42      0.59      0.49      5999

      accuracy                           0.87     55843
     macro avg       0.69      0.75      0.71     55843
  weighted avg       0.89      0.87      0.88     55843

Report-FirstPartyObservation
                              precision    recall  f1-score   support

               Other Classes       0.97      0.92      0.95     54135
Report-FirstPartyObservation       0.07      0.18      0.10      1708

                    accuracy                           0.90     55843
                   macro avg       0.52      0.55      0.52     55843
                weighted avg       0.95      0.90      0.92     55843

Report-Hashtags
                 precision    recall  f1-score   support

  Other Classes       0.89      0.66      0.76     48407
Report-Hashtags       0.18      0.49      0.27      7436

       accuracy                           0.64     55843
      macro avg       0.54      0.58      0.51     55843
   weighted avg       0.80      0.64      0.70     55843

Report-Location
                 precision    recall  f1-score   support

  Other Classes       0.84      0.73      0.78     41325
Report-Location       0.44      0.61      0.51     14518

       accuracy                           0.70     55843
      macro avg       0.64      0.67      0.64     55843
   weighted avg       0.74      0.70      0.71     55843

Report-MultimediaShare
                        precision    recall  f1-score   support

         Other Classes       0.93      0.70      0.80     48784
Report-MultimediaShare       0.24      0.63      0.34      7059

              accuracy                           0.70     55843
             macro avg       0.58      0.67      0.57     55843
          weighted avg       0.84      0.70      0.74     55843

Report-News
               precision    recall  f1-score   support

Other Classes       0.96      0.71      0.82     50324
  Report-News       0.22      0.72      0.33      5519

     accuracy                           0.71     55843
    macro avg       0.59      0.72      0.57     55843
 weighted avg       0.89      0.71      0.77     55843

Report-NewSubEvent
                    precision    recall  f1-score   support

     Other Classes       0.99      0.88      0.93     54728
Report-NewSubEvent       0.07      0.44      0.12      1115

          accuracy                           0.87     55843
         macro avg       0.53      0.66      0.52     55843
      weighted avg       0.97      0.87      0.91     55843

Report-Official
                 precision    recall  f1-score   support

  Other Classes       0.98      0.75      0.85     53203
Report-Official       0.11      0.65      0.19      2640

       accuracy                           0.74     55843
      macro avg       0.54      0.70      0.52     55843
   weighted avg       0.94      0.74      0.81     55843

Report-OriginalEvent
                      precision    recall  f1-score   support

       Other Classes       0.95      0.92      0.94     52838
Report-OriginalEvent       0.14      0.23      0.17      3005

            accuracy                           0.88     55843
           macro avg       0.55      0.57      0.56     55843
        weighted avg       0.91      0.88      0.90     55843

Report-ServiceAvailable
                         precision    recall  f1-score   support

          Other Classes       0.98      0.94      0.96     53834
Report-ServiceAvailable       0.26      0.52      0.35      2009

               accuracy                           0.93     55843
              macro avg       0.62      0.73      0.65     55843
           weighted avg       0.96      0.93      0.94     55843

Report-ThirdPartyObservation
                              precision    recall  f1-score   support

               Other Classes       0.94      0.78      0.85     50379
Report-ThirdPartyObservation       0.21      0.52      0.30      5464

                    accuracy                           0.76     55843
                   macro avg       0.57      0.65      0.58     55843
                weighted avg       0.87      0.76      0.80     55843

Report-Weather
                precision    recall  f1-score   support

 Other Classes       0.97      0.88      0.93     50824
Report-Weather       0.39      0.76      0.52      5019

      accuracy                           0.87     55843
     macro avg       0.68      0.82      0.72     55843
  weighted avg       0.92      0.87      0.89     55843

Request-GoodsServices
                       precision    recall  f1-score   support

        Other Classes       1.00      0.96      0.98     55452
Request-GoodsServices       0.07      0.38      0.12       391

             accuracy                           0.96     55843
            macro avg       0.53      0.67      0.55     55843
         weighted avg       0.99      0.96      0.97     55843

Request-InformationWanted
                           precision    recall  f1-score   support

            Other Classes       0.99      0.92      0.96     55241
Request-InformationWanted       0.06      0.43      0.10       602

                 accuracy                           0.92     55843
                macro avg       0.53      0.68      0.53     55843
             weighted avg       0.98      0.92      0.95     55843

Request-SearchAndRescue
                         precision    recall  f1-score   support

          Other Classes       1.00      0.98      0.99     55737
Request-SearchAndRescue       0.04      0.49      0.07       106

               accuracy                           0.97     55843
              macro avg       0.52      0.73      0.53     55843
           weighted avg       1.00      0.97      0.99     55843

1


# --------------------------------------------------
# TREC-IS 2021-B
# Information Type Categorization
# Per Information Type F1 Graph
# --------------------------------------------------
# Per Category Classification Performance
# F1 scores for each information type, graphed
# Does not average across events (larger events have more impact)



# Collect the positive-class F1 score of every information type, echoing
# each one to the console as it is computed.
categoryLabels = []
scoresPerCategoryF1 = []
for categoryId in informationTypes2Index.keys():
    localF1Score = f1_score(
        category2GroundTruth[categoryId],
        category2Predicted[categoryId],
        average='binary',
    )
    print(categoryId, localF1Score)
    categoryLabels.append(categoryId)
    scoresPerCategoryF1.append(localF1Score)

# One bar position per information type.
N = len(informationTypes2Index)
ind = np.arange(N)

# Bar width passed to plt.bar
width = 0.90

p1 = plt.bar(ind, scoresPerCategoryF1, width=width)

plt.title('F1 Scores by Information Type')
plt.ylabel('F1 Scores')
# Category names are drawn vertically along the x axis.
plt.xticks(ind, categoryLabels, rotation='vertical')
plt.yticks(np.arange(0, 1, 0.1))

plt.show()

CallToAction-Donations 0.3620422898401238
CallToAction-MovePeople 0.3063810181985718
CallToAction-Volunteer 0.1287878787878788
Other-Advice 0.27147107438016527
Other-ContextualInformation 0.027243115190997928
Other-Discussion 0.03633314700950251
Other-Irrelevant 0.7304507195221286
Other-Sentiment 0.27492715063980744
Report-CleanUp 0.028903711304942705
Report-EmergingThreats 0.29268888075186134
Report-Factoid 0.49172081324669886
Report-FirstPartyObservation 0.09974667511082963
Report-Hashtags 0.2669488433189812
Report-Location 0.5106542271366044
Report-MultimediaShare 0.3442376603860825
Report-News 0.3314475873544093
Report-NewSubEvent 0.11747919143876337
Report-Official 0.19145490086567998
Report-OriginalEvent 0.17373634747269492
Report-ServiceAvailable 0.3461922181576322
Report-ThirdPartyObservation 0.2958276691414828
Report-Weather 0.519815418023887
Request-GoodsServices 0.1184528605962933
Request-InformationWanted 0.10239309533150255
Request-SearchAndRescue 0.06860158311345647


# --------------------------------------------------
# TREC-IS 2021-B
# Information Type Categorization
# Per Event Performance
# --------------------------------------------------
# Categorization performance for each event
# Precision, recall and F1 only consider the positive class
# Accuracy is an overall metric
# We report performance for all categories, high importance categories and low importance categories
# Macro average (categories have equal weight)

# Section banner for the per-event results file.
perEventFile.write("--------------------------------------------------"+"\n")
perEventFile.write("EVALUATON: Information Type Categorization (Multi-type)"+"\n")
perEventFile.write("Per Event Performance"+"\n")
perEventFile.write("--------------------------------------------------"+"\n")

for eventId in eventIdentifiers:
    # Running sums of the per-category scores for this event.
    tavgPrecision = 0.0
    tavgRecall = 0.0
    tavgF1 = 0.0
    tavgAccuracy = 0.0

    # Number of categories that actually contributed to the sums above.
    categoryCount = 0

    for categoryId in informationTypes2Index.keys():
        eventTruth = event2groundtruth[eventId].get(categoryId)

        # Skip categories whose ground-truth labels sum to zero for this event
        # (presumably 0/1 labels, i.e. no positive examples -- TODO confirm).
        if sum(eventTruth) == 0:
            continue

        eventPredicted = event2prediction[eventId].get(categoryId)

        categoryPrecision = precision_score(eventTruth, eventPredicted, average='binary')
        categoryRecall = recall_score(eventTruth, eventPredicted, average='binary')
        categoryF1 = f1_score(eventTruth, eventPredicted, average='binary')
        categoryAccuracy = accuracy_score(eventTruth, eventPredicted)

        tavgPrecision = tavgPrecision + categoryPrecision
        tavgRecall = tavgRecall + categoryRecall
        tavgF1 = tavgF1 + categoryF1
        tavgAccuracy = tavgAccuracy + categoryAccuracy

        categoryCount += 1

    if categoryCount == 0:
        print("No categories for event:", eventId)
        continue

    # Macro averages over the categories that were scored. The same values go
    # to the console and to the file; previously the file divided by the total
    # number of information types, so it disagreed with the printed figures
    # whenever a category had been skipped.
    eventPrecision = tavgPrecision/categoryCount
    eventRecall = tavgRecall/categoryCount
    eventF1 = tavgF1/categoryCount
    eventAccuracy = tavgAccuracy/categoryCount

    print(eventId)
    print("  Information Type Precision (positive class, multi-type, macro): "+str(eventPrecision))
    print("  Information Type Recall (positive class, multi-type, macro): "+str(eventRecall))
    print("  Information Type F1 (positive class, multi-type, macro): "+str(eventF1))
    print("  Information Type Accuracy (overall, multi-type, macro): "+str(eventAccuracy))
    print("")

    perEventFile.write(eventId+"\n")
    perEventFile.write("  Information Type Precision (positive class, multi-type, macro): "+str(eventPrecision)+"\n")
    perEventFile.write("  Information Type Recall (positive class, multi-type, macro): "+str(eventRecall)+"\n")
    perEventFile.write("  Information Type F1 (positive class, multi-type, macro): "+str(eventF1)+"\n")
    perEventFile.write("  Information Type Accuracy (overall, multi-type, macro): "+str(eventAccuracy)+"\n")
    perEventFile.write("\n")

perEventFile.write("\n")

2020_01_27_houston_explosion.2020
  Information Type Precision (positive class, multi-type, macro): 0.17087550157900305
  Information Type Recall (positive class, multi-type, macro): 0.46878749672493863
  Information Type F1 (positive class, multi-type, macro): 0.2133156212400501
  Information Type Accuracy (overall, multi-type, macro): 0.8485027611417904

2020_02_10_mideast_tornadoes.day1_mississipi.2020
  Information Type Precision (positive class, multi-type, macro): 0.3841664427918161
  Information Type Recall (positive class, multi-type, macro): 0.617272080382134
  Information Type F1 (positive class, multi-type, macro): 0.4121225378184968
  Information Type Accuracy (overall, multi-type, macro): 0.7091097308488613

/Users/cbuntain/Development/anaconda3/lib/python3.8/site-packages/sklearn/metrics/_classification.py:1245: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 due to no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))

2020_02_10_mideast_tornadoes.day2_al.2020
  Information Type Precision (positive class, multi-type, macro): 0.2124431276170895
  Information Type Recall (positive class, multi-type, macro): 0.5355697086643229
  Information Type F1 (positive class, multi-type, macro): 0.26385366038592495
  Information Type Accuracy (overall, multi-type, macro): 0.8607381492687846

2020_02_10_mideast_tornadoes.day3_md.2019
  Information Type Precision (positive class, multi-type, macro): 0.13035178508785672
  Information Type Recall (positive class, multi-type, macro): 0.6683293138271014
  Information Type F1 (positive class, multi-type, macro): 0.17081506589760942
  Information Type Accuracy (overall, multi-type, macro): 0.8149431818181818

2020_05_06_tn_derecho.2020
  Information Type Precision (positive class, multi-type, macro): 0.22328019510892208
  Information Type Recall (positive class, multi-type, macro): 0.5057326419844612
  Information Type F1 (positive class, multi-type, macro): 0.25845227677040944
  Information Type Accuracy (overall, multi-type, macro): 0.8469584245076587

brooklynblockparty_shooting.2019
  Information Type Precision (positive class, multi-type, macro): 0.21104361101684735
  Information Type Recall (positive class, multi-type, macro): 0.42316478342334707
  Information Type F1 (positive class, multi-type, macro): 0.2165237184907205
  Information Type Accuracy (overall, multi-type, macro): 0.9090041361756284

/Users/cbuntain/Development/anaconda3/lib/python3.8/site-packages/sklearn/metrics/_classification.py:1245: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 due to no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))

2016_puttingal_temple
  Information Type Precision (positive class, multi-type, macro): 0.1648629016679452
  Information Type Recall (positive class, multi-type, macro): 0.32077840313908146
  Information Type F1 (positive class, multi-type, macro): 0.17599446535043303
  Information Type Accuracy (overall, multi-type, macro): 0.8637778100916098

2017_12_04_thomas_wildfire.2017
  Information Type Precision (positive class, multi-type, macro): 0.2142825559478267
  Information Type Recall (positive class, multi-type, macro): 0.49101063121936206
  Information Type F1 (positive class, multi-type, macro): 0.2664328926428016
  Information Type Accuracy (overall, multi-type, macro): 0.7911096389532958

2017_12_07_lilac_wildfire.2017
  Information Type Precision (positive class, multi-type, macro): 0.22968116279426837
  Information Type Recall (positive class, multi-type, macro): 0.5106167943287692
  Information Type F1 (positive class, multi-type, macro): 0.2773451422242404
  Information Type Accuracy (overall, multi-type, macro): 0.8079723899913718

2018_07_23_klamathon_wildfire.2018
  Information Type Precision (positive class, multi-type, macro): 0.26766106246131244
  Information Type Recall (positive class, multi-type, macro): 0.5082315563647082
  Information Type F1 (positive class, multi-type, macro): 0.2953607799708363
  Information Type Accuracy (overall, multi-type, macro): 0.8045812848471532

2018_08_05_holy_wildfire.2018
  Information Type Precision (positive class, multi-type, macro): 0.16333947181521682
  Information Type Recall (positive class, multi-type, macro): 0.6387355674914105
  Information Type F1 (positive class, multi-type, macro): 0.2040389476619656
  Information Type Accuracy (overall, multi-type, macro): 0.9143421664342165

2018_11_07_Woolsey_wildfire.2018
  Information Type Precision (positive class, multi-type, macro): 0.14997437526837767
  Information Type Recall (positive class, multi-type, macro): 0.43843167705392005
  Information Type F1 (positive class, multi-type, macro): 0.18310243084696703
  Information Type Accuracy (overall, multi-type, macro): 0.8113168511430641

2018_maryland_flood
  Information Type Precision (positive class, multi-type, macro): 0.2373032633421701
  Information Type Recall (positive class, multi-type, macro): 0.5422747940206171
  Information Type F1 (positive class, multi-type, macro): 0.262179401117709
  Information Type Accuracy (overall, multi-type, macro): 0.8064468321600592

2018_pittsburgh_synagogue_shooting
  Information Type Precision (positive class, multi-type, macro): 0.36244294013901857
  Information Type Recall (positive class, multi-type, macro): 0.42030651340996167
  Information Type F1 (positive class, multi-type, macro): 0.3692284666036876
  Information Type Accuracy (overall, multi-type, macro): 0.7542735042735044

2019_03_01_alberta_wildfire.2019.v2
  Information Type Precision (positive class, multi-type, macro): 0.09543739016762737
  Information Type Recall (positive class, multi-type, macro): 0.465186263325597
  Information Type F1 (positive class, multi-type, macro): 0.07830051051177364
  Information Type Accuracy (overall, multi-type, macro): 0.834233810977997

/Users/cbuntain/Development/anaconda3/lib/python3.8/site-packages/sklearn/metrics/_classification.py:1245: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 due to no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
/Users/cbuntain/Development/anaconda3/lib/python3.8/site-packages/sklearn/metrics/_classification.py:1245: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 due to no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))

2019_08_25_hurricane_dorian.2019
  Information Type Precision (positive class, multi-type, macro): 0.17686498410344328
  Information Type Recall (positive class, multi-type, macro): 0.46117097501206034
  Information Type F1 (positive class, multi-type, macro): 0.19742631714561665
  Information Type Accuracy (overall, multi-type, macro): 0.7831648715824361

2019_10_10_saddleridge_wildfire.2019
  Information Type Precision (positive class, multi-type, macro): 0.24756125699426482
  Information Type Recall (positive class, multi-type, macro): 0.5770035825357456
  Information Type F1 (positive class, multi-type, macro): 0.2771912999875405
  Information Type Accuracy (overall, multi-type, macro): 0.86521217808362

2019_10_25_kincade_wildfire.2019
  Information Type Precision (positive class, multi-type, macro): 0.21151709779177946
  Information Type Recall (positive class, multi-type, macro): 0.5622836913273612
  Information Type F1 (positive class, multi-type, macro): 0.2634782608271622
  Information Type Accuracy (overall, multi-type, macro): 0.8355860554488059

2019_durham_gas_explosion
  Information Type Precision (positive class, multi-type, macro): 0.22687807218142872
  Information Type Recall (positive class, multi-type, macro): 0.4673077197690041
  Information Type F1 (positive class, multi-type, macro): 0.2656073318349859
  Information Type Accuracy (overall, multi-type, macro): 0.8395776566757491

2019_saugus_high_school_shooting
  Information Type Precision (positive class, multi-type, macro): 0.2182932122142716
  Information Type Recall (positive class, multi-type, macro): 0.23034182842435993
  Information Type F1 (positive class, multi-type, macro): 0.19072532844146953
  Information Type Accuracy (overall, multi-type, macro): 0.8992222129296649

2019_townsville_flood
  Information Type Precision (positive class, multi-type, macro): 0.2297671507548774
  Information Type Recall (positive class, multi-type, macro): 0.551911862262854
  Information Type F1 (positive class, multi-type, macro): 0.2678095728416979
  Information Type Accuracy (overall, multi-type, macro): 0.7838111478205266

2020_easter_tornado_outbreak
  Information Type Precision (positive class, multi-type, macro): 0.14593035703644255
  Information Type Recall (positive class, multi-type, macro): 0.5668682728409737
  Information Type F1 (positive class, multi-type, macro): 0.17415554387368212
  Information Type Accuracy (overall, multi-type, macro): 0.7972553973357833

2020_tornado_outbreak_of_april
  Information Type Precision (positive class, multi-type, macro): 0.2101013247748719
  Information Type Recall (positive class, multi-type, macro): 0.5188508659075042
  Information Type F1 (positive class, multi-type, macro): 0.24615600845455557
  Information Type Accuracy (overall, multi-type, macro): 0.809424144986846

2020_tornado_outbreak_of_march
  Information Type Precision (positive class, multi-type, macro): 0.17140154086538936
  Information Type Recall (positive class, multi-type, macro): 0.6018108816420129
  Information Type F1 (positive class, multi-type, macro): 0.21637268023200062
  Information Type Accuracy (overall, multi-type, macro): 0.7965002382591201

2020_visakhapatnam_gas_leak
  Information Type Precision (positive class, multi-type, macro): 0.2426888137748399
  Information Type Recall (positive class, multi-type, macro): 0.25247231341418735
  Information Type F1 (positive class, multi-type, macro): 0.16773433121987766
  Information Type Accuracy (overall, multi-type, macro): 0.8147957568081062

tornado_outbreak_of_november_30_december_2018
  Information Type Precision (positive class, multi-type, macro): 0.21481443662908753
  Information Type Recall (positive class, multi-type, macro): 0.6002203568019902
  Information Type F1 (positive class, multi-type, macro): 0.2595363885974751
  Information Type Accuracy (overall, multi-type, macro): 0.8713050811722539

1


# --------------------------------------------------
# TREC-IS 2021-B
# Information Type Categorization
# Per Event F1 Graph
# --------------------------------------------------
# Multi-type (1 vs All): Tweets have multiple information types, aim: predict all of them
# Macro average (categories have equal weight)

# Build one bar per event: the macro-averaged (equal weight per information
# type) binary F1 of the run's category predictions for that event.
N = len(eventIdentifiers)
ind = np.arange(N)

scoresPerEventF1 = []
for eventId in eventIdentifiers:
    avgF1_ = 0.0

    for categoryId in informationTypes2Index.keys():
        # zero_division=0 keeps the score at 0.0 when an event has neither
        # true nor predicted samples for a category (the value sklearn already
        # falls back to) without emitting one UndefinedMetricWarning per pair.
        avgF1_ = avgF1_ + f1_score(
            event2groundtruth[eventId].get(categoryId),
            event2prediction[eventId].get(categoryId),
            average='binary',
            zero_division=0,
        )

    # Macro average: every information type carries the same weight
    scoresPerEventF1.append(avgF1_/len(informationTypes2Index))

width = 0.90       # the width of the bars: can also be len(x) sequence

p1 = plt.bar(ind, scoresPerEventF1, width)

plt.ylabel('F1 Scores')
plt.title('F1 Category Scores by Event')
plt.xticks(ind, eventIdentifiers, rotation='vertical')
plt.yticks(np.arange(0, 1, 0.1))

plt.show()

/Users/cbuntain/Development/anaconda3/lib/python3.8/site-packages/sklearn/metrics/_classification.py:1492: UndefinedMetricWarning: F-score is ill-defined and being set to 0.0 due to no true nor predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(
/Users/cbuntain/Development/anaconda3/lib/python3.8/site-packages/sklearn/metrics/_classification.py:1492: UndefinedMetricWarning: F-score is ill-defined and being set to 0.0 due to no true nor predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(
/Users/cbuntain/Development/anaconda3/lib/python3.8/site-packages/sklearn/metrics/_classification.py:1492: UndefinedMetricWarning: F-score is ill-defined and being set to 0.0 due to no true nor predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(
/Users/cbuntain/Development/anaconda3/lib/python3.8/site-packages/sklearn/metrics/_classification.py:1492: UndefinedMetricWarning: F-score is ill-defined and being set to 0.0 due to no true nor predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(
/Users/cbuntain/Development/anaconda3/lib/python3.8/site-packages/sklearn/metrics/_classification.py:1492: UndefinedMetricWarning: F-score is ill-defined and being set to 0.0 due to no true nor predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(
/Users/cbuntain/Development/anaconda3/lib/python3.8/site-packages/sklearn/metrics/_classification.py:1492: UndefinedMetricWarning: F-score is ill-defined and being set to 0.0 due to no true nor predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(
/Users/cbuntain/Development/anaconda3/lib/python3.8/site-packages/sklearn/metrics/_classification.py:1492: UndefinedMetricWarning: F-score is ill-defined and being set to 0.0 due to no true nor predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(
/Users/cbuntain/Development/anaconda3/lib/python3.8/site-packages/sklearn/metrics/_classification.py:1492: UndefinedMetricWarning: F-score is ill-defined and being set to 0.0 due to no true nor predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(
/Users/cbuntain/Development/anaconda3/lib/python3.8/site-packages/sklearn/metrics/_classification.py:1492: UndefinedMetricWarning: F-score is ill-defined and being set to 0.0 due to no true nor predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(
/Users/cbuntain/Development/anaconda3/lib/python3.8/site-packages/sklearn/metrics/_classification.py:1492: UndefinedMetricWarning: F-score is ill-defined and being set to 0.0 due to no true nor predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(
/Users/cbuntain/Development/anaconda3/lib/python3.8/site-packages/sklearn/metrics/_classification.py:1492: UndefinedMetricWarning: F-score is ill-defined and being set to 0.0 due to no true nor predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(
/Users/cbuntain/Development/anaconda3/lib/python3.8/site-packages/sklearn/metrics/_classification.py:1492: UndefinedMetricWarning: F-score is ill-defined and being set to 0.0 due to no true nor predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(
/Users/cbuntain/Development/anaconda3/lib/python3.8/site-packages/sklearn/metrics/_classification.py:1492: UndefinedMetricWarning: F-score is ill-defined and being set to 0.0 due to no true nor predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(
/Users/cbuntain/Development/anaconda3/lib/python3.8/site-packages/sklearn/metrics/_classification.py:1492: UndefinedMetricWarning: F-score is ill-defined and being set to 0.0 due to no true nor predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(
/Users/cbuntain/Development/anaconda3/lib/python3.8/site-packages/sklearn/metrics/_classification.py:1492: UndefinedMetricWarning: F-score is ill-defined and being set to 0.0 due to no true nor predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(
/Users/cbuntain/Development/anaconda3/lib/python3.8/site-packages/sklearn/metrics/_classification.py:1492: UndefinedMetricWarning: F-score is ill-defined and being set to 0.0 due to no true nor predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(


# --------------------------------------------------
# TREC-IS 2021-B
# Information Priority Level
# Overall Performance
# --------------------------------------------------
# How divergent is the system from the human priority labels?
# F1 performance over information types, higher is better
# Macro average (categories have equal weight)

from sklearn.metrics import mean_squared_error

# Running sums of the per-category macro F1 between the human priority labels
# and the predicted priority labels: overall, and split by whether the
# category matches an entry of highImportCategories.
priorityAvgf1 = 0.0
priorityAvgf1High = 0.0
priorityAvgf1Low = 0.0
for categoryId in informationTypes2Index.keys():
    groundTruthPriorities = category2GroundTruthPriority[categoryId]
    predictedPriorities = category2PredictedPriority[categoryId]

    f1 = f1_score(groundTruthPriorities, predictedPriorities, average='macro')
    priorityAvgf1 += f1

    # Substring match against the high-importance category names
    matchesHighImportance = any(categoryId in s for s in highImportCategories)
    if matchesHighImportance:
        priorityAvgf1High += f1
    else:
        priorityAvgf1Low += f1

# Macro average over all information types
print("Priority Label Prediction (F1, macro): "+str(priorityAvgf1/len(informationTypes2Index)))

separatorLine = "--------------------------------------------------"+"\n"
resultsFile.writelines([
    separatorLine,
    "EVALUATON: Information Priority Level"+"\n",
    "Overall Performance"+"\n",
    separatorLine,
    "> Priority Label Prediction (F1, macro): "+str(priorityAvgf1/len(informationTypes2Index))+"\n",
])
resultsFile.write("\n")

Priority Label Prediction (F1, macro): 0.17120951398373774

1


# --------------------------------------------------
# TREC-IS 2021-B
# Information Priority Level
# Overall Performance
# --------------------------------------------------
# How divergent is the system from the human priority labels?
# Use Pearson correlation here to capture parallel increases

# Sum the per-category Pearson correlation between the human priority scores
# (labels mapped through priorityScoreMap) and the run's predicted priority
# scores: overall, and split by high-importance vs. remaining categories.
priorityAvgCorr = 0.0
priorityAvgCorrHigh = 0.0
priorityAvgCorrLow = 0.0
for categoryId in informationTypes2Index.keys():
    # Other-Irrelevant is left out of the correlation sums, which is why the
    # overall denominator below is reduced by one.
    if categoryId == "Other-Irrelevant":
        continue

    groundTruthPriorities = [priorityScoreMap[x] for x in category2GroundTruthPriority[categoryId]]
    predictedPriorities = category2PredictedPriorityScore[categoryId]

    gtScores = np.asarray(groundTruthPriorities, dtype=float)
    predScores = np.asarray(predictedPriorities, dtype=float)

    # Pearson correlation is undefined when either series has no variation
    # (or fewer than two samples); such categories contribute 0.0.
    # The previous guard compared the mean deviation-from-the-mean with 0.0,
    # which is zero for *any* input up to rounding noise, so it did not
    # actually detect constant predictions. Comparing min and max is exact.
    this_corr = 0.0
    if (gtScores.size > 1 and predScores.size > 1
            and gtScores.min() != gtScores.max()
            and predScores.min() != predScores.max()):
        this_corr = np.corrcoef(gtScores, predScores)[0,1]
        # A NaN here (e.g. NaN scores in the input) would poison every average
        if np.isnan(this_corr):
            this_corr = 0.0
    priorityAvgCorr = priorityAvgCorr + this_corr

    if any(categoryId in s for s in highImportCategories):
        priorityAvgCorrHigh = priorityAvgCorrHigh + this_corr
    else:
        priorityAvgCorrLow = priorityAvgCorrLow + this_corr

# Averages are computed once so the printed and the written values cannot drift.
# NOTE(review): the "-1" on the low group presumably accounts for the skipped
# Other-Irrelevant category being counted in numLowInformationTypes - confirm.
avgCorrAll = priorityAvgCorr/(len(informationTypes2Index)-1)
avgCorrHigh = priorityAvgCorrHigh/numHighInformationTypes
avgCorrLow = priorityAvgCorrLow/(numLowInformationTypes-1)

print("Priority Score Prediction (Pearson): "+str(avgCorrAll))
print("Priority Score Prediction, High (Pearson): "+str(avgCorrHigh))
print("Priority Score Prediction, Low (Pearson): "+str(avgCorrLow))


resultsFile.write("--------------------------------------------------"+"\n")
resultsFile.write("EVALUATON: Information Priority Score"+"\n")
resultsFile.write("Correlational Performance"+"\n")
resultsFile.write("--------------------------------------------------"+"\n")
resultsFile.write("> Priority Correlation (Pearson): "+str(avgCorrAll)+"\n")
resultsFile.write("> Priority Correlation, High (Pearson): "+str(avgCorrHigh)+"\n")
resultsFile.write("> Priority Correlation, Low (Pearson): "+str(avgCorrLow)+"\n")
resultsFile.write("\n")

Priority Score Prediction (Pearson): 0.1475708306012338
Priority Score Prediction, High (Pearson): 0.10963466884896583
Priority Score Prediction, Low (Pearson): 0.16021621785198978

1


# --------------------------------------------------
# TREC-IS 2021-B
# Information Priority Level
# Per Information Type Performance
# --------------------------------------------------
# F1 per information type (macro averaged), higher is better
# Macro average (categories have equal weight)

# Bar chart of the priority-label macro F1, one bar per information type.
categoryLabels = []
priorityCatF1Values = []
for categoryId in informationTypes2Index.keys():
    categoryLabels.append(categoryId)

    groundTruthPriorities = category2GroundTruthPriority[categoryId]
    predictedPriorities = category2PredictedPriority[categoryId]

    priorityCatF1 = f1_score(groundTruthPriorities, predictedPriorities, average='macro')
    # An undefined (NaN) score is plotted as a zero-height bar
    if math.isnan(priorityCatF1):
        priorityCatF1 = 0.0
    priorityCatF1Values.append(priorityCatF1)

# One x position per information type
N = len(informationTypes2Index)
ind = np.arange(N)

width = 0.90       # the width of the bars: can also be len(x) sequence

p1 = plt.bar(ind, priorityCatF1Values, width)

plt.title('Priorty Label Prediction F1 Per Information Type')
plt.ylabel('Priorty Label Prediction F1 (higher is better)')
plt.xticks(ind, categoryLabels, rotation='vertical')
plt.yticks(np.arange(0, 1, 0.1))

plt.show()


resultLine = None

# Print the evaluation table row in latex
# NOTE(review): the "PErr-H"/"PErr-A" column labels are kept as-is, but the
# values placed in those columns below are priority-label F1 averages.
print("Run & NDCG & CF1-H & CF1-A & CAcc & PErr-H & PErr-A & PCorr-H & PCorr-A \\\\")

# Column values, in the same order as the header printed above
latexColumnValues = [
    system_ndcg_micro,
    avgF1High/numHighInformationTypes,
    avgF1/numInformationTypes,
    avgAccuracy/numInformationTypes,
    priorityAvgf1High/numHighInformationTypes,
    priorityAvgf1/len(informationTypes2Index),
    priorityAvgCorrHigh/numHighInformationTypes,
    # The correlation sum skips Other-Irrelevant, so it is averaged over one
    # category fewer - the same denominator used when the overall Pearson
    # score is printed and written to the results file.
    priorityAvgCorr/(len(informationTypes2Index)-1),
]

# Four decimals per column, LaTeX column separators, LaTeX row terminator
resultLine = " & ".join('{0:.4f}'.format(value) for value in latexColumnValues) + " \\\\"

print(runName+" & "+resultLine)

resultsFile.write("--------------------------------------------------"+"\n")
resultsFile.write("LATEX"+"\n")
resultsFile.write("--------------------------------------------------"+"\n")
resultsFile.write(runName+" & "+resultLine + "\n")

Run & NDCG & CF1-H & CF1-A & CAcc & PErr-H & PErr-A & PCorr-H & PCorr-A \\
STrans-GaussianNB & 0.3368 & 0.2083 & 0.2575 & 0.8474 & 0.1959 & 0.1712 & 0.1096 & 0.1417 \\

93


# Done: close the overall results, per-topic and per-event output files
for openOutputFile in (resultsFile, perTopicFile, perEventFile):
    openOutputFile.close()


# header = [
#     "Run",
#     "date",
#     "team",
#     "description",
#     "paper",
#     "code",
#     "nDCG@100",
#     "Info-Type F1 [Actionable]",
#     "Info-Type F1 [All]",
#     "Info-Type Accuracy",
#     "Priority F1 [Actionable]",
#     "Priority F1 [All]",
#     "Priority R [Actionable]",
#     "Priority R [All]",
# ]

import csv
# Write a one-row leaderboard CSV for this run, but only when the submission
# ships a metadata.json in the current working directory.
if os.path.isfile("metadata.json"):
    # The submission date is read from the working directory path: the text
    # between "submissions/" and the next "-", split as YYYY/MM/DD.
    # If the path has no "submissions/" segment the date comes out as "//".
    this_cwd = os.getcwd()
    sub_date_ = this_cwd.partition("submissions/")[-1].partition("-")[0]
    sub_date = "%s/%s/%s" % (sub_date_[:4], sub_date_[4:6], sub_date_[6:])

    # JSON is UTF-8 by specification; do not depend on the locale default
    with open("metadata.json", "r", encoding="utf-8") as in_file:
        metadata = json.load(in_file)

    # Paper/code links are optional: a missing or null entry is treated like
    # a non-link value and left blank instead of aborting the evaluation.
    paper_link = str(metadata.get("paper") or "")
    code_link = str(metadata.get("code") or "")

    # Column order follows the commented-out header list above
    leaderboard_entry = [
        runName,
        sub_date,
        metadata["organization"].lower(),
        metadata["model_description"],
        paper_link if paper_link.startswith("http") else "",
        code_link if code_link.startswith("http") else "",
        str.format('{0:.4f}',system_ndcg_micro),
        str.format('{0:.4f}',avgF1High/numHighInformationTypes),
        str.format('{0:.4f}',avgF1/numInformationTypes),
        str.format('{0:.4f}',avgAccuracy/numInformationTypes),
        str.format('{0:.4f}',priorityAvgf1High/numHighInformationTypes),
        str.format('{0:.4f}',priorityAvgf1/len(informationTypes2Index)),
        str.format('{0:.4f}',priorityAvgCorrHigh/numHighInformationTypes),
        # The correlation sum skips Other-Irrelevant, so it is averaged over
        # one category fewer, matching the overall Pearson score reported in
        # the results file.
        str.format('{0:.4f}',priorityAvgCorr/(len(informationTypes2Index)-1)),
    ]

    # newline="" lets the csv module control line endings itself; without it
    # an extra blank line is written after each row on Windows.
    with open(runName+".v"+str(version)+"."+edition+".leaderboard.csv","w",newline="") as csvResultsFile:
        leader_writer = csv.writer(csvResultsFile)
        leader_writer.writerow(leaderboard_entry)