# --------------------------------------------------
# TREC IS 2021b Evaluation Script
# Configured for 2021-B Events
# Used to evaluate TREC-IS runs
# --------------------------------------------------
version = 3.0 # Notebook Version Number
edition = "2021b.all"

import os
cwd = os.getcwd()


# Configuration Information

# Do we try and normalize the run priority scores?
enablePriorityNorm = True

# Score threshold
enableCategoryNorm = True
defaultScoreThreshold = 0.5

taskCategories = [
    "CallToAction-Donations",
    "CallToAction-MovePeople",
    "CallToAction-Volunteer",
    "Other-Advice",
    "Other-ContextualInformation",
    "Other-Discussion",
    "Other-Irrelevant",
    "Other-Sentiment",
    "Report-CleanUp",
    "Report-EmergingThreats",
    "Report-Factoid",
    "Report-FirstPartyObservation",
    "Report-Hashtags",
    "Report-Location",
    "Report-MultimediaShare",
    "Report-News",
    "Report-NewSubEvent",
    "Report-Official",
    "Report-OriginalEvent",
    "Report-ServiceAvailable",
    "Report-ThirdPartyObservation",
    "Report-Weather",
    "Request-GoodsServices",
    "Request-InformationWanted",
    "Request-SearchAndRescue",
]

# What we consider to be highly important categories of information
highImportCategories = [
    "Request-GoodsServices",
    "Request-SearchAndRescue",
    "CallToAction-MovePeople",
    "Report-EmergingThreats",
    "Report-NewSubEvent",
    "Report-ServiceAvailable"
]

highImportCategoriesShort = [
    "GoodsServices",
    "SearchAndRescue",
    "MovePeople",
    "EmergingThreats",
    "NewSubEvent",
    "ServiceAvailable"
]

# Priority map
priorityScoreMap = {
    "Critical": 1.0,
    "High": 0.75,
    "Medium": 0.5,
    "Low": 0.25,
    "Unknown": 0.25,
}

# Parameters
var_lambda = 0.75 # weight to place on actionable information categories in comparison to non actionable categoriee
var_alpha = 0.3 # Flat gain for providing a correct alert, regardless of the categories selected


# Events with no data, so we should skip them
#. Updated from 2021a and 2021b, so we use *all* data
skipEvents = [
#     '2015_09_28_hurricane_joaquin.2015',
#     '2017_03_23_cyclone_debbie.2017',
#     '2018_02_24_anticyclone_hartmut.2018',
#     '2018_07_13_ferguson_wildfire.2018',
#     '2018_07_23_cranston_wildfire.2018',
#     '2018_09_07_hurricane_florence.2018',
#     '2018_10_07_hurricane_michael.2018',
#     '2019_09_17_tropicalstorm_imelda.2019',
#     '2019_karnataka_floods',
#     '2019_spring_floods_in_ontario_quebec_and_new_brunswick',
#     '2020_01_28_bar_shooting_nc.2020',
#     '2020_02_07_rutherford_tn_floods.2020',
#     '2020_05_26_edenville_dam_failure.2020.corrected',
#     '2020_08_27_hurricane_laura.2020',
#     '2020_09_11_hurricane_sally.2020',
#     '2020_afghanistan_flood',
#     '2020_hpakant_jade_mine_disaster',
#     '2020_kerala_floods',
#     'T2020_02_03_texas_university_shooting.2020',
#     'UNASSIGNED',
#     'indonesia_earthquake.2019'
    
    "2020_05_26_edenville_dam_failure.2020.corrected",
    "2018_10_07_hurricane_michael.2018",
    "2020_01_28_bar_shooting_nc.2020",
    "T2020_02_03_texas_university_shooting.2020",
    "2020_02_07_rutherford_tn_floods.2020",
    "UNASSIGNED",
    "indonesia_earthquake.2019",
    "2015_09_28_hurricane_joaquin.2015",
    "2017_03_23_cyclone_debbie.2017",
    "2018_02_24_anticyclone_hartmut.2018",
    "2018_07_13_ferguson_wildfire.2018",
    "2018_07_23_cranston_wildfire.2018",
    "2018_09_07_hurricane_florence.2018",
    "2019_09_17_tropicalstorm_imelda.2019",
    "2019_karnataka_floods",
    "2019_spring_floods_in_ontario_quebec_and_new_brunswick",
    "2020_08_27_hurricane_laura.2020",
    "2020_09_11_hurricane_sally.2020",
    "2020_afghanistan_flood",
    "2020_hpakant_jade_mine_disaster",
    "2020_kerala_floods",
]


import glob

runFile = None
for f in glob.glob("*.gz"):
    runFile = f

print("Run File:", f)

Run File: njit_augly.run.json.gz


import gzip
import json


runName = None

with gzip.open(runFile, "r") as inRunFile:
    for line in inRunFile:
        line = line.decode("utf8")
#         runName = line.rpartition("\t")[2].strip()
        runName = json.loads(line)["runtag"]
        break

print("Run Name:", runName)

Run Name: njit_augly


# Do we try and normalize the run priority scores?
enablePriorityNorm = False

dataDir = "../../data/2021b"

# The location of the topics file
topicsFile = "%s/2021a.topics" % dataDir

# The location of the ground truth data against which to compare the run
classificationLabelFiles = [
#     "%s/TRECIS-2021A-crisis.labels.prelim.json" % dataDir,
#     "%s/TRECIS-2021A-crisis.labels.prelim.pt2.json" % dataDir,
#     "%s/TRECIS-crisis.labels.2021b.json" % dataDir,
    "%s/TRECIS-crisis.labels.2021.all.json" % dataDir,
]

# The location of the ontology file
ontologyFile = "%s/TRECIS-2021A-ITypes.json" % dataDir


topicArray = []

with open(topicsFile, "r") as inTopicsFile:
    
    topicNum = None
    topicDataset = None
    
    for line_ in inTopicsFile:
        line = line_.strip()
        
        if line == "</top>":
            if topicDataset in skipEvents:
                continue
            topicArray.append((topicDataset, topicNum))
            
        if line.startswith("<num>"):
            topicNum = line.partition("<num>")[2].partition("</num>")[0]
            
        if line.startswith("<dataset>"):
            topicDataset = line.partition("<dataset>")[2].partition("</dataset>")[0]
            
for row in topicArray:
    print(row)

('2020_01_27_houston_explosion.2020', 'TRECIS-CTIT-H-076')
('2020_02_10_mideast_tornadoes.day1_mississipi.2020', 'TRECIS-CTIT-H-080')
('2020_02_10_mideast_tornadoes.day2_al.2020', 'TRECIS-CTIT-H-081')
('2020_02_10_mideast_tornadoes.day3_md.2019', 'TRECIS-CTIT-H-082')
('2020_05_06_tn_derecho.2020', 'TRECIS-CTIT-H-083')
('brooklynblockparty_shooting.2019', 'TRECIS-CTIT-H-085')
('2016_puttingal_temple', 'TRECIS-CTIT-H-089')
('2017_12_04_thomas_wildfire.2017', 'TRECIS-CTIT-H-091')
('2017_12_07_lilac_wildfire.2017', 'TRECIS-CTIT-H-092')
('2018_07_23_klamathon_wildfire.2018', 'TRECIS-CTIT-H-096')
('2018_08_05_holy_wildfire.2018', 'TRECIS-CTIT-H-097')
('2018_11_07_Woolsey_wildfire.2018', 'TRECIS-CTIT-H-100')
('2018_maryland_flood', 'TRECIS-CTIT-H-101')
('2018_pittsburgh_synagogue_shooting', 'TRECIS-CTIT-H-102')
('2019_03_01_alberta_wildfire.2019.v2', 'TRECIS-CTIT-H-103')
('2019_08_25_hurricane_dorian.2019', 'TRECIS-CTIT-H-104')
('2019_10_10_saddleridge_wildfire.2019', 'TRECIS-CTIT-H-106')
('2019_10_25_kincade_wildfire.2019', 'TRECIS-CTIT-H-107')
('2019_durham_gas_explosion', 'TRECIS-CTIT-H-108')
('2019_saugus_high_school_shooting', 'TRECIS-CTIT-H-110')
('2019_townsville_flood', 'TRECIS-CTIT-H-112')
('2020_easter_tornado_outbreak', 'TRECIS-CTIT-H-116')
('2020_tornado_outbreak_of_april', 'TRECIS-CTIT-H-119')
('2020_tornado_outbreak_of_march', 'TRECIS-CTIT-H-120')
('2020_visakhapatnam_gas_leak', 'TRECIS-CTIT-H-121')
('tornado_outbreak_of_november_30_december_2018', 'TRECIS-CTIT-H-122')


# --------------------------------------------------
# Static data for the 2021 edition
# --------------------------------------------------
# Identifiers for the test events
eventidTopicidMap = dict(topicArray)
eventIdentifiers = list(eventidTopicidMap.keys())

resultsFile = open(runName+".results.v"+str(version)+"."+edition+".overall.txt","w+")
resultsFile.write("TREC-IS "+edition+" Notebook Evaluator v"+str(version)+"\n")
resultsFile.write("Run: "+runName+" ("+runFile+")"+"\n")
resultsFile.write(""+"\n")

perTopicFile = open(runName+".results.v"+str(version)+"."+edition+".pertopic.txt","w+")
perTopicFile.write("TREC-IS "+edition+" Notebook Evaluator v"+str(version)+"\n")
perTopicFile.write("Run: "+runName+" ("+runFile+")"+"\n")
perTopicFile.write(""+"\n")

perEventFile = open(runName+".results.v"+str(version)+"."+edition+".perevent.txt","w+")
perEventFile.write("TREC-IS "+edition+" Notebook Evaluator v"+str(version)+"\n")
perEventFile.write("Run: "+runName+" ("+runFile+")"+"\n")
perEventFile.write(""+"\n")

1


# --------------------------------------------------
# Processing Starts Here
# --------------------------------------------------
import json
import gzip
import math
import numpy as np
from pprint import pprint
import matplotlib.pyplot as plt

# --------------------------------------------------
# Stage 1: Load the ground truth dataset 
# --------------------------------------------------

groundtruthJSON = []
for groundtruthFile in classificationLabelFiles:
    print("Reading "+groundtruthFile)
    with open(groundtruthFile, encoding='iso-8859-1') as groundtruthJSONFile:    
        groundtruthJSON.append(json.load(groundtruthJSONFile))
#pprint(groundtruthJSON["events"])

# --------------------------------------------------
# Stage 2: Load run file 
# --------------------------------------------------
with gzip.open(runFile, "r") as openRunFile:
#     runContents = [line.decode("utf8") for line in openRunFile.readlines()] # lines not yet decoded
    runContents = [json.loads(line.decode("utf8")) for line in openRunFile.readlines()] # lines not yet decoded
#pprint(runContents[0])

Reading ../../data/2021b/TRECIS-crisis.labels.2021.all.json


# --------------------------------------------------
# Stage 3: Load the categories 
# --------------------------------------------------
with open(ontologyFile, encoding='utf-8') as ontologyJSONFile:    
    ontologyJSON = json.load(ontologyJSONFile)

informationTypes2Index = {} # category -> numerical index
informationTypesShort2Index = {} # category short form (e.g. Report-EmergingThreats vs. EmergingThreats) -> numerical index

for informationTypeJSON in ontologyJSON["informationTypes"]:
    informationTypeId = informationTypeJSON["id"]
    
    informationTypeIndex = taskCategories.index(informationTypeId)
    informationTypes2Index[informationTypeId] = informationTypeIndex
    informationTypesShort2Index[informationTypeId.split("-")[1]] = informationTypeIndex


# -----------------------------------------------------------
# Stage 4: Produce ground truth maps between tweetIds and categories
# -----------------------------------------------------------
# Notes: Ground truth is used as a base, if a run includes tweets
#        not in the ground truth they will be ignored
# Assumptions: A tweet will not be returned for multiple events

tweetId2TRECInfoCategories = {} # tweet id -> Array of categories selected by assessors
tweetId2TRECHighImportInfoCategories = {} # tweet id -> Array of categories selected by assessors
tweetId2TRECLowImportInfoCategories = {} # tweet id -> Array of categories selected by assessors
tweetId2TRECPriorityCategory = {} # tweet id -> priority label (Critical,High,Medium,Low)
index2TweetId = {} # ordered tweets
event2tweetIds = {} # event -> tweet ids for tweets within that event
countHighCriticalImport = 0
countLowMediumImport = 0
tweetsSeen = []


invertedPriorityScoreMap = {
    v:k for k,v in priorityScoreMap.items()
}

tweetIndex = 0
for groundtruth in groundtruthJSON:
    for eventJSON in groundtruth["events"]:
        eventid = eventJSON["eventid"]
        print(eventid)
        
        if eventid in skipEvents:
            continue
        
        if not event2tweetIds.get(eventid):
            event2tweetIds[eventid] = []
        
        if any(eventid in s for s in eventIdentifiers):
            # iterate over tweets in the event
            for tweetJSON in eventJSON["tweets"]:
                tweetid = tweetJSON["postID"]
                categories = tweetJSON["postCategories"]
                priority = tweetJSON["postPriority"]
                
                if priority == "High" or priority == "Critical":
                    countHighCriticalImport = countHighCriticalImport + 1
                
                if priority == "Low" or priority == "Medium":
                    countLowMediumImport = countLowMediumImport + 1
                
                # check categories for name issues and correct if possible
                cleanedCategories = []
                highImportCats = []
                lowImportCats = []
                for categoryId in categories:
                    if not any(categoryId in s for s in informationTypesShort2Index.keys()):
#                         print("Found unknown category in ground truth "+categoryId+", ignoring...")
                        pass
                    else:
                        cleanedCategories.append(categoryId)
                        if any(categoryId in s for s in highImportCategoriesShort):
                            highImportCats.append(categoryId)
                        else:
                            lowImportCats.append(categoryId)
    
                if tweetid not in tweetsSeen:
                    event2tweetIds[eventid].append(tweetid)
                    tweetId2TRECInfoCategories[tweetid] = cleanedCategories
                    tweetId2TRECHighImportInfoCategories[tweetid] = highImportCats
                    tweetId2TRECLowImportInfoCategories[tweetid] = lowImportCats
                    tweetId2TRECPriorityCategory[tweetid] = priority
                    index2TweetId[tweetIndex] = tweetid;
                    tweetIndex = tweetIndex + 1
                    tweetsSeen.append(tweetid)

                else:
                    tweetId2TRECInfoCategories[tweetid] = list(set(
                        cleanedCategories + tweetId2TRECInfoCategories[tweetid]
                    ))
                    
                    prePriorityScore = priorityScoreMap[tweetId2TRECPriorityCategory[tweetid]]
                    thisPriorityScore = priorityScoreMap[priority]
                    
                    tweetId2TRECPriorityCategory[tweetid] = invertedPriorityScoreMap[
                        max(prePriorityScore, thisPriorityScore)
                    ]

                
        else:
            print("WARN: Found ground truth data for event not in the topic set "+eventid+", ignoring...")

2020_01_27_houston_explosion.2020
2020_01_28_bar_shooting_nc.2020
T2020_02_03_texas_university_shooting.2020
2020_02_07_rutherford_tn_floods.2020
2020_02_10_mideast_tornadoes.day1_mississipi.2020
2020_02_10_mideast_tornadoes.day2_al.2020
2020_02_10_mideast_tornadoes.day3_md.2019
2020_05_06_tn_derecho.2020
2020_05_26_edenville_dam_failure.2020.corrected
brooklynblockparty_shooting.2019
UNASSIGNED
indonesia_earthquake.2019
2015_09_28_hurricane_joaquin.2015
2016_puttingal_temple
2017_03_23_cyclone_debbie.2017
2017_12_04_thomas_wildfire.2017
2017_12_07_lilac_wildfire.2017
2018_02_24_anticyclone_hartmut.2018
2018_07_13_ferguson_wildfire.2018
2018_07_23_cranston_wildfire.2018
2018_07_23_klamathon_wildfire.2018
2018_08_05_holy_wildfire.2018
2018_09_07_hurricane_florence.2018
2018_10_07_hurricane_michael.2018
2018_11_07_Woolsey_wildfire.2018
2018_maryland_flood
2018_pittsburgh_synagogue_shooting
2019_03_01_alberta_wildfire.2019.v2
2019_08_25_hurricane_dorian.2019
2019_09_17_tropicalstorm_imelda.2019
2019_10_10_saddleridge_wildfire.2019
2019_10_25_kincade_wildfire.2019
2019_durham_gas_explosion
2019_karnataka_floods
2019_saugus_high_school_shooting
2019_spring_floods_in_ontario_quebec_and_new_brunswick
2019_townsville_flood
2020_08_27_hurricane_laura.2020
2020_09_11_hurricane_sally.2020
2020_afghanistan_flood
2020_easter_tornado_outbreak
2020_hpakant_jade_mine_disaster
2020_kerala_floods
2020_tornado_outbreak_of_april
2020_tornado_outbreak_of_march
2020_visakhapatnam_gas_leak
tornado_outbreak_of_november_30_december_2018


# -----------------------------------------------------------
# Stage 5: Produce run predicted maps between tweetIds and categories
# -----------------------------------------------------------
tweetId2RunInfoCategories = {} # tweet id -> predicted category by participant system
tweetId2RunHighImportInfoCategories = {} # tweet id -> predicted category by participant system
tweetId2RunLowImportInfoCategories = {} # tweet id -> predicted category by participant system
tweetId2RunInfoCategoriesProb = {} # tweet id -> predicted category probability by participant system
tweetId2RunInfoCategoriesProbNorm = {} # tweet id -> predicted category probability by participant system
tweetId2RunPriorityScore = {} # tweet id -> importance score from participant system
tweetId2RunPriorityCategory = {} # tweet id -> importance category (Critical, High, Medium Low)
tweetId2RunPriorityScoreNorm = {} # tweet id -> importance score from participant system
event2TweetIdRank = {} # event -> (rank,tweetid)

maxPrediction = -999999
minPrediction = 999999
maxCategory = -999999
minCategory = 999999

for predictionParts in runContents:
    
    #print(runLine)
    if (len(predictionParts)<6 ):
        print(runLine)
        continue
    else:
        eventId = predictionParts["topic"]
        
        if eventId in skipEvents:
            continue
        
        tweetId = predictionParts["tweet_id"]
        rank = 0
        #print(predictionParts[5])

        category_scores = predictionParts["info_type_scores"]
        category_labels = predictionParts["info_type_labels"]

        priority = float(predictionParts["priority"])
        
        if priority > maxPrediction:
            maxPrediction = priority
        if priority < minPrediction:
            minPrediction = priority
        
        cleanedCategories = []
        cleanedCategoriesProbs = []
        highImportCats = []
        lowImportCats = []
        
        # Handle category flags
        for catIndex, categoryLabel in enumerate(category_labels):
            # check if we have a binary flag for this label
            if categoryLabel == 0:
                # False flag, so skip
                continue
                
            categoryId = taskCategories[catIndex]
            
            if not any(categoryId in s for s in informationTypes2Index.keys()):
                print("Found unknown category in run "+categoryId+", ignoring...")
            else:
                cleanedCategories.append(categoryId)
                if any(categoryId in s for s in highImportCategories):
                    highImportCats.append(categoryId)
                else:
                    lowImportCats.append(categoryId)
                    
        # Process category probabilities
        for categoryProbability in category_scores:
            
            if categoryProbability > maxCategory:
                maxCategory = categoryProbability
            if categoryProbability < minCategory:
                minCategory = categoryProbability
            
            cleanedCategoriesProbs.append(categoryProbability)
                
        tweetId2RunHighImportInfoCategories[tweetId] = highImportCats
        tweetId2RunLowImportInfoCategories[tweetId] = lowImportCats
        tweetId2RunInfoCategories[tweetId] = cleanedCategories
        tweetId2RunInfoCategoriesProb[tweetId] = cleanedCategoriesProbs
        tweetId2RunPriorityScore[tweetId] = priority
        
        if priority > priorityScoreMap["High"]:
            tweetId2RunPriorityCategory[tweetId] = "Critical"
        elif priority > priorityScoreMap["Medium"]:
            tweetId2RunPriorityCategory[tweetId] = "High"
        elif priority > priorityScoreMap["Low"]:
            tweetId2RunPriorityCategory[tweetId] = "Medium"
        else:
            tweetId2RunPriorityCategory[tweetId] = "Low"
        
        if not event2TweetIdRank.get(eventId):
            event2TweetIdRank[eventId] = []
        rankTuple = (tweetId,rank)
        event2TweetIdRank.get(eventId).append(rankTuple)


for eventId in event2TweetIdRank.keys():
    tweetsSorted = sorted(event2TweetIdRank.get(eventId), key=lambda tup: tup[1])
    event2TweetIdRank[eventId] = tweetsSorted
    
for i in range(len(index2TweetId)):
    tweetId = index2TweetId[i]
    if tweetId2RunPriorityScore.get(tweetId):
        
        if enablePriorityNorm:
            if (minPrediction-minPrediction) == 0.0:
                tweetId2RunPriorityScoreNorm[tweetId] = 0.0
            else:
                tweetId2RunPriorityScoreNorm[tweetId] = (tweetId2RunPriorityScore.get(tweetId)-minPrediction)/(maxPrediction-minPrediction)
        else:
            tweetId2RunPriorityScoreNorm[tweetId] = tweetId2RunPriorityScore.get(tweetId)
    else:
        tweetId2RunPriorityScoreNorm[tweetId] = 0.0


# --------------------------------------------------
# Stage 6: Create ground truth vectors per category
# --------------------------------------------------

category2GroundTruth = {} # category -> tweet vector with binary 1 vs all ground truth category labels

for categoryId in informationTypes2Index.keys():
    categoryIdShort = categoryId.split("-")[1]
    categoryVector = []
    for i in range(len(index2TweetId)):
        tweetId = index2TweetId[i]
        categories = tweetId2TRECInfoCategories.get(tweetId)
        #pprint(categories)
        if any(categoryIdShort in s for s in categories):
            categoryVector.append(1)
        else:
            categoryVector.append(0)
    category2GroundTruth[categoryId] = categoryVector
            
#pprint(category2GroundTruth)


# --------------------------------------------------
# Stage 7: Create run vectors per category 
# --------------------------------------------------
# Assumptions: If run misses a tweet, we assume it has
#              no categories
category2Predicted = {} # category -> tweet vector with binary 1 vs all predicted by system labels

for categoryId in informationTypes2Index.keys():
    categoryIdShort = categoryId.split("-")[1]
    categoryVector = []
    for i in range(len(index2TweetId)):
        tweetId = index2TweetId[i]
        
        if tweetId2RunInfoCategories.get(tweetId):
            categories = tweetId2RunInfoCategories.get(tweetId)
            if any(categoryIdShort in s for s in categories):
                categoryVector.append(1)
            else:
                categoryVector.append(0)
        else:
            categoryVector.append(0)

    category2Predicted[categoryId] = categoryVector

#pprint(category2Predicted)


# --------------------------------------------------
# Stage 8: Make event category vectors 
# --------------------------------------------------

event2groundtruth = {} # event -> category -> tweet vector with binary 1 vs all ground truth category labels
for eventId in eventIdentifiers:
    eventCategories = {}
    for categoryId in informationTypes2Index.keys():
        categoryIdShort = categoryId.split("-")[1]
        categoryVector = []
#         print(eventId)
        for tweetId in event2tweetIds.get(eventId):
#             print(tweetId)
            categories = tweetId2TRECInfoCategories.get(tweetId)
            if any(categoryIdShort in s for s in categories):
                categoryVector.append(1)
            else:
                categoryVector.append(0)
            
        eventCategories[categoryId] = categoryVector
    event2groundtruth[eventId] = eventCategories
    

event2prediction = {} # event -> category -> tweet vector with binary 1 vs all predicted by system labels
for eventId in eventIdentifiers:
    print(eventId)
    eventCategories = {}
    for categoryId in informationTypes2Index.keys():
        categoryIdShort = categoryId.split("-")[1]
        categoryVector = []
#         print(tweetId)
        for tweetId in event2tweetIds.get(eventId):
            #print(tweetId)
            categories = tweetId2RunInfoCategories.get(tweetId)
            
            if categories == None:
                categories = json.loads("[]")
                tweetId2RunInfoCategories[tweetId] = categories
            
            if any(categoryId in s for s in categories):
                categoryVector.append(1)
            else:
                categoryVector.append(0)
            
        eventCategories[categoryId] = categoryVector
    event2prediction[eventId] = eventCategories

2020_01_27_houston_explosion.2020
2020_02_10_mideast_tornadoes.day1_mississipi.2020
2020_02_10_mideast_tornadoes.day2_al.2020
2020_02_10_mideast_tornadoes.day3_md.2019
2020_05_06_tn_derecho.2020
brooklynblockparty_shooting.2019
2016_puttingal_temple
2017_12_04_thomas_wildfire.2017
2017_12_07_lilac_wildfire.2017
2018_07_23_klamathon_wildfire.2018
2018_08_05_holy_wildfire.2018
2018_11_07_Woolsey_wildfire.2018
2018_maryland_flood
2018_pittsburgh_synagogue_shooting
2019_03_01_alberta_wildfire.2019.v2
2019_08_25_hurricane_dorian.2019
2019_10_10_saddleridge_wildfire.2019
2019_10_25_kincade_wildfire.2019
2019_durham_gas_explosion
2019_saugus_high_school_shooting
2019_townsville_flood
2020_easter_tornado_outbreak
2020_tornado_outbreak_of_april
2020_tornado_outbreak_of_march
2020_visakhapatnam_gas_leak
tornado_outbreak_of_november_30_december_2018


# -----------------------------------------------------------
# Stage 9: Make priority classification vectors
# -----------------------------------------------------------

category2GroundTruthPriority = {} # category -> tweet vector with binary 1 vs all ground truth priority labels

for categoryId in informationTypes2Index.keys():
    categoryIdShort = categoryId.split("-")[1]
    priorityVector = []
    for i in range(len(index2TweetId)):
        tweetId = index2TweetId[i]
        categories = tweetId2TRECInfoCategories.get(tweetId)
        if any(categoryIdShort in s for s in categories):
            priority = tweetId2TRECPriorityCategory.get(tweetId)
            priorityVector.append(priority)
    category2GroundTruthPriority[categoryId] = priorityVector

category2PredictedPriority = {} # category -> tweet vector with binary 1 vs all predicted by system labels
category2PredictedPriorityScore = {} # Category -> tweet vector with priority scores

for categoryId in informationTypes2Index.keys():
    categoryIdShort = categoryId.split("-")[1]
    categoryVector = []
    categoryScoreVector = []
    
    for i in range(len(index2TweetId)):
        tweetId = index2TweetId[i]
        categories = tweetId2TRECInfoCategories.get(tweetId)
        if any(categoryIdShort in s for s in categories):
            if tweetId2RunPriorityCategory.get(tweetId):
                priority = tweetId2RunPriorityCategory.get(tweetId)
                priorityScore = tweetId2RunPriorityScore.get(tweetId)
            
                categoryVector.append(priority)
                categoryScoreVector.append(priorityScore)
            else:
                categoryVector.append("Low") # default to low priority
                categoryScoreVector.append(0.25)

    category2PredictedPriority[categoryId] = categoryVector
    category2PredictedPriorityScore[categoryId] = categoryScoreVector


# --------------------------------------------------
# Disable Warnings (comment this out when debugging!)
# --------------------------------------------------
import warnings
# warnings.filterwarnings("ignore") # ignore warnings about 0-score categories


# --------------------------------------------------
# TREC-IS 2021A
# Priority-Centric Discounted Cumulative Gain
# --------------------------------------------------

import pandas as pd

def calc_dcg(scores, at_k=100):
    position = 1
    accumulator = 0.0
    for score in scores[:at_k]:

        numerator = 2 ** score - 1
        denom = np.log2(position + 1)

        accumulator += numerator / denom
        position += 1

    return accumulator

priority_map = {
    "Unknown": 1,
    "Low": 1,
    "Medium": 2,
    "High": 3,
    "Critical": 4,
}

at_k = 100

tweetId2TRECPriorityCategory_score = {
    k:priority_map[v] for k,v in tweetId2TRECPriorityCategory.items()
}
tweetId2TRECPriorityCategory_scores_sorted = sorted(
    tweetId2TRECPriorityCategory_score.values(),
    reverse=True
)

best_dcg_per_event = {}
for event, rel_tweets in event2tweetIds.items():
    print(event)
    
    tweetId2TRECPriorityCategory_scores_sorted = sorted(
        [tweetId2TRECPriorityCategory_score[x] for x in rel_tweets],
        reverse=True
    )
    ideal_dcg = calc_dcg(tweetId2TRECPriorityCategory_scores_sorted, at_k)
    print("\tBest DCG:", ideal_dcg)
    best_dcg_per_event[event] = ideal_dcg
    
print("Mean:", np.mean(list(best_dcg_per_event.values())))
print()

# Code below calculates the DCG for a system's 
#  ranked priority tweets. We have to do some 
#  sampling here to break ties among tweets with
#  the same priority scores.

# Build a dataframe from the system's provided
#  priority scores, so we can identify what the
#  top-most priorities are and get a count of
#  the number of tweets in each priority bin.
priority_df = pd.DataFrame(
    [(k, priority_map[v]) for k, v in tweetId2RunPriorityCategory.items()],
    columns=["tweet_id", "priority"]
)

# Build metrics for each event
system_dcg_per_event = {}
for event, rel_tweets in event2tweetIds.items():
    print("Event:", event)
    local_priority_df = priority_df[priority_df["tweet_id"].isin(set(rel_tweets))]
    
    unique_scores = local_priority_df["priority"].value_counts()
    
    # Find the top priority scores that would be included
    #  in the necessary at_k values.
    total = 0
    top_keys = []
    candidates = {}
    for top in sorted(unique_scores.index, reverse=True):

        # We store this key, so we can go back and shuffle
        #. tweets with this score.
        top_keys.append(top)
        local_restricted_df = local_priority_df[local_priority_df["priority"] == top]
        candidates[top] = list(local_restricted_df["tweet_id"])

        total += local_restricted_df.shape[0]

        # Once we have enough samples, stop.
        if ( total > at_k ):
            break

    # Now we generate distribution over the DCG for this
    #  system and do this a number of times to remove
    #  dependence on our selection of the top k tweets
    random_dcgs = []
    for i in range(100):

        local_tweet_ids = []
        for top in top_keys:
            this_top_tweets = candidates[top][:]
            np.random.shuffle(this_top_tweets)

            needed = at_k - len(local_tweet_ids)
            local_tweet_ids.extend(this_top_tweets[:needed])

        local_scores = [tweetId2TRECPriorityCategory_score[x] for x in local_tweet_ids]

        random_dcgs.append(calc_dcg(local_scores))

    system_dcg = np.mean(random_dcgs)

    system_ndcg_ = system_dcg / best_dcg_per_event[event]
    print("\tnDCG:", system_ndcg_)
    system_dcg_per_event[event] = system_ndcg_
    
print()
system_ndcg_micro = np.mean(list(system_dcg_per_event.values()))
print("System Event-Micro nDCG:", system_ndcg_micro)

resultsFile.write("--------------------------------------------------"+"\n")
resultsFile.write("EVALUATON: nDCG and Priority"+"\n")
resultsFile.write("Overall performance"+"\n")
resultsFile.write("--------------------------------------------------"+"\n")
resultsFile.write("> nDCG:"+"\t"+str(system_ndcg_micro)+"\n")
resultsFile.write(""+"\n")

2020_01_27_houston_explosion.2020
	Best DCG: 176.99559032459564
2020_02_10_mideast_tornadoes.day1_mississipi.2020
	Best DCG: 268.88459894996123
2020_02_10_mideast_tornadoes.day2_al.2020
	Best DCG: 270.1716952398847
2020_02_10_mideast_tornadoes.day3_md.2019
	Best DCG: 135.38775246204446
2020_05_06_tn_derecho.2020
	Best DCG: 167.06354661312534
brooklynblockparty_shooting.2019
	Best DCG: 179.1756130795261
2016_puttingal_temple
	Best DCG: 314.08006311421406
2017_12_04_thomas_wildfire.2017
	Best DCG: 300.71399384300895
2017_12_07_lilac_wildfire.2017
	Best DCG: 314.08006311421406
2018_07_23_klamathon_wildfire.2018
	Best DCG: 221.46334445469358
2018_08_05_holy_wildfire.2018
	Best DCG: 153.96993418707177
2018_11_07_Woolsey_wildfire.2018
	Best DCG: 175.67469323453255
2018_maryland_flood
	Best DCG: 285.7119531591263
2018_pittsburgh_synagogue_shooting
	Best DCG: 111.85075929877581
2019_03_01_alberta_wildfire.2019.v2
	Best DCG: 62.88708564345522
2019_08_25_hurricane_dorian.2019
	Best DCG: 146.57069611996656
2019_10_10_saddleridge_wildfire.2019
	Best DCG: 173.00802656786584
2019_10_25_kincade_wildfire.2019
	Best DCG: 314.08006311421406
2019_durham_gas_explosion
	Best DCG: 201.07148118577902
2019_saugus_high_school_shooting
	Best DCG: 314.08006311421406
2019_townsville_flood
	Best DCG: 314.08006311421406
2020_easter_tornado_outbreak
	Best DCG: 214.9714167256293
2020_tornado_outbreak_of_april
	Best DCG: 314.08006311421406
2020_tornado_outbreak_of_march
	Best DCG: 267.51977363880474
2020_visakhapatnam_gas_leak
	Best DCG: 314.08006311421406
tornado_outbreak_of_november_30_december_2018
	Best DCG: 314.08006311421406
Mean: 231.7589407554446

Event: 2020_01_27_houston_explosion.2020
	nDCG: 0.2779815008319921
Event: 2020_02_10_mideast_tornadoes.day1_mississipi.2020
	nDCG: 0.4258767313150799
Event: 2020_02_10_mideast_tornadoes.day2_al.2020
	nDCG: 0.4261105392068559
Event: 2020_02_10_mideast_tornadoes.day3_md.2019
	nDCG: 0.3302359837506091
Event: 2020_05_06_tn_derecho.2020
	nDCG: 0.47554704985063406
Event: brooklynblockparty_shooting.2019
	nDCG: 0.19540940967549325
Event: 2016_puttingal_temple
	nDCG: 0.27362712881070844
Event: 2017_12_04_thomas_wildfire.2017
	nDCG: 0.32874461218314044
Event: 2017_12_07_lilac_wildfire.2017
	nDCG: 0.36028923798961116
Event: 2018_07_23_klamathon_wildfire.2018
	nDCG: 0.5364057486064486
Event: 2018_08_05_holy_wildfire.2018
	nDCG: 0.45036941541273096
Event: 2018_11_07_Woolsey_wildfire.2018
	nDCG: 0.36213922950351773
Event: 2018_maryland_flood
	nDCG: 0.3187719181198949
Event: 2018_pittsburgh_synagogue_shooting
	nDCG: 0.7613156312809669
Event: 2019_03_01_alberta_wildfire.2019.v2
	nDCG: 0.3383141212128866
Event: 2019_08_25_hurricane_dorian.2019
	nDCG: 0.40570690309517676
Event: 2019_10_10_saddleridge_wildfire.2019
	nDCG: 0.5033998854930324
Event: 2019_10_25_kincade_wildfire.2019
	nDCG: 0.4850977139045199
Event: 2019_durham_gas_explosion
	nDCG: 0.2609681788775771
Event: 2019_saugus_high_school_shooting
	nDCG: 0.26865018514370226
Event: 2019_townsville_flood
	nDCG: 0.6905431750203319
Event: 2020_easter_tornado_outbreak
	nDCG: 0.4586172057343277
Event: 2020_tornado_outbreak_of_april
	nDCG: 0.4919510826705382
Event: 2020_tornado_outbreak_of_march
	nDCG: 0.20636105143971845
Event: 2020_visakhapatnam_gas_leak
	nDCG: 0.5381760403416509
Event: tornado_outbreak_of_november_30_december_2018
	nDCG: 0.8593485226180794

System Event-Micro nDCG: 0.4242291616188163

1


# --------------------------------------------------
# TREC-IS 2021-A
# Information Type Categorization
# Overall performance
# --------------------------------------------------
# Average performance over information types
# Macro averaged (information types have equal weight)
# Does not average across events (larger events have more impact)
# Positive class is the target class
# Precision, recall and F1 only consider the positive class
# Accuracy is an overall metric
# We report performance for all categories, high importance categories and low importance categories

from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score

avgPrecision = 0.0
avgRecall = 0.0
avgF1 = 0.0
avgAccuracy = 0.0

avgPrecisionHigh = 0.0
avgRecallHigh = 0.0
avgF1High = 0.0
avgAccuracyHigh = 0.0

avgPrecisionLow = 0.0
avgRecallLow = 0.0
avgF1Low = 0.0
avgAccuracyLow = 0.0

for categoryId in informationTypes2Index.keys():
    categoryPrecision = precision_score(category2GroundTruth[categoryId], category2Predicted[categoryId], average='binary')
    categoryRecall = recall_score(category2GroundTruth[categoryId], category2Predicted[categoryId], average='binary')
    categoryF1 = f1_score(category2GroundTruth[categoryId], category2Predicted[categoryId], average='binary')
    categoryAccuracy = accuracy_score(category2GroundTruth[categoryId], category2Predicted[categoryId])
    
    avgPrecision = avgPrecision + categoryPrecision
    avgRecall = avgRecall + categoryRecall
    avgF1 = avgF1 + categoryF1
    avgAccuracy = avgAccuracy + categoryAccuracy
    
    if any(categoryId in s for s in highImportCategories):
        avgPrecisionHigh = avgPrecisionHigh + categoryPrecision
        avgRecallHigh = avgRecallHigh + categoryRecall
        avgF1High = avgF1High + categoryF1
        avgAccuracyHigh = avgAccuracyHigh + categoryAccuracy
    else:
        avgPrecisionLow = avgPrecisionLow + categoryPrecision
        avgRecallLow = avgRecallLow + categoryRecall
        avgF1Low = avgF1Low + categoryF1
        avgAccuracyLow = avgAccuracyLow + categoryAccuracy

numInformationTypes = len(informationTypes2Index)
numHighInformationTypes = len(highImportCategories)
numLowInformationTypes = numInformationTypes - numHighInformationTypes
        
print("Information Type Precision (positive class, multi-type, macro): "+str(avgPrecision/numInformationTypes))
print("Information Type Recall (positive class, multi-type, macro): "+str(avgRecall/numInformationTypes))
print("Information Type F1 (positive class, multi-type, macro): "+str(avgF1/numInformationTypes))
print("Information Type Accuracy (overall, multi-type, macro): "+str(avgAccuracy/numInformationTypes))

print("High Importance Information Type Precision (positive class, multi-type, macro): "+str(avgPrecisionHigh/numHighInformationTypes))
print("High Importance Information Type Recall (positive class, multi-type, macro): "+str(avgRecallHigh/numHighInformationTypes))
print("High Importance Information Type F1 (positive class, multi-type, macro): "+str(avgF1High/numHighInformationTypes))
print("High Importance Information Type Accuracy (overall, multi-type, macro): "+str(avgAccuracyHigh/numHighInformationTypes))

print("Low Importance Information Type Precision (positive class, multi-type, macro): "+str(avgPrecisionLow/numLowInformationTypes))
print("Low Importance Information Type Recall (positive class, multi-type, macro): "+str(avgRecallLow/numLowInformationTypes))
print("Low Importance Information Type F1 (positive class, multi-type, macro): "+str(avgF1Low/numLowInformationTypes))
print("Low Importance Information Type Accuracy (overall, multi-type, macro): "+str(avgAccuracyLow/numLowInformationTypes))

resultsFile.write("--------------------------------------------------"+"\n")
resultsFile.write("EVALUATON: Information Type Categorization"+"\n")
resultsFile.write("Overall performance"+"\n")
resultsFile.write("--------------------------------------------------"+"\n")
resultsFile.write("> Information Type Precision (positive class, multi-type, macro):"+"\t"+str(avgPrecision/len(informationTypes2Index))+"\n")
resultsFile.write("> Information Type Recall (positive class, multi-type, macro):"+"\t"+str(avgRecall/len(informationTypes2Index))+"\n")
resultsFile.write("> Information Type F1 (positive class, multi-type, macro):"+"\t"+str(avgF1/len(informationTypes2Index))+"\n")
resultsFile.write("> Information Type Accuracy (overall, multi-type, macro):"+"\t"+str(avgAccuracy/len(informationTypes2Index))+"\n")
resultsFile.write("> High Importance Information Type Precision (positive class, multi-type, macro):"+"\t"+str(avgPrecisionHigh/numHighInformationTypes)+"\n")
resultsFile.write("> High Importance Information Type Recall (positive class, multi-type, macro):"+"\t"+str(avgRecallHigh/numHighInformationTypes)+"\n")
resultsFile.write("> High Importance Information Type F1 (positive class, multi-type, macro):"+"\t"+str(avgF1High/numHighInformationTypes)+"\n")
resultsFile.write("> High Importance Information Type Accuracy (overall, multi-type, macro):"+"\t"+str(avgAccuracyHigh/numHighInformationTypes)+"\n")
resultsFile.write("> Low Importance Information Type Precision (positive class, multi-type, macro):"+"\t"+str(avgPrecisionLow/numLowInformationTypes)+"\n")
resultsFile.write("> Low Importance Information Type Recall (positive class, multi-type, macro):"+"\t"+str(avgRecallLow/numLowInformationTypes)+"\n")
resultsFile.write("> Low Importance Information Type F1 (positive class, multi-type, macro):"+"\t"+str(avgF1Low/numLowInformationTypes)+"\n")
resultsFile.write("> Low Importance Information Type Accuracy (overall, multi-type, macro):"+"\t"+str(avgAccuracyLow/numLowInformationTypes)+"\n")
resultsFile.write(""+"\n")

Information Type Precision (positive class, multi-type, macro): 0.23901433328840274
Information Type Recall (positive class, multi-type, macro): 0.3809030540831667
Information Type F1 (positive class, multi-type, macro): 0.27289932741237943
Information Type Accuracy (overall, multi-type, macro): 0.8810500868506348
High Importance Information Type Precision (positive class, multi-type, macro): 0.20228303843690656
High Importance Information Type Recall (positive class, multi-type, macro): 0.3634286123191017
High Importance Information Type F1 (positive class, multi-type, macro): 0.244061750587089
High Importance Information Type Accuracy (overall, multi-type, macro): 0.9516621002930835
Low Importance Information Type Precision (positive class, multi-type, macro): 0.2506136895572963
Low Importance Information Type Recall (positive class, multi-type, macro): 0.3864212988507662
Low Importance Information Type F1 (positive class, multi-type, macro): 0.28200593062036583
Low Importance Information Type Accuracy (overall, multi-type, macro): 0.8587515562898613

1


# --------------------------------------------------
# TREC-IS 2021-A
# Information Type Categorization
# Per Information Type Performance
# --------------------------------------------------
# Per Category Classification Performance with confusion matrices
# Performance on the target class is what we care about here, 
# primaraly with respect to recall, as we want the user to 
# see all of the information for a given category. A small
# amount of noise being added to the feed is an acceptable
# cost for good recall.
#
# Does not average across events (larger events have more impact)

from sklearn.metrics import classification_report

perTopicFile.write("--------------------------------------------------"+"\n")
perTopicFile.write("EVALUATON: Information Type Categorization (Multi-type)"+"\n")
perTopicFile.write("Per Information Type Performance"+"\n")
perTopicFile.write("--------------------------------------------------"+"\n")

for categoryId in informationTypes2Index.keys():
    target_names = ['Other Classes', categoryId]
    try:
        print(categoryId)
        print(classification_report(category2GroundTruth[categoryId], category2Predicted[categoryId], target_names=target_names))


        perTopicFile.write(categoryId+"\n")
        perTopicFile.write(classification_report(category2GroundTruth[categoryId], category2Predicted[categoryId], target_names=target_names)+"\n")
        perTopicFile.write(""+"\n")
      
    except ValueError:
        print("Category "+categoryId+" score calculation failed, likely due the category not being used by the run")
perTopicFile.write(""+"\n")

CallToAction-Donations
                        precision    recall  f1-score   support

         Other Classes       1.00      0.99      0.99     55275
CallToAction-Donations       0.29      0.58      0.38       568

              accuracy                           0.98     55843
             macro avg       0.64      0.78      0.69     55843
          weighted avg       0.99      0.98      0.98     55843

CallToAction-MovePeople
                         precision    recall  f1-score   support

          Other Classes       0.99      0.98      0.98     54646
CallToAction-MovePeople       0.32      0.46      0.38      1197

               accuracy                           0.97     55843
              macro avg       0.66      0.72      0.68     55843
           weighted avg       0.97      0.97      0.97     55843

CallToAction-Volunteer
                        precision    recall  f1-score   support

         Other Classes       1.00      0.99      0.99     55543
CallToAction-Volunteer       0.16      0.37      0.23       300

              accuracy                           0.99     55843
             macro avg       0.58      0.68      0.61     55843
          weighted avg       0.99      0.99      0.99     55843

Other-Advice
               precision    recall  f1-score   support

Other Classes       0.96      0.96      0.96     52602
 Other-Advice       0.37      0.37      0.37      3241

     accuracy                           0.93     55843
    macro avg       0.66      0.67      0.66     55843
 weighted avg       0.93      0.93      0.93     55843

Other-ContextualInformation
                             precision    recall  f1-score   support

              Other Classes       0.97      0.92      0.94     54346
Other-ContextualInformation       0.04      0.12      0.06      1497

                   accuracy                           0.90     55843
                  macro avg       0.51      0.52      0.50     55843
               weighted avg       0.95      0.90      0.92     55843

Other-Discussion
                  precision    recall  f1-score   support

   Other Classes       0.99      0.93      0.96     55263
Other-Discussion       0.03      0.19      0.05       580

        accuracy                           0.92     55843
       macro avg       0.51      0.56      0.50     55843
    weighted avg       0.98      0.92      0.95     55843

Other-Irrelevant
                  precision    recall  f1-score   support

   Other Classes       0.52      0.85      0.65     23267
Other-Irrelevant       0.81      0.44      0.57     32576

        accuracy                           0.61     55843
       macro avg       0.67      0.65      0.61     55843
    weighted avg       0.69      0.61      0.60     55843

Other-Sentiment
                 precision    recall  f1-score   support

  Other Classes       0.94      0.94      0.94     51270
Other-Sentiment       0.29      0.29      0.29      4573

       accuracy                           0.88     55843
      macro avg       0.61      0.61      0.61     55843
   weighted avg       0.88      0.88      0.88     55843

Report-CleanUp
                precision    recall  f1-score   support

 Other Classes       1.00      0.97      0.98     55581
Report-CleanUp       0.06      0.42      0.10       262

      accuracy                           0.97     55843
     macro avg       0.53      0.70      0.54     55843
  weighted avg       0.99      0.97      0.98     55843

Report-EmergingThreats
                        precision    recall  f1-score   support

         Other Classes       0.96      0.89      0.93     52454
Report-EmergingThreats       0.21      0.44      0.29      3389

              accuracy                           0.87     55843
             macro avg       0.59      0.67      0.61     55843
          weighted avg       0.92      0.87      0.89     55843

Report-Factoid
                precision    recall  f1-score   support

 Other Classes       0.94      0.92      0.93     49844
Report-Factoid       0.44      0.50      0.47      5999

      accuracy                           0.88     55843
     macro avg       0.69      0.71      0.70     55843
  weighted avg       0.89      0.88      0.88     55843

Report-FirstPartyObservation
                              precision    recall  f1-score   support

               Other Classes       0.97      0.92      0.94     54135
Report-FirstPartyObservation       0.08      0.23      0.12      1708

                    accuracy                           0.90     55843
                   macro avg       0.53      0.57      0.53     55843
                weighted avg       0.95      0.90      0.92     55843

Report-Hashtags
                 precision    recall  f1-score   support

  Other Classes       0.89      0.88      0.88     48407
Report-Hashtags       0.25      0.26      0.26      7436

       accuracy                           0.80     55843
      macro avg       0.57      0.57      0.57     55843
   weighted avg       0.80      0.80      0.80     55843

Report-Location
                 precision    recall  f1-score   support

  Other Classes       0.84      0.69      0.76     41325
Report-Location       0.42      0.64      0.50     14518

       accuracy                           0.67     55843
      macro avg       0.63      0.66      0.63     55843
   weighted avg       0.73      0.67      0.69     55843

Report-MultimediaShare
                        precision    recall  f1-score   support

         Other Classes       0.92      0.71      0.80     48784
Report-MultimediaShare       0.22      0.58      0.32      7059

              accuracy                           0.69     55843
             macro avg       0.57      0.65      0.56     55843
          weighted avg       0.83      0.69      0.74     55843

Report-News
               precision    recall  f1-score   support

Other Classes       0.94      0.78      0.85     50324
  Report-News       0.22      0.56      0.31      5519

     accuracy                           0.76     55843
    macro avg       0.58      0.67      0.58     55843
 weighted avg       0.87      0.76      0.80     55843

Report-NewSubEvent
                    precision    recall  f1-score   support

     Other Classes       0.98      0.97      0.97     54728
Report-NewSubEvent       0.05      0.09      0.07      1115

          accuracy                           0.95     55843
         macro avg       0.52      0.53      0.52     55843
      weighted avg       0.96      0.95      0.96     55843

Report-Official
                 precision    recall  f1-score   support

  Other Classes       0.96      0.96      0.96     53203
Report-Official       0.19      0.21      0.20      2640

       accuracy                           0.92     55843
      macro avg       0.58      0.58      0.58     55843
   weighted avg       0.92      0.92      0.92     55843

Report-OriginalEvent
                      precision    recall  f1-score   support

       Other Classes       0.95      0.97      0.96     52838
Report-OriginalEvent       0.10      0.06      0.08      3005

            accuracy                           0.92     55843
           macro avg       0.52      0.51      0.52     55843
        weighted avg       0.90      0.92      0.91     55843

Report-ServiceAvailable
                         precision    recall  f1-score   support

          Other Classes       0.98      0.98      0.98     53834
Report-ServiceAvailable       0.40      0.37      0.39      2009

               accuracy                           0.96     55843
              macro avg       0.69      0.68      0.68     55843
           weighted avg       0.96      0.96      0.96     55843

Report-ThirdPartyObservation
                              precision    recall  f1-score   support

               Other Classes       0.92      0.78      0.85     50379
Report-ThirdPartyObservation       0.17      0.40      0.24      5464

                    accuracy                           0.75     55843
                   macro avg       0.55      0.59      0.54     55843
                weighted avg       0.85      0.75      0.79     55843

Report-Weather
                precision    recall  f1-score   support

 Other Classes       0.97      0.90      0.93     50824
Report-Weather       0.41      0.69      0.51      5019

      accuracy                           0.88     55843
     macro avg       0.69      0.80      0.72     55843
  weighted avg       0.92      0.88      0.89     55843

Request-GoodsServices
                       precision    recall  f1-score   support

        Other Classes       1.00      0.98      0.99     55452
Request-GoodsServices       0.17      0.49      0.25       391

             accuracy                           0.98     55843
            macro avg       0.58      0.73      0.62     55843
         weighted avg       0.99      0.98      0.98     55843

Request-InformationWanted
                           precision    recall  f1-score   support

            Other Classes       0.99      0.98      0.99     55241
Request-InformationWanted       0.23      0.41      0.29       602

                 accuracy                           0.98     55843
                macro avg       0.61      0.70      0.64     55843
             weighted avg       0.99      0.98      0.98     55843

Request-SearchAndRescue
                         precision    recall  f1-score   support

          Other Classes       1.00      0.99      0.99     55737
Request-SearchAndRescue       0.06      0.33      0.10       106

               accuracy                           0.99     55843
              macro avg       0.53      0.66      0.54     55843
           weighted avg       1.00      0.99      0.99     55843

1


# --------------------------------------------------
# TREC-IS 2021-A
# Information Type Categorization
# Per Information Type F1 Graph
# --------------------------------------------------
# Per Category Classification Performance
# F1 scores for each information type, graphed
# Does not average across events (larger events have more impact)



N = len(informationTypes2Index)
ind = np.arange(N)

scoresPerCategoryF1 = []
categoryLabels = []
for categoryId in informationTypes2Index.keys():
    localF1Score = f1_score(category2GroundTruth[categoryId], category2Predicted[categoryId], average='binary')
    print(categoryId, localF1Score)
    scoresPerCategoryF1.append(localF1Score)
    categoryLabels.append(categoryId)
    
width = 0.90       # the width of the bars: can also be len(x) sequence

p1 = plt.bar(ind, scoresPerCategoryF1, width)

plt.ylabel('F1 Scores')
plt.title('F1 Scores by Information Type')
plt.xticks(ind, categoryLabels, rotation='vertical')
plt.yticks(np.arange(0, 1, 0.1))

plt.show()

CallToAction-Donations 0.3818393480791618
CallToAction-MovePeople 0.38033012379642367
CallToAction-Volunteer 0.22626262626262622
Other-Advice 0.36885749385749395
Other-ContextualInformation 0.05773597145637365
Other-Discussion 0.047933157431838166
Other-Irrelevant 0.5729736154028999
Other-Sentiment 0.2922184524460983
Report-CleanUp 0.10335195530726257
Report-EmergingThreats 0.2861804222648753
Report-Factoid 0.4684362107885109
Report-FirstPartyObservation 0.11959829580036521
Report-Hashtags 0.2591567852437418
Report-Location 0.5047678556874677
Report-MultimediaShare 0.323185562965869
Report-News 0.3144718094850397
Report-NewSubEvent 0.06740815638692281
Report-Official 0.19974485146710405
Report-OriginalEvent 0.07652451175767239
Report-ServiceAvailable 0.38688016528925623
Report-ThirdPartyObservation 0.23619334696953387
Report-Weather 0.5130204501986171
Request-GoodsServices 0.24820378837361198
Request-InformationWanted 0.29184038117927336
Request-SearchAndRescue 0.09536784741144415


# --------------------------------------------------
# TREC-IS 2021-A
# Information Type Categorization
# Per Event Performance
# --------------------------------------------------
# Categorization performance for each event
# Precision, recall and F1 only consider the positive class
# Accuracy is an overall metric
# We report performance for all categories, high importance categories and low importance categories
# Macro average (categories have equal weight)

perEventFile.write("--------------------------------------------------"+"\n")
perEventFile.write("EVALUATON: Information Type Categorization (Multi-type)"+"\n")
perEventFile.write("Per Event Performance"+"\n")
perEventFile.write("--------------------------------------------------"+"\n")

for eventId in eventIdentifiers:
    tavgPrecision = 0.0
    tavgRecall = 0.0
    tavgF1 = 0.0
    tavgAccuracy = 0.0

    categoryCount = 0
    
    for categoryId in informationTypes2Index.keys():
        if sum(event2groundtruth[eventId].get(categoryId)) == 0:
            continue
        
        categoryPrecision = precision_score(event2groundtruth[eventId].get(categoryId), event2prediction[eventId].get(categoryId), average='binary')
        categoryRecall = recall_score(event2groundtruth[eventId].get(categoryId), event2prediction[eventId].get(categoryId), average='binary')
        categoryF1 = f1_score(event2groundtruth[eventId].get(categoryId), event2prediction[eventId].get(categoryId), average='binary')
        categoryAccuracy = accuracy_score(event2groundtruth[eventId].get(categoryId), event2prediction[eventId].get(categoryId))
        
        tavgPrecision = tavgPrecision + categoryPrecision
        tavgRecall = tavgRecall + categoryRecall
        tavgF1 = tavgF1 + categoryF1
        tavgAccuracy = tavgAccuracy + categoryAccuracy
        
        categoryCount += 1
    
    if categoryCount == 0:
        print("No categories for event:", eventId)
        continue
    
    print(eventId)
    print("  Information Type Precision (positive class, multi-type, macro): "+str(tavgPrecision/categoryCount))
    print("  Information Type Recall (positive class, multi-type, macro): "+str(tavgRecall/categoryCount))
    print("  Information Type F1 (positive class, multi-type, macro): "+str(tavgF1/categoryCount))
    print("  Information Type Accuracy (overall, multi-type, macro): "+str(tavgAccuracy/categoryCount))
    print("")
    
    perEventFile.write(eventId+"\n")
    perEventFile.write("  Information Type Precision (positive class, multi-type, macro): "+str(tavgPrecision/len(informationTypes2Index))+"\n")
    perEventFile.write("  Information Type Recall (positive class, multi-type, macro): "+str(tavgRecall/len(informationTypes2Index))+"\n")
    perEventFile.write("  Information Type F1 (positive class, multi-type, macro): "+str(tavgF1/len(informationTypes2Index))+"\n")
    perEventFile.write("  Information Type Accuracy (overall, multi-type, macro): "+str(tavgAccuracy/len(informationTypes2Index))+"\n")
    perEventFile.write("\n")
    
perEventFile.write("\n")

2020_01_27_houston_explosion.2020
  Information Type Precision (positive class, multi-type, macro): 0.17816485780504956
  Information Type Recall (positive class, multi-type, macro): 0.33858387240769605
  Information Type F1 (positive class, multi-type, macro): 0.19140486181620728
  Information Type Accuracy (overall, multi-type, macro): 0.8674029711441239

2020_02_10_mideast_tornadoes.day1_mississipi.2020
  Information Type Precision (positive class, multi-type, macro): 0.4926425847628179
  Information Type Recall (positive class, multi-type, macro): 0.6038602654289633
  Information Type F1 (positive class, multi-type, macro): 0.4948457532729848
  Information Type Accuracy (overall, multi-type, macro): 0.8519668737060042

2020_02_10_mideast_tornadoes.day2_al.2020
  Information Type Precision (positive class, multi-type, macro): 0.22256742670909588
  Information Type Recall (positive class, multi-type, macro): 0.4570177591182607
  Information Type F1 (positive class, multi-type, macro): 0.26066210844276594
  Information Type Accuracy (overall, multi-type, macro): 0.8890412254160364

2020_02_10_mideast_tornadoes.day3_md.2019
  Information Type Precision (positive class, multi-type, macro): 0.13198391572691232
  Information Type Recall (positive class, multi-type, macro): 0.4264324946476504
  Information Type F1 (positive class, multi-type, macro): 0.152819265294107
  Information Type Accuracy (overall, multi-type, macro): 0.8313636363636364

2020_05_06_tn_derecho.2020
  Information Type Precision (positive class, multi-type, macro): 0.27161467496770997
  Information Type Recall (positive class, multi-type, macro): 0.44247597985227827
  Information Type F1 (positive class, multi-type, macro): 0.2741454453375455
  Information Type Accuracy (overall, multi-type, macro): 0.8700656455142233

brooklynblockparty_shooting.2019
  Information Type Precision (positive class, multi-type, macro): 0.16993944905537436
  Information Type Recall (positive class, multi-type, macro): 0.5444384257464007
  Information Type F1 (positive class, multi-type, macro): 0.181832651033989
  Information Type Accuracy (overall, multi-type, macro): 0.882476932866688

2016_puttingal_temple
  Information Type Precision (positive class, multi-type, macro): 0.16864927727531404
  Information Type Recall (positive class, multi-type, macro): 0.2918939467858047
  Information Type F1 (positive class, multi-type, macro): 0.17713979964640741
  Information Type Accuracy (overall, multi-type, macro): 0.8773302312054675

2017_12_04_thomas_wildfire.2017
  Information Type Precision (positive class, multi-type, macro): 0.2823265994519419
  Information Type Recall (positive class, multi-type, macro): 0.35656960404859966
  Information Type F1 (positive class, multi-type, macro): 0.2985493416396039
  Information Type Accuracy (overall, multi-type, macro): 0.8578469691950977

2017_12_07_lilac_wildfire.2017
  Information Type Precision (positive class, multi-type, macro): 0.28394336166332534
  Information Type Recall (positive class, multi-type, macro): 0.33067185833988355
  Information Type F1 (positive class, multi-type, macro): 0.2795401442758642
  Information Type Accuracy (overall, multi-type, macro): 0.8687661777394308

2018_07_23_klamathon_wildfire.2018
  Information Type Precision (positive class, multi-type, macro): 0.365639845919576
  Information Type Recall (positive class, multi-type, macro): 0.33440982688342863
  Information Type F1 (positive class, multi-type, macro): 0.3084209726926144
  Information Type Accuracy (overall, multi-type, macro): 0.8719822022027522

2018_08_05_holy_wildfire.2018
  Information Type Precision (positive class, multi-type, macro): 0.15287032740742684
  Information Type Recall (positive class, multi-type, macro): 0.40283631287969257
  Information Type F1 (positive class, multi-type, macro): 0.1764972655733302
  Information Type Accuracy (overall, multi-type, macro): 0.9126859600185961

2018_11_07_Woolsey_wildfire.2018
  Information Type Precision (positive class, multi-type, macro): 0.16225640154271617
  Information Type Recall (positive class, multi-type, macro): 0.25965437516549894
  Information Type F1 (positive class, multi-type, macro): 0.1703535677281617
  Information Type Accuracy (overall, multi-type, macro): 0.8716595979448986

2018_maryland_flood
  Information Type Precision (positive class, multi-type, macro): 0.2773842041635938
  Information Type Recall (positive class, multi-type, macro): 0.4306294060329144
  Information Type F1 (positive class, multi-type, macro): 0.29329628412680736
  Information Type Accuracy (overall, multi-type, macro): 0.8513078918117821

2018_pittsburgh_synagogue_shooting
  Information Type Precision (positive class, multi-type, macro): 0.3544372459835136
  Information Type Recall (positive class, multi-type, macro): 0.4195301339229375
  Information Type F1 (positive class, multi-type, macro): 0.3721807525478271
  Information Type Accuracy (overall, multi-type, macro): 0.7457264957264957

2019_03_01_alberta_wildfire.2019.v2
  Information Type Precision (positive class, multi-type, macro): 0.09741906159345742
  Information Type Recall (positive class, multi-type, macro): 0.2532989010517432
  Information Type F1 (positive class, multi-type, macro): 0.06653358701260942
  Information Type Accuracy (overall, multi-type, macro): 0.8472711612246496

/Users/cbuntain/Development/anaconda3/lib/python3.8/site-packages/sklearn/metrics/_classification.py:1245: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 due to no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))

2019_08_25_hurricane_dorian.2019
  Information Type Precision (positive class, multi-type, macro): 0.24157082731800933
  Information Type Recall (positive class, multi-type, macro): 0.27353710899680916
  Information Type F1 (positive class, multi-type, macro): 0.19726826285643495
  Information Type Accuracy (overall, multi-type, macro): 0.860149130074565

2019_10_10_saddleridge_wildfire.2019
  Information Type Precision (positive class, multi-type, macro): 0.26265262642834897
  Information Type Recall (positive class, multi-type, macro): 0.32762409669708903
  Information Type F1 (positive class, multi-type, macro): 0.2632918546851061
  Information Type Accuracy (overall, multi-type, macro): 0.9044416640600561

2019_10_25_kincade_wildfire.2019
  Information Type Precision (positive class, multi-type, macro): 0.2589920485692613
  Information Type Recall (positive class, multi-type, macro): 0.3989241547025225
  Information Type F1 (positive class, multi-type, macro): 0.2815084118601753
  Information Type Accuracy (overall, multi-type, macro): 0.8867856162503432

2019_durham_gas_explosion
  Information Type Precision (positive class, multi-type, macro): 0.2579570079406902
  Information Type Recall (positive class, multi-type, macro): 0.40794465982146855
  Information Type F1 (positive class, multi-type, macro): 0.277855718786277
  Information Type Accuracy (overall, multi-type, macro): 0.8694110725786476

2019_saugus_high_school_shooting
  Information Type Precision (positive class, multi-type, macro): 0.22815138403975047
  Information Type Recall (positive class, multi-type, macro): 0.3256892664372737
  Information Type F1 (positive class, multi-type, macro): 0.2257526105477705
  Information Type Accuracy (overall, multi-type, macro): 0.8820565359203815

2019_townsville_flood
  Information Type Precision (positive class, multi-type, macro): 0.28454491050485103
  Information Type Recall (positive class, multi-type, macro): 0.3060754290533274
  Information Type F1 (positive class, multi-type, macro): 0.2681208896738671
  Information Type Accuracy (overall, multi-type, macro): 0.8728018757327082

2020_easter_tornado_outbreak
  Information Type Precision (positive class, multi-type, macro): 0.15236351277558882
  Information Type Recall (positive class, multi-type, macro): 0.3826981363939142
  Information Type F1 (positive class, multi-type, macro): 0.18378258542559908
  Information Type Accuracy (overall, multi-type, macro): 0.861257847190323

2020_tornado_outbreak_of_april
  Information Type Precision (positive class, multi-type, macro): 0.26080618214149
  Information Type Recall (positive class, multi-type, macro): 0.3973646962787608
  Information Type F1 (positive class, multi-type, macro): 0.27311886757216813
  Information Type Accuracy (overall, multi-type, macro): 0.8705057000876935

2020_tornado_outbreak_of_march
  Information Type Precision (positive class, multi-type, macro): 0.20126037432890512
  Information Type Recall (positive class, multi-type, macro): 0.516913736048257
  Information Type F1 (positive class, multi-type, macro): 0.21260231825179157
  Information Type Accuracy (overall, multi-type, macro): 0.8202202573198498

2020_visakhapatnam_gas_leak
  Information Type Precision (positive class, multi-type, macro): 0.27972408890516864
  Information Type Recall (positive class, multi-type, macro): 0.18740094007495997
  Information Type F1 (positive class, multi-type, macro): 0.17608802069271043
  Information Type Accuracy (overall, multi-type, macro): 0.839653261557948

tornado_outbreak_of_november_30_december_2018
  Information Type Precision (positive class, multi-type, macro): 0.2250922749279336
  Information Type Recall (positive class, multi-type, macro): 0.5012512296545958
  Information Type F1 (positive class, multi-type, macro): 0.2425174424083964
  Information Type Accuracy (overall, multi-type, macro): 0.8783892051444235

1


# --------------------------------------------------
# TREC-IS 2021-A
# Information Type Categorization
# Per Event F1 Graph
# --------------------------------------------------
# Multi-type (1 vs All): Tweets have multiple information types, aim: predict all of them
# Macro average (categories have equal weight)

N = len(eventIdentifiers)
ind = np.arange(N)

scoresPerEventF1 = []
for eventId in eventIdentifiers:
    avgF1_ = 0.0
    
    for categoryId in informationTypes2Index.keys():
        avgF1_ = avgF1_ + f1_score(event2groundtruth[eventId].get(categoryId), event2prediction[eventId].get(categoryId), average='binary')
        
    scoresPerEventF1.append(avgF1_/len(informationTypes2Index))
    
width = 0.90       # the width of the bars: can also be len(x) sequence

p1 = plt.bar(ind, scoresPerEventF1, width)

plt.ylabel('F1 Scores')
plt.title('F1 Category Scores by Event')
plt.xticks(ind, eventIdentifiers, rotation='vertical')
plt.yticks(np.arange(0, 1, 0.1))

plt.show()

/Users/cbuntain/Development/anaconda3/lib/python3.8/site-packages/sklearn/metrics/_classification.py:1492: UndefinedMetricWarning: F-score is ill-defined and being set to 0.0 due to no true nor predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(
/Users/cbuntain/Development/anaconda3/lib/python3.8/site-packages/sklearn/metrics/_classification.py:1492: UndefinedMetricWarning: F-score is ill-defined and being set to 0.0 due to no true nor predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(
/Users/cbuntain/Development/anaconda3/lib/python3.8/site-packages/sklearn/metrics/_classification.py:1492: UndefinedMetricWarning: F-score is ill-defined and being set to 0.0 due to no true nor predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(
/Users/cbuntain/Development/anaconda3/lib/python3.8/site-packages/sklearn/metrics/_classification.py:1492: UndefinedMetricWarning: F-score is ill-defined and being set to 0.0 due to no true nor predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(
/Users/cbuntain/Development/anaconda3/lib/python3.8/site-packages/sklearn/metrics/_classification.py:1492: UndefinedMetricWarning: F-score is ill-defined and being set to 0.0 due to no true nor predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(
/Users/cbuntain/Development/anaconda3/lib/python3.8/site-packages/sklearn/metrics/_classification.py:1492: UndefinedMetricWarning: F-score is ill-defined and being set to 0.0 due to no true nor predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(
/Users/cbuntain/Development/anaconda3/lib/python3.8/site-packages/sklearn/metrics/_classification.py:1492: UndefinedMetricWarning: F-score is ill-defined and being set to 0.0 due to no true nor predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(
/Users/cbuntain/Development/anaconda3/lib/python3.8/site-packages/sklearn/metrics/_classification.py:1492: UndefinedMetricWarning: F-score is ill-defined and being set to 0.0 due to no true nor predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(
/Users/cbuntain/Development/anaconda3/lib/python3.8/site-packages/sklearn/metrics/_classification.py:1492: UndefinedMetricWarning: F-score is ill-defined and being set to 0.0 due to no true nor predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(
/Users/cbuntain/Development/anaconda3/lib/python3.8/site-packages/sklearn/metrics/_classification.py:1492: UndefinedMetricWarning: F-score is ill-defined and being set to 0.0 due to no true nor predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(
/Users/cbuntain/Development/anaconda3/lib/python3.8/site-packages/sklearn/metrics/_classification.py:1492: UndefinedMetricWarning: F-score is ill-defined and being set to 0.0 due to no true nor predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(
/Users/cbuntain/Development/anaconda3/lib/python3.8/site-packages/sklearn/metrics/_classification.py:1492: UndefinedMetricWarning: F-score is ill-defined and being set to 0.0 due to no true nor predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(
/Users/cbuntain/Development/anaconda3/lib/python3.8/site-packages/sklearn/metrics/_classification.py:1492: UndefinedMetricWarning: F-score is ill-defined and being set to 0.0 due to no true nor predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(
/Users/cbuntain/Development/anaconda3/lib/python3.8/site-packages/sklearn/metrics/_classification.py:1492: UndefinedMetricWarning: F-score is ill-defined and being set to 0.0 due to no true nor predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(


# --------------------------------------------------
# TREC-IS 2021-A
# Information Priority Level
# Overall Performance
# --------------------------------------------------
# How divergent is the system from the human priority labels?
# F1 performance over information types, higher is better
# Macro average (categories have equal weight)

from sklearn.metrics import mean_squared_error

priorityAvgf1 = 0.0;
priorityAvgf1High = 0.0;
priorityAvgf1Low = 0.0;
for categoryId in informationTypes2Index.keys():
    groundTruthPriorities = category2GroundTruthPriority[categoryId]
    predictedPriorities = category2PredictedPriority[categoryId]

    f1 = f1_score(groundTruthPriorities, predictedPriorities, average='macro')
    priorityAvgf1 = priorityAvgf1 + f1;
    
    if any(categoryId in s for s in highImportCategories):
        priorityAvgf1High = priorityAvgf1High + f1
    else:
        priorityAvgf1Low = priorityAvgf1Low + f1
    
    
    
print("Priority Label Prediction (F1, macro): "+str(priorityAvgf1/len(informationTypes2Index)))
    
resultsFile.write("--------------------------------------------------"+"\n")
resultsFile.write("EVALUATON: Information Priority Level"+"\n")
resultsFile.write("Overall Performance"+"\n")
resultsFile.write("--------------------------------------------------"+"\n")
resultsFile.write("> Priority Label Prediction (F1, macro): "+str(priorityAvgf1/len(informationTypes2Index))+"\n")
resultsFile.write("\n")

Priority Label Prediction (F1, macro): 0.15649107505043827

1


# --------------------------------------------------
# TREC-IS 2021-A
# Information Priority Level
# Overall Performance
# --------------------------------------------------
# How divergent is the system from the human priority labels?
# Use Pearson correlation here to capture parallel increases

priorityAvgCorr = 0.0
priorityAvgCorrHigh = 0.0
priorityAvgCorrLow = 0.0
for categoryId in informationTypes2Index.keys():
    if categoryId == "Other-Irrelevant":
        continue
        
    groundTruthPriorities = [priorityScoreMap[x] for x in category2GroundTruthPriority[categoryId]]
    predictedPriorities = category2PredictedPriorityScore[categoryId]

    # Pathological case when no variation exists in the predictions needs to be handled
    this_corr = 0.0
    if np.mean(np.array(predictedPriorities) - np.mean(predictedPriorities)) != 0.0:
        this_corr = np.corrcoef(groundTruthPriorities, predictedPriorities)[0,1]
    priorityAvgCorr = priorityAvgCorr + this_corr
    
    if any(categoryId in s for s in highImportCategories):
        priorityAvgCorrHigh = priorityAvgCorrHigh + this_corr
    else:
        priorityAvgCorrLow = priorityAvgCorrLow + this_corr
    
print("Priority Score Prediction (Pearson): "+str(priorityAvgCorr/(len(informationTypes2Index)-1)))
print("Priority Score Prediction, High (Pearson): "+str(priorityAvgCorrHigh/numHighInformationTypes))
print("Priority Score Prediction, Low (Pearson): "+str(priorityAvgCorrLow/(numLowInformationTypes-1)))


resultsFile.write("--------------------------------------------------"+"\n")
resultsFile.write("EVALUATON: Information Priority Score"+"\n")
resultsFile.write("Correlational Performance"+"\n")
resultsFile.write("--------------------------------------------------"+"\n")
resultsFile.write("> Priority Correlation (Pearson): "+str(priorityAvgCorr/(len(informationTypes2Index)-1))+"\n")
resultsFile.write("> Priority Correlation, High (Pearson): "+str(priorityAvgCorrHigh/numHighInformationTypes)+"\n")
resultsFile.write("> Priority Correlation, Low (Pearson): "+str(priorityAvgCorrLow/(numLowInformationTypes-1))+"\n")
resultsFile.write("\n")

Priority Score Prediction (Pearson): 0.22628964537181098
Priority Score Prediction, High (Pearson): 0.17003912951300437
Priority Score Prediction, Low (Pearson): 0.24503981732474647

1


# --------------------------------------------------
# TREC-IS 2021-A
# Information Priority Level
# Per Information Type Performance
# --------------------------------------------------
# F1 per information type (macro averaged), higher is better
# Macro average (categories have equal weight)

N = len(informationTypes2Index)
ind = np.arange(N)

priorityCatF1Values = []
categoryLabels = []
for categoryId in informationTypes2Index.keys():
    groundTruthPriorities = category2GroundTruthPriority[categoryId]
    predictedPriorities = category2PredictedPriority[categoryId]
    priorityCatF1 = f1_score(groundTruthPriorities, predictedPriorities, average='macro')
    if (math.isnan(priorityCatF1)):
        priorityCatF1 = 0.0
    categoryLabels.append(categoryId)
    priorityCatF1Values.append(priorityCatF1);
    
width = 0.90       # the width of the bars: can also be len(x) sequence

p1 = plt.bar(ind, priorityCatF1Values, width)

plt.ylabel('Priorty Label Prediction F1 (higher is better)')
plt.title('Priorty Label Prediction F1 Per Information Type')
plt.xticks(ind, categoryLabels, rotation='vertical')
plt.yticks(np.arange(0, 1, 0.1))

plt.show()


resultLine = None

# Print the evaluation table row in latex
print("Run & NDCG & CF1-H & CF1-A & CAcc & PErr-H & PErr-A & PCorr-H & PCorr-A \\\\")

resultLine = (str.format('{0:.4f}', system_ndcg_micro)+
     " & "+
     str.format('{0:.4f}',avgF1High/numHighInformationTypes)+
     " & "+
     str.format('{0:.4f}',avgF1/numInformationTypes)+
     " & "+
     str.format('{0:.4f}',avgAccuracy/numInformationTypes)+
     " & "+
     str.format('{0:.4f}',priorityAvgf1High/numHighInformationTypes)+
     " & "+
     str.format('{0:.4f}',priorityAvgf1/len(informationTypes2Index))+
     " & "+
     str.format('{0:.4f}',priorityAvgCorrHigh/numHighInformationTypes)+
     " & "+
     str.format('{0:.4f}',priorityAvgCorr/len(informationTypes2Index))+
     " \\\\")

print(runName+" & "+resultLine)

resultsFile.write("--------------------------------------------------"+"\n")
resultsFile.write("LATEX"+"\n")
resultsFile.write("--------------------------------------------------"+"\n")
resultsFile.write(runName+" & "+resultLine + "\n")

Run & NDCG & CF1-H & CF1-A & CAcc & PErr-H & PErr-A & PCorr-H & PCorr-A \\
njit_augly & 0.4242 & 0.2441 & 0.2729 & 0.8811 & 0.1723 & 0.1565 & 0.1700 & 0.2172 \\

86


# Done
resultsFile.close() 
perTopicFile.close()
perEventFile.close()


# header = [
#     "Run",
#     "date",
#     "team",
#     "description",
#     "paper",
#     "code",
#     "nDCG@100",
#     "Info-Type F1 [Actionable]",
#     "Info-Type F1 [All]",
#     "Info-Type Accuracy",
#     "Priority F1 [Actionable]",
#     "Priority F1 [All]",
#     "Priority R [Actionable]",
#     "Priority R [All]",
# ]

import csv
if os.path.isfile("metadata.json"):
    this_cwd = os.getcwd()
    sub_date_ = this_cwd.partition("submissions/")[-1].partition("-")[0]
    sub_date = "%s/%s/%s" % (sub_date_[:4], sub_date_[4:6], sub_date_[6:])
    
    leaderboard_entry = None
    with open("metadata.json", "r") as in_file:
        
        metadata = json.load(in_file)
        
        leaderboard_entry = [
            runName,
            sub_date,
            metadata["organization"].lower(),
            metadata["model_description"],
            metadata["paper"] if metadata["paper"].startswith("http") else "",
            metadata["code"] if metadata["code"].startswith("http") else "",
            str.format('{0:.4f}',system_ndcg_micro),
            str.format('{0:.4f}',avgF1High/numHighInformationTypes),
            str.format('{0:.4f}',avgF1/numInformationTypes),
            str.format('{0:.4f}',avgAccuracy/numInformationTypes),
            str.format('{0:.4f}',priorityAvgf1High/numHighInformationTypes),
            str.format('{0:.4f}',priorityAvgf1/len(informationTypes2Index)),
            str.format('{0:.4f}',priorityAvgCorrHigh/numHighInformationTypes),
            str.format('{0:.4f}',priorityAvgCorr/len(informationTypes2Index)),
        ]
        
    with open(runName+".v"+str(version)+"."+edition+".leaderboard.csv","w") as csvResultsFile:
        leader_writer = csv.writer(csvResultsFile)
        leader_writer.writerow(leaderboard_entry)