# --------------------------------------------------
# TREC IS 2021b Evaluation Script
# Configured for 2021-B Events
# Used to evaluate TREC-IS runs
# --------------------------------------------------
version = 3.0 # Notebook Version Number
edition = "2021b.all"
import os
cwd = os.getcwd()
# Configuration Information
# Do we try and normalize the run priority scores?
enablePriorityNorm = True
# Do we try and normalize the category scores?
enableCategoryNorm = True
# Score threshold
defaultScoreThreshold = 0.5
taskCategories = [
"CallToAction-Donations",
"CallToAction-MovePeople",
"CallToAction-Volunteer",
"Other-Advice",
"Other-ContextualInformation",
"Other-Discussion",
"Other-Irrelevant",
"Other-Sentiment",
"Report-CleanUp",
"Report-EmergingThreats",
"Report-Factoid",
"Report-FirstPartyObservation",
"Report-Hashtags",
"Report-Location",
"Report-MultimediaShare",
"Report-News",
"Report-NewSubEvent",
"Report-Official",
"Report-OriginalEvent",
"Report-ServiceAvailable",
"Report-ThirdPartyObservation",
"Report-Weather",
"Request-GoodsServices",
"Request-InformationWanted",
"Request-SearchAndRescue",
]
# What we consider to be highly important categories of information
highImportCategories = [
"Request-GoodsServices",
"Request-SearchAndRescue",
"CallToAction-MovePeople",
"Report-EmergingThreats",
"Report-NewSubEvent",
"Report-ServiceAvailable"
]
highImportCategoriesShort = [
"GoodsServices",
"SearchAndRescue",
"MovePeople",
"EmergingThreats",
"NewSubEvent",
"ServiceAvailable"
]
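# Illustrative sanity check (not part of the original evaluation script): the
# short names above should simply be the long names with their type prefix
# stripped, which the assertion below verifies.
assert highImportCategoriesShort == [c.split("-")[1] for c in highImportCategories]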
# Priority map
priorityScoreMap = {
"Critical": 1.0,
"High": 0.75,
"Medium": 0.5,
"Low": 0.25,
"Unknown": 0.25,
}
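# Illustrative sketch (assumption, not used elsewhere): priorityScoreMap turns
# assessor priority labels into numeric gains (e.g. "High" -> 0.75). Stage 5
# below inverts this with simple thresholds when binning a run's numeric
# priority score back into a label, roughly as follows:
def examplePriorityScoreToLabel(score):
    # score > 0.75 -> Critical, > 0.5 -> High, > 0.25 -> Medium, else Low
    if score > priorityScoreMap["High"]:
        return "Critical"
    elif score > priorityScoreMap["Medium"]:
        return "High"
    elif score > priorityScoreMap["Low"]:
        return "Medium"
    return "Low"
assert examplePriorityScoreToLabel(0.8) == "Critical"
assert examplePriorityScoreToLabel(0.3) == "Medium"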
# Parameters
var_lambda = 0.75 # weight to place on actionable information categories in comparison to non-actionable categories
var_alpha = 0.3 # Flat gain for providing a correct alert, regardless of the categories selected
# Events with no data, so we should skip them
# Updated between 2021a and 2021b so that we use *all* data
skipEvents = [
# '2015_09_28_hurricane_joaquin.2015',
# '2017_03_23_cyclone_debbie.2017',
# '2018_02_24_anticyclone_hartmut.2018',
# '2018_07_13_ferguson_wildfire.2018',
# '2018_07_23_cranston_wildfire.2018',
# '2018_09_07_hurricane_florence.2018',
# '2018_10_07_hurricane_michael.2018',
# '2019_09_17_tropicalstorm_imelda.2019',
# '2019_karnataka_floods',
# '2019_spring_floods_in_ontario_quebec_and_new_brunswick',
# '2020_01_28_bar_shooting_nc.2020',
# '2020_02_07_rutherford_tn_floods.2020',
# '2020_05_26_edenville_dam_failure.2020.corrected',
# '2020_08_27_hurricane_laura.2020',
# '2020_09_11_hurricane_sally.2020',
# '2020_afghanistan_flood',
# '2020_hpakant_jade_mine_disaster',
# '2020_kerala_floods',
# 'T2020_02_03_texas_university_shooting.2020',
# 'UNASSIGNED',
# 'indonesia_earthquake.2019'
"2020_05_26_edenville_dam_failure.2020.corrected",
"2018_10_07_hurricane_michael.2018",
"2020_01_28_bar_shooting_nc.2020",
"T2020_02_03_texas_university_shooting.2020",
"2020_02_07_rutherford_tn_floods.2020",
"UNASSIGNED",
"indonesia_earthquake.2019",
"2015_09_28_hurricane_joaquin.2015",
"2017_03_23_cyclone_debbie.2017",
"2018_02_24_anticyclone_hartmut.2018",
"2018_07_13_ferguson_wildfire.2018",
"2018_07_23_cranston_wildfire.2018",
"2018_09_07_hurricane_florence.2018",
"2019_09_17_tropicalstorm_imelda.2019",
"2019_karnataka_floods",
"2019_spring_floods_in_ontario_quebec_and_new_brunswick",
"2020_08_27_hurricane_laura.2020",
"2020_09_11_hurricane_sally.2020",
"2020_afghanistan_flood",
"2020_hpakant_jade_mine_disaster",
"2020_kerala_floods",
]
import glob
runFile = None
for f in glob.glob("*.gz"):
runFile = f
print("Run File:", f)
Run File: run.json.gz
import gzip
import json
runName = None
with gzip.open(runFile, "r") as inRunFile:
for line in inRunFile:
line = line.decode("utf8")
# runName = line.rpartition("\t")[2].strip()
runName = json.loads(line)["runtag"]
break
print("Run Name:", runName)
Run Name: njit_label_prop
# Do we try and normalize the run priority scores?
enablePriorityNorm = False
dataDir = "../../data/2021b"
# The location of the topics file
topicsFile = "%s/2021a.topics" % dataDir
# The location of the ground truth data against which to compare the run
classificationLabelFiles = [
# "%s/TRECIS-2021A-crisis.labels.prelim.json" % dataDir,
# "%s/TRECIS-2021A-crisis.labels.prelim.pt2.json" % dataDir,
# "%s/TRECIS-crisis.labels.2021b.json" % dataDir,
"%s/TRECIS-crisis.labels.2021.all.json" % dataDir,
]
# The location of the ontology file
ontologyFile = "%s/TRECIS-2021A-ITypes.json" % dataDir
topicArray = []
with open(topicsFile, "r") as inTopicsFile:
topicNum = None
topicDataset = None
for line_ in inTopicsFile:
line = line_.strip()
if line == "</top>":
if topicDataset in skipEvents:
continue
topicArray.append((topicDataset, topicNum))
if line.startswith("<num>"):
topicNum = line.partition("<num>")[2].partition("</num>")[0]
if line.startswith("<dataset>"):
topicDataset = line.partition("<dataset>")[2].partition("</dataset>")[0]
for row in topicArray:
print(row)
('2020_01_27_houston_explosion.2020', 'TRECIS-CTIT-H-076')
('2020_02_10_mideast_tornadoes.day1_mississipi.2020', 'TRECIS-CTIT-H-080')
('2020_02_10_mideast_tornadoes.day2_al.2020', 'TRECIS-CTIT-H-081')
('2020_02_10_mideast_tornadoes.day3_md.2019', 'TRECIS-CTIT-H-082')
('2020_05_06_tn_derecho.2020', 'TRECIS-CTIT-H-083')
('brooklynblockparty_shooting.2019', 'TRECIS-CTIT-H-085')
('2016_puttingal_temple', 'TRECIS-CTIT-H-089')
('2017_12_04_thomas_wildfire.2017', 'TRECIS-CTIT-H-091')
('2017_12_07_lilac_wildfire.2017', 'TRECIS-CTIT-H-092')
('2018_07_23_klamathon_wildfire.2018', 'TRECIS-CTIT-H-096')
('2018_08_05_holy_wildfire.2018', 'TRECIS-CTIT-H-097')
('2018_11_07_Woolsey_wildfire.2018', 'TRECIS-CTIT-H-100')
('2018_maryland_flood', 'TRECIS-CTIT-H-101')
('2018_pittsburgh_synagogue_shooting', 'TRECIS-CTIT-H-102')
('2019_03_01_alberta_wildfire.2019.v2', 'TRECIS-CTIT-H-103')
('2019_08_25_hurricane_dorian.2019', 'TRECIS-CTIT-H-104')
('2019_10_10_saddleridge_wildfire.2019', 'TRECIS-CTIT-H-106')
('2019_10_25_kincade_wildfire.2019', 'TRECIS-CTIT-H-107')
('2019_durham_gas_explosion', 'TRECIS-CTIT-H-108')
('2019_saugus_high_school_shooting', 'TRECIS-CTIT-H-110')
('2019_townsville_flood', 'TRECIS-CTIT-H-112')
('2020_easter_tornado_outbreak', 'TRECIS-CTIT-H-116')
('2020_tornado_outbreak_of_april', 'TRECIS-CTIT-H-119')
('2020_tornado_outbreak_of_march', 'TRECIS-CTIT-H-120')
('2020_visakhapatnam_gas_leak', 'TRECIS-CTIT-H-121')
('tornado_outbreak_of_november_30_december_2018', 'TRECIS-CTIT-H-122')
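# For reference, the parser above assumes a TREC-style topics file in which
# each topic looks roughly like the hypothetical snippet below; only the
# <num>, <dataset> and closing </top> tags are actually read.
exampleTopicEntry = """
<top>
<num>TRECIS-CTIT-H-076</num>
<dataset>2020_01_27_houston_explosion.2020</dataset>
</top>
"""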
# --------------------------------------------------
# Static data for the 2021 edition
# --------------------------------------------------
# Identifiers for the test events
eventidTopicidMap = dict(topicArray)
eventIdentifiers = list(eventidTopicidMap.keys())
resultsFile = open(runName+".results.v"+str(version)+"."+edition+".overall.txt","w+")
resultsFile.write("TREC-IS "+edition+" Notebook Evaluator v"+str(version)+"\n")
resultsFile.write("Run: "+runName+" ("+runFile+")"+"\n")
resultsFile.write(""+"\n")
perTopicFile = open(runName+".results.v"+str(version)+"."+edition+".pertopic.txt","w+")
perTopicFile.write("TREC-IS "+edition+" Notebook Evaluator v"+str(version)+"\n")
perTopicFile.write("Run: "+runName+" ("+runFile+")"+"\n")
perTopicFile.write(""+"\n")
perEventFile = open(runName+".results.v"+str(version)+"."+edition+".perevent.txt","w+")
perEventFile.write("TREC-IS "+edition+" Notebook Evaluator v"+str(version)+"\n")
perEventFile.write("Run: "+runName+" ("+runFile+")"+"\n")
perEventFile.write(""+"\n")
# --------------------------------------------------
# Processing Starts Here
# --------------------------------------------------
import json
import gzip
import math
import numpy as np
from pprint import pprint
import matplotlib.pyplot as plt
# --------------------------------------------------
# Stage 1: Load the ground truth dataset
# --------------------------------------------------
groundtruthJSON = []
for groundtruthFile in classificationLabelFiles:
print("Reading "+groundtruthFile)
with open(groundtruthFile, encoding='iso-8859-1') as groundtruthJSONFile:
groundtruthJSON.append(json.load(groundtruthJSONFile))
#pprint(groundtruthJSON["events"])
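# Minimal sketch (assumption) of the structure Stage 4 expects from each
# ground-truth file; the field names come from the accesses below, the values
# are invented for illustration:
# {
#   "events": [
#     {"eventid": "2020_01_27_houston_explosion.2020",
#      "tweets": [
#        {"postID": "...",
#         "postCategories": ["Weather", "Location"],   # short-form names assumed
#         "postPriority": "High"}
#      ]}
#   ]
# }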
# --------------------------------------------------
# Stage 2: Load run file
# --------------------------------------------------
with gzip.open(runFile, "r") as openRunFile:
# runContents = [line.decode("utf8") for line in openRunFile.readlines()] # raw lines, not yet parsed
runContents = [json.loads(line.decode("utf8")) for line in openRunFile.readlines()] # decode and parse each JSON line
#pprint(runContents[0])
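# Each line of the run file is a JSON object; the fields consumed in Stage 5
# are "topic", "tweet_id", "info_type_labels" (binary flags, one per entry in
# taskCategories), "info_type_scores", and "priority", plus the "runtag" read
# earlier. A hypothetical line might look like:
# {"runtag": "njit_label_prop", "topic": "2020_01_27_houston_explosion.2020",
#  "tweet_id": "...", "priority": 0.62,
#  "info_type_labels": [0, 1, 0, ...], "info_type_scores": [0.01, 0.83, ...]}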
Reading ../../data/2021b/TRECIS-crisis.labels.2021.all.json
# --------------------------------------------------
# Stage 3: Load the categories
# --------------------------------------------------
with open(ontologyFile, encoding='utf-8') as ontologyJSONFile:
ontologyJSON = json.load(ontologyJSONFile)
informationTypes2Index = {} # category -> numerical index
informationTypesShort2Index = {} # category short form (e.g. EmergingThreats rather than Report-EmergingThreats) -> numerical index
for informationTypeJSON in ontologyJSON["informationTypes"]:
informationTypeId = informationTypeJSON["id"]
informationTypeIndex = taskCategories.index(informationTypeId)
informationTypes2Index[informationTypeId] = informationTypeIndex
informationTypesShort2Index[informationTypeId.split("-")[1]] = informationTypeIndex
# -----------------------------------------------------------
# Stage 4: Produce ground truth maps between tweetIds and categories
# -----------------------------------------------------------
# Notes: Ground truth is used as a base; if a run includes tweets
# not in the ground truth, they will be ignored
# Assumptions: A tweet will not be returned for multiple events
tweetId2TRECInfoCategories = {} # tweet id -> Array of categories selected by assessors
tweetId2TRECHighImportInfoCategories = {} # tweet id -> Array of categories selected by assessors
tweetId2TRECLowImportInfoCategories = {} # tweet id -> Array of categories selected by assessors
tweetId2TRECPriorityCategory = {} # tweet id -> priority label (Critical,High,Medium,Low)
index2TweetId = {} # ordered tweets
event2tweetIds = {} # event -> tweet ids for tweets within that event
countHighCriticalImport = 0
countLowMediumImport = 0
tweetsSeen = []
invertedPriorityScoreMap = {
v:k for k,v in priorityScoreMap.items()
}
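# Worked example (illustrative) of the duplicate-annotation merge below: a
# tweet labelled both "Low" and "High" keeps "High", since max(0.25, 0.75)
# maps back to "High". Note that "Low" and "Unknown" share the score 0.25,
# so the inverted map resolves 0.25 to "Unknown".
assert invertedPriorityScoreMap[max(priorityScoreMap["Low"], priorityScoreMap["High"])] == "High"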
tweetIndex = 0
for groundtruth in groundtruthJSON:
for eventJSON in groundtruth["events"]:
eventid = eventJSON["eventid"]
print(eventid)
if eventid in skipEvents:
continue
if not event2tweetIds.get(eventid):
event2tweetIds[eventid] = []
if any(eventid in s for s in eventIdentifiers):
# iterate over tweets in the event
for tweetJSON in eventJSON["tweets"]:
tweetid = tweetJSON["postID"]
categories = tweetJSON["postCategories"]
priority = tweetJSON["postPriority"]
if priority == "High" or priority == "Critical":
countHighCriticalImport = countHighCriticalImport + 1
if priority == "Low" or priority == "Medium":
countLowMediumImport = countLowMediumImport + 1
# check categories for name issues and correct if possible
cleanedCategories = []
highImportCats = []
lowImportCats = []
for categoryId in categories:
if not any(categoryId in s for s in informationTypesShort2Index.keys()):
# print("Found unknown category in ground truth "+categoryId+", ignoring...")
pass
else:
cleanedCategories.append(categoryId)
if any(categoryId in s for s in highImportCategoriesShort):
highImportCats.append(categoryId)
else:
lowImportCats.append(categoryId)
if tweetid not in tweetsSeen:
event2tweetIds[eventid].append(tweetid)
tweetId2TRECInfoCategories[tweetid] = cleanedCategories
tweetId2TRECHighImportInfoCategories[tweetid] = highImportCats
tweetId2TRECLowImportInfoCategories[tweetid] = lowImportCats
tweetId2TRECPriorityCategory[tweetid] = priority
index2TweetId[tweetIndex] = tweetid
tweetIndex = tweetIndex + 1
tweetsSeen.append(tweetid)
else:
tweetId2TRECInfoCategories[tweetid] = list(set(
cleanedCategories + tweetId2TRECInfoCategories[tweetid]
))
prePriorityScore = priorityScoreMap[tweetId2TRECPriorityCategory[tweetid]]
thisPriorityScore = priorityScoreMap[priority]
tweetId2TRECPriorityCategory[tweetid] = invertedPriorityScoreMap[
max(prePriorityScore, thisPriorityScore)
]
else:
print("WARN: Found ground truth data for event not in the topic set "+eventid+", ignoring...")
2020_01_27_houston_explosion.2020 2020_01_28_bar_shooting_nc.2020 T2020_02_03_texas_university_shooting.2020 2020_02_07_rutherford_tn_floods.2020 2020_02_10_mideast_tornadoes.day1_mississipi.2020 2020_02_10_mideast_tornadoes.day2_al.2020 2020_02_10_mideast_tornadoes.day3_md.2019 2020_05_06_tn_derecho.2020 2020_05_26_edenville_dam_failure.2020.corrected brooklynblockparty_shooting.2019 UNASSIGNED indonesia_earthquake.2019 2015_09_28_hurricane_joaquin.2015 2016_puttingal_temple 2017_03_23_cyclone_debbie.2017 2017_12_04_thomas_wildfire.2017 2017_12_07_lilac_wildfire.2017 2018_02_24_anticyclone_hartmut.2018 2018_07_13_ferguson_wildfire.2018 2018_07_23_cranston_wildfire.2018 2018_07_23_klamathon_wildfire.2018 2018_08_05_holy_wildfire.2018 2018_09_07_hurricane_florence.2018 2018_10_07_hurricane_michael.2018 2018_11_07_Woolsey_wildfire.2018 2018_maryland_flood 2018_pittsburgh_synagogue_shooting 2019_03_01_alberta_wildfire.2019.v2 2019_08_25_hurricane_dorian.2019 2019_09_17_tropicalstorm_imelda.2019 2019_10_10_saddleridge_wildfire.2019 2019_10_25_kincade_wildfire.2019 2019_durham_gas_explosion 2019_karnataka_floods 2019_saugus_high_school_shooting 2019_spring_floods_in_ontario_quebec_and_new_brunswick 2019_townsville_flood 2020_08_27_hurricane_laura.2020 2020_09_11_hurricane_sally.2020 2020_afghanistan_flood 2020_easter_tornado_outbreak 2020_hpakant_jade_mine_disaster 2020_kerala_floods 2020_tornado_outbreak_of_april 2020_tornado_outbreak_of_march 2020_visakhapatnam_gas_leak tornado_outbreak_of_november_30_december_2018
# -----------------------------------------------------------
# Stage 5: Produce run predicted maps between tweetIds and categories
# -----------------------------------------------------------
tweetId2RunInfoCategories = {} # tweet id -> predicted category by participant system
tweetId2RunHighImportInfoCategories = {} # tweet id -> predicted category by participant system
tweetId2RunLowImportInfoCategories = {} # tweet id -> predicted category by participant system
tweetId2RunInfoCategoriesProb = {} # tweet id -> predicted category probability by participant system
tweetId2RunInfoCategoriesProbNorm = {} # tweet id -> predicted category probability by participant system
tweetId2RunPriorityScore = {} # tweet id -> importance score from participant system
tweetId2RunPriorityCategory = {} # tweet id -> importance category (Critical, High, Medium, Low)
tweetId2RunPriorityScoreNorm = {} # tweet id -> importance score from participant system
event2TweetIdRank = {} # event -> (rank,tweetid)
maxPrediction = -999999
minPrediction = 999999
maxCategory = -999999
minCategory = 999999
for predictionParts in runContents:
#print(predictionParts)
if (len(predictionParts)<6 ):
print(predictionParts)
continue
else:
eventId = predictionParts["topic"]
if eventId in skipEvents:
continue
tweetId = predictionParts["tweet_id"]
rank = 0
#print(predictionParts[5])
category_scores = predictionParts["info_type_scores"]
category_labels = predictionParts["info_type_labels"]
priority = float(predictionParts["priority"])
if priority > maxPrediction:
maxPrediction = priority
if priority < minPrediction:
minPrediction = priority
cleanedCategories = []
cleanedCategoriesProbs = []
highImportCats = []
lowImportCats = []
# Handle category flags
for catIndex, categoryLabel in enumerate(category_labels):
# check if we have a binary flag for this label
if categoryLabel == 0:
# False flag, so skip
continue
categoryId = taskCategories[catIndex]
if not any(categoryId in s for s in informationTypes2Index.keys()):
print("Found unknown category in run "+categoryId+", ignoring...")
else:
cleanedCategories.append(categoryId)
if any(categoryId in s for s in highImportCategories):
highImportCats.append(categoryId)
else:
lowImportCats.append(categoryId)
# Process category probabilities
for categoryProbability in category_scores:
if categoryProbability > maxCategory:
maxCategory = categoryProbability
if categoryProbability < minCategory:
minCategory = categoryProbability
cleanedCategoriesProbs.append(categoryProbability)
tweetId2RunHighImportInfoCategories[tweetId] = highImportCats
tweetId2RunLowImportInfoCategories[tweetId] = lowImportCats
tweetId2RunInfoCategories[tweetId] = cleanedCategories
tweetId2RunInfoCategoriesProb[tweetId] = cleanedCategoriesProbs
tweetId2RunPriorityScore[tweetId] = priority
if priority > priorityScoreMap["High"]:
tweetId2RunPriorityCategory[tweetId] = "Critical"
elif priority > priorityScoreMap["Medium"]:
tweetId2RunPriorityCategory[tweetId] = "High"
elif priority > priorityScoreMap["Low"]:
tweetId2RunPriorityCategory[tweetId] = "Medium"
else:
tweetId2RunPriorityCategory[tweetId] = "Low"
if not event2TweetIdRank.get(eventId):
event2TweetIdRank[eventId] = []
rankTuple = (tweetId,rank)
event2TweetIdRank.get(eventId).append(rankTuple)
for eventId in event2TweetIdRank.keys():
tweetsSorted = sorted(event2TweetIdRank.get(eventId), key=lambda tup: tup[1])
event2TweetIdRank[eventId] = tweetsSorted
for i in range(len(index2TweetId)):
tweetId = index2TweetId[i]
if tweetId2RunPriorityScore.get(tweetId):
if enablePriorityNorm:
if (maxPrediction-minPrediction) == 0.0:
tweetId2RunPriorityScoreNorm[tweetId] = 0.0
else:
tweetId2RunPriorityScoreNorm[tweetId] = (tweetId2RunPriorityScore.get(tweetId)-minPrediction)/(maxPrediction-minPrediction)
else:
tweetId2RunPriorityScoreNorm[tweetId] = tweetId2RunPriorityScore.get(tweetId)
else:
tweetId2RunPriorityScoreNorm[tweetId] = 0.0
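# Illustrative check of the min-max normalization used above (only applied
# when enablePriorityNorm is True): a run priority of 0.6, with observed
# scores spanning [0.2, 0.8], rescales to (0.6 - 0.2) / (0.8 - 0.2) = 2/3.
assert abs((0.6 - 0.2) / (0.8 - 0.2) - 2.0 / 3.0) < 1e-9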
# --------------------------------------------------
# Stage 6: Create ground truth vectors per category
# --------------------------------------------------
category2GroundTruth = {} # category -> tweet vector with binary 1 vs all ground truth category labels
for categoryId in informationTypes2Index.keys():
categoryIdShort = categoryId.split("-")[1]
categoryVector = []
for i in range(len(index2TweetId)):
tweetId = index2TweetId[i]
categories = tweetId2TRECInfoCategories.get(tweetId)
#pprint(categories)
if any(categoryIdShort in s for s in categories):
categoryVector.append(1)
else:
categoryVector.append(0)
category2GroundTruth[categoryId] = categoryVector
#pprint(category2GroundTruth)
# --------------------------------------------------
# Stage 7: Create run vectors per category
# --------------------------------------------------
# Assumptions: If a run misses a tweet, we assume it has
# no categories
category2Predicted = {} # category -> tweet vector with binary 1 vs all predicted by system labels
for categoryId in informationTypes2Index.keys():
categoryIdShort = categoryId.split("-")[1]
categoryVector = []
for i in range(len(index2TweetId)):
tweetId = index2TweetId[i]
if tweetId2RunInfoCategories.get(tweetId):
categories = tweetId2RunInfoCategories.get(tweetId)
if any(categoryIdShort in s for s in categories):
categoryVector.append(1)
else:
categoryVector.append(0)
else:
categoryVector.append(0)
category2Predicted[categoryId] = categoryVector
#pprint(category2Predicted)
# --------------------------------------------------
# Stage 8: Make event category vectors
# --------------------------------------------------
event2groundtruth = {} # event -> category -> tweet vector with binary 1 vs all ground truth category labels
for eventId in eventIdentifiers:
eventCategories = {}
for categoryId in informationTypes2Index.keys():
categoryIdShort = categoryId.split("-")[1]
categoryVector = []
# print(eventId)
for tweetId in event2tweetIds.get(eventId):
# print(tweetId)
categories = tweetId2TRECInfoCategories.get(tweetId)
if any(categoryIdShort in s for s in categories):
categoryVector.append(1)
else:
categoryVector.append(0)
eventCategories[categoryId] = categoryVector
event2groundtruth[eventId] = eventCategories
event2prediction = {} # event -> category -> tweet vector with binary 1 vs all predicted by system labels
for eventId in eventIdentifiers:
print(eventId)
eventCategories = {}
for categoryId in informationTypes2Index.keys():
categoryIdShort = categoryId.split("-")[1]
categoryVector = []
# print(tweetId)
for tweetId in event2tweetIds.get(eventId):
#print(tweetId)
categories = tweetId2RunInfoCategories.get(tweetId)
if categories is None:
categories = []
tweetId2RunInfoCategories[tweetId] = categories
if any(categoryId in s for s in categories):
categoryVector.append(1)
else:
categoryVector.append(0)
eventCategories[categoryId] = categoryVector
event2prediction[eventId] = eventCategories
2020_01_27_houston_explosion.2020 2020_02_10_mideast_tornadoes.day1_mississipi.2020 2020_02_10_mideast_tornadoes.day2_al.2020 2020_02_10_mideast_tornadoes.day3_md.2019 2020_05_06_tn_derecho.2020 brooklynblockparty_shooting.2019 2016_puttingal_temple 2017_12_04_thomas_wildfire.2017 2017_12_07_lilac_wildfire.2017 2018_07_23_klamathon_wildfire.2018 2018_08_05_holy_wildfire.2018 2018_11_07_Woolsey_wildfire.2018 2018_maryland_flood 2018_pittsburgh_synagogue_shooting 2019_03_01_alberta_wildfire.2019.v2 2019_08_25_hurricane_dorian.2019 2019_10_10_saddleridge_wildfire.2019 2019_10_25_kincade_wildfire.2019 2019_durham_gas_explosion 2019_saugus_high_school_shooting 2019_townsville_flood 2020_easter_tornado_outbreak 2020_tornado_outbreak_of_april 2020_tornado_outbreak_of_march 2020_visakhapatnam_gas_leak tornado_outbreak_of_november_30_december_2018
# -----------------------------------------------------------
# Stage 9: Make priority classification vectors
# -----------------------------------------------------------
category2GroundTruthPriority = {} # category -> tweet vector with binary 1 vs all ground truth priority labels
for categoryId in informationTypes2Index.keys():
categoryIdShort = categoryId.split("-")[1]
priorityVector = []
for i in range(len(index2TweetId)):
tweetId = index2TweetId[i]
categories = tweetId2TRECInfoCategories.get(tweetId)
if any(categoryIdShort in s for s in categories):
priority = tweetId2TRECPriorityCategory.get(tweetId)
priorityVector.append(priority)
category2GroundTruthPriority[categoryId] = priorityVector
category2PredictedPriority = {} # category -> tweet vector with binary 1 vs all predicted by system labels
category2PredictedPriorityScore = {} # Category -> tweet vector with priority scores
for categoryId in informationTypes2Index.keys():
categoryIdShort = categoryId.split("-")[1]
categoryVector = []
categoryScoreVector = []
for i in range(len(index2TweetId)):
tweetId = index2TweetId[i]
categories = tweetId2TRECInfoCategories.get(tweetId)
if any(categoryIdShort in s for s in categories):
if tweetId2RunPriorityCategory.get(tweetId):
priority = tweetId2RunPriorityCategory.get(tweetId)
priorityScore = tweetId2RunPriorityScore.get(tweetId)
categoryVector.append(priority)
categoryScoreVector.append(priorityScore)
else:
categoryVector.append("Low") # default to low priority
categoryScoreVector.append(0.25)
category2PredictedPriority[categoryId] = categoryVector
category2PredictedPriorityScore[categoryId] = categoryScoreVector
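# Sanity check (illustrative, not part of the original notebook): both loops
# above filter tweets on the ground-truth categories, so the ground-truth and
# predicted priority vectors should have the same length for every category.
for categoryId in informationTypes2Index.keys():
    assert len(category2GroundTruthPriority[categoryId]) == len(category2PredictedPriority[categoryId])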
# --------------------------------------------------
# Disable Warnings (comment this out when debugging!)
# --------------------------------------------------
import warnings
# warnings.filterwarnings("ignore") # ignore warnings about 0-score categories
# --------------------------------------------------
# TREC-IS 2021A
# Priority-Centric Discounted Cumulative Gain
# --------------------------------------------------
import pandas as pd
def calc_dcg(scores, at_k=100):
position = 1
accumulator = 0.0
for score in scores[:at_k]:
numerator = 2 ** score - 1
denom = np.log2(position + 1)
accumulator += numerator / denom
position += 1
return accumulator
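# Worked example (illustrative): for graded relevances [4, 3],
# calc_dcg gives (2**4 - 1)/log2(2) + (2**3 - 1)/log2(3), i.e. about 19.42.
assert abs(calc_dcg([4, 3]) - (15.0 + 7.0 / np.log2(3))) < 1e-9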
priority_map = {
"Unknown": 1,
"Low": 1,
"Medium": 2,
"High": 3,
"Critical": 4,
}
at_k = 100
tweetId2TRECPriorityCategory_score = {
k:priority_map[v] for k,v in tweetId2TRECPriorityCategory.items()
}
tweetId2TRECPriorityCategory_scores_sorted = sorted(
tweetId2TRECPriorityCategory_score.values(),
reverse=True
)
best_dcg_per_event = {}
for event, rel_tweets in event2tweetIds.items():
print(event)
tweetId2TRECPriorityCategory_scores_sorted = sorted(
[tweetId2TRECPriorityCategory_score[x] for x in rel_tweets],
reverse=True
)
ideal_dcg = calc_dcg(tweetId2TRECPriorityCategory_scores_sorted, at_k)
print("\tBest DCG:", ideal_dcg)
best_dcg_per_event[event] = ideal_dcg
print("Mean:", np.mean(list(best_dcg_per_event.values())))
print()
# Code below calculates the DCG for a system's
# ranked priority tweets. We have to do some
# sampling here to break ties among tweets with
# the same priority scores.
# Build a dataframe from the system's provided
# priority scores, so we can identify what the
# top-most priorities are and get a count of
# the number of tweets in each priority bin.
priority_df = pd.DataFrame(
[(k, priority_map[v]) for k, v in tweetId2RunPriorityCategory.items()],
columns=["tweet_id", "priority"]
)
# Build metrics for each event
system_dcg_per_event = {}
for event, rel_tweets in event2tweetIds.items():
print("Event:", event)
local_priority_df = priority_df[priority_df["tweet_id"].isin(set(rel_tweets))]
unique_scores = local_priority_df["priority"].value_counts()
# Find the top priority scores that would be included
# in the top at_k results.
total = 0
top_keys = []
candidates = {}
for top in sorted(unique_scores.index, reverse=True):
# We store this key, so we can go back and shuffle
# tweets with this score.
top_keys.append(top)
local_restricted_df = local_priority_df[local_priority_df["priority"] == top]
candidates[top] = list(local_restricted_df["tweet_id"])
total += local_restricted_df.shape[0]
# Once we have enough samples, stop.
if ( total > at_k ):
break
# Now we generate a distribution over the DCG for this
# system, repeating the sampling a number of times to remove
# dependence on our selection of the top k tweets
random_dcgs = []
for i in range(100):
local_tweet_ids = []
for top in top_keys:
this_top_tweets = candidates[top][:]
np.random.shuffle(this_top_tweets)
needed = at_k - len(local_tweet_ids)
local_tweet_ids.extend(this_top_tweets[:needed])
local_scores = [tweetId2TRECPriorityCategory_score[x] for x in local_tweet_ids]
random_dcgs.append(calc_dcg(local_scores))
system_dcg = np.mean(random_dcgs)
system_ndcg_ = system_dcg / best_dcg_per_event[event]
print("\tnDCG:", system_ndcg_)
system_dcg_per_event[event] = system_ndcg_
print()
system_ndcg_micro = np.mean(list(system_dcg_per_event.values()))
print("System Event-Micro nDCG:", system_ndcg_micro)
resultsFile.write("--------------------------------------------------"+"\n")
resultsFile.write("EVALUATON: nDCG and Priority"+"\n")
resultsFile.write("Overall performance"+"\n")
resultsFile.write("--------------------------------------------------"+"\n")
resultsFile.write("> nDCG:"+"\t"+str(system_ndcg_micro)+"\n")
resultsFile.write(""+"\n")
2020_01_27_houston_explosion.2020 Best DCG: 176.99559032459564 2020_02_10_mideast_tornadoes.day1_mississipi.2020 Best DCG: 268.88459894996123 2020_02_10_mideast_tornadoes.day2_al.2020 Best DCG: 270.1716952398847 2020_02_10_mideast_tornadoes.day3_md.2019 Best DCG: 135.38775246204446 2020_05_06_tn_derecho.2020 Best DCG: 167.06354661312534 brooklynblockparty_shooting.2019 Best DCG: 179.1756130795261 2016_puttingal_temple Best DCG: 314.08006311421406 2017_12_04_thomas_wildfire.2017 Best DCG: 300.71399384300895 2017_12_07_lilac_wildfire.2017 Best DCG: 314.08006311421406 2018_07_23_klamathon_wildfire.2018 Best DCG: 221.46334445469358 2018_08_05_holy_wildfire.2018 Best DCG: 153.96993418707177 2018_11_07_Woolsey_wildfire.2018 Best DCG: 175.67469323453255 2018_maryland_flood Best DCG: 285.7119531591263 2018_pittsburgh_synagogue_shooting Best DCG: 111.85075929877581 2019_03_01_alberta_wildfire.2019.v2 Best DCG: 62.88708564345522 2019_08_25_hurricane_dorian.2019 Best DCG: 146.57069611996656 2019_10_10_saddleridge_wildfire.2019 Best DCG: 173.00802656786584 2019_10_25_kincade_wildfire.2019 Best DCG: 314.08006311421406 2019_durham_gas_explosion Best DCG: 201.07148118577902 2019_saugus_high_school_shooting Best DCG: 314.08006311421406 2019_townsville_flood Best DCG: 314.08006311421406 2020_easter_tornado_outbreak Best DCG: 214.9714167256293 2020_tornado_outbreak_of_april Best DCG: 314.08006311421406 2020_tornado_outbreak_of_march Best DCG: 267.51977363880474 2020_visakhapatnam_gas_leak Best DCG: 314.08006311421406 tornado_outbreak_of_november_30_december_2018 Best DCG: 314.08006311421406 Mean: 231.7589407554446 Event: 2020_01_27_houston_explosion.2020 nDCG: 0.27780543551693 Event: 2020_02_10_mideast_tornadoes.day1_mississipi.2020 nDCG: 0.4220503544399323 Event: 2020_02_10_mideast_tornadoes.day2_al.2020 nDCG: 0.3745338181274062 Event: 2020_02_10_mideast_tornadoes.day3_md.2019 nDCG: 0.4061232730457823 Event: 2020_05_06_tn_derecho.2020 nDCG: 0.43527383514448037 Event: brooklynblockparty_shooting.2019 nDCG: 0.17289493977359197 Event: 2016_puttingal_temple nDCG: 0.2527608591013886 Event: 2017_12_04_thomas_wildfire.2017 nDCG: 0.35332605501113173 Event: 2017_12_07_lilac_wildfire.2017 nDCG: 0.3656410206279158 Event: 2018_07_23_klamathon_wildfire.2018 nDCG: 0.49783284897545427 Event: 2018_08_05_holy_wildfire.2018 nDCG: 0.452843321365631 Event: 2018_11_07_Woolsey_wildfire.2018 nDCG: 0.31769939055888796 Event: 2018_maryland_flood nDCG: 0.3458532632045162 Event: 2018_pittsburgh_synagogue_shooting nDCG: 0.8796077371132954 Event: 2019_03_01_alberta_wildfire.2019.v2 nDCG: 0.3567178481823365 Event: 2019_08_25_hurricane_dorian.2019 nDCG: 0.35378918970888484 Event: 2019_10_10_saddleridge_wildfire.2019 nDCG: 0.4265909143894533 Event: 2019_10_25_kincade_wildfire.2019 nDCG: 0.6245649491090363 Event: 2019_durham_gas_explosion nDCG: 0.2920285260422452 Event: 2019_saugus_high_school_shooting nDCG: 0.28266031580255163 Event: 2019_townsville_flood nDCG: 0.6049019612679973 Event: 2020_easter_tornado_outbreak nDCG: 0.40805012915193695 Event: 2020_tornado_outbreak_of_april nDCG: 0.5689650450614729 Event: 2020_tornado_outbreak_of_march nDCG: 0.19530535717066963 Event: 2020_visakhapatnam_gas_leak nDCG: 0.5444961776633392 Event: tornado_outbreak_of_november_30_december_2018 nDCG: 0.7454005185324978 System Event-Micro nDCG: 0.42145065708033713
# --------------------------------------------------
# TREC-IS 2021-A
# Information Type Categorization
# Overall performance
# --------------------------------------------------
# Average performance over information types
# Macro averaged (information types have equal weight)
# Does not average across events (larger events have more impact)
# Positive class is the target class
# Precision, recall and F1 only consider the positive class
# Accuracy is an overall metric
# We report performance for all categories, high importance categories and low importance categories
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
avgPrecision = 0.0
avgRecall = 0.0
avgF1 = 0.0
avgAccuracy = 0.0
avgPrecisionHigh = 0.0
avgRecallHigh = 0.0
avgF1High = 0.0
avgAccuracyHigh = 0.0
avgPrecisionLow = 0.0
avgRecallLow = 0.0
avgF1Low = 0.0
avgAccuracyLow = 0.0
for categoryId in informationTypes2Index.keys():
categoryPrecision = precision_score(category2GroundTruth[categoryId], category2Predicted[categoryId], average='binary')
categoryRecall = recall_score(category2GroundTruth[categoryId], category2Predicted[categoryId], average='binary')
categoryF1 = f1_score(category2GroundTruth[categoryId], category2Predicted[categoryId], average='binary')
categoryAccuracy = accuracy_score(category2GroundTruth[categoryId], category2Predicted[categoryId])
avgPrecision = avgPrecision + categoryPrecision
avgRecall = avgRecall + categoryRecall
avgF1 = avgF1 + categoryF1
avgAccuracy = avgAccuracy + categoryAccuracy
if any(categoryId in s for s in highImportCategories):
avgPrecisionHigh = avgPrecisionHigh + categoryPrecision
avgRecallHigh = avgRecallHigh + categoryRecall
avgF1High = avgF1High + categoryF1
avgAccuracyHigh = avgAccuracyHigh + categoryAccuracy
else:
avgPrecisionLow = avgPrecisionLow + categoryPrecision
avgRecallLow = avgRecallLow + categoryRecall
avgF1Low = avgF1Low + categoryF1
avgAccuracyLow = avgAccuracyLow + categoryAccuracy
numInformationTypes = len(informationTypes2Index)
numHighInformationTypes = len(highImportCategories)
numLowInformationTypes = numInformationTypes - numHighInformationTypes
print("Information Type Precision (positive class, multi-type, macro): "+str(avgPrecision/numInformationTypes))
print("Information Type Recall (positive class, multi-type, macro): "+str(avgRecall/numInformationTypes))
print("Information Type F1 (positive class, multi-type, macro): "+str(avgF1/numInformationTypes))
print("Information Type Accuracy (overall, multi-type, macro): "+str(avgAccuracy/numInformationTypes))
print("High Importance Information Type Precision (positive class, multi-type, macro): "+str(avgPrecisionHigh/numHighInformationTypes))
print("High Importance Information Type Recall (positive class, multi-type, macro): "+str(avgRecallHigh/numHighInformationTypes))
print("High Importance Information Type F1 (positive class, multi-type, macro): "+str(avgF1High/numHighInformationTypes))
print("High Importance Information Type Accuracy (overall, multi-type, macro): "+str(avgAccuracyHigh/numHighInformationTypes))
print("Low Importance Information Type Precision (positive class, multi-type, macro): "+str(avgPrecisionLow/numLowInformationTypes))
print("Low Importance Information Type Recall (positive class, multi-type, macro): "+str(avgRecallLow/numLowInformationTypes))
print("Low Importance Information Type F1 (positive class, multi-type, macro): "+str(avgF1Low/numLowInformationTypes))
print("Low Importance Information Type Accuracy (overall, multi-type, macro): "+str(avgAccuracyLow/numLowInformationTypes))
resultsFile.write("--------------------------------------------------"+"\n")
resultsFile.write("EVALUATON: Information Type Categorization"+"\n")
resultsFile.write("Overall performance"+"\n")
resultsFile.write("--------------------------------------------------"+"\n")
resultsFile.write("> Information Type Precision (positive class, multi-type, macro):"+"\t"+str(avgPrecision/len(informationTypes2Index))+"\n")
resultsFile.write("> Information Type Recall (positive class, multi-type, macro):"+"\t"+str(avgRecall/len(informationTypes2Index))+"\n")
resultsFile.write("> Information Type F1 (positive class, multi-type, macro):"+"\t"+str(avgF1/len(informationTypes2Index))+"\n")
resultsFile.write("> Information Type Accuracy (overall, multi-type, macro):"+"\t"+str(avgAccuracy/len(informationTypes2Index))+"\n")
resultsFile.write("> High Importance Information Type Precision (positive class, multi-type, macro):"+"\t"+str(avgPrecisionHigh/numHighInformationTypes)+"\n")
resultsFile.write("> High Importance Information Type Recall (positive class, multi-type, macro):"+"\t"+str(avgRecallHigh/numHighInformationTypes)+"\n")
resultsFile.write("> High Importance Information Type F1 (positive class, multi-type, macro):"+"\t"+str(avgF1High/numHighInformationTypes)+"\n")
resultsFile.write("> High Importance Information Type Accuracy (overall, multi-type, macro):"+"\t"+str(avgAccuracyHigh/numHighInformationTypes)+"\n")
resultsFile.write("> Low Importance Information Type Precision (positive class, multi-type, macro):"+"\t"+str(avgPrecisionLow/numLowInformationTypes)+"\n")
resultsFile.write("> Low Importance Information Type Recall (positive class, multi-type, macro):"+"\t"+str(avgRecallLow/numLowInformationTypes)+"\n")
resultsFile.write("> Low Importance Information Type F1 (positive class, multi-type, macro):"+"\t"+str(avgF1Low/numLowInformationTypes)+"\n")
resultsFile.write("> Low Importance Information Type Accuracy (overall, multi-type, macro):"+"\t"+str(avgAccuracyLow/numLowInformationTypes)+"\n")
resultsFile.write(""+"\n")
Information Type Precision (positive class, multi-type, macro): 0.26161177246515116
Information Type Recall (positive class, multi-type, macro): 0.2935170483437286
Information Type F1 (positive class, multi-type, macro): 0.26004632897554286
Information Type Accuracy (overall, multi-type, macro): 0.8964102931432766
High Importance Information Type Precision (positive class, multi-type, macro): 0.23731530472421372
High Importance Information Type Recall (positive class, multi-type, macro): 0.1968659208249782
High Importance Information Type F1 (positive class, multi-type, macro): 0.20079776340090597
High Importance Information Type Accuracy (overall, multi-type, macro): 0.9615588942809902
Low Importance Information Type Precision (positive class, multi-type, macro): 0.2692843412254472
Low Importance Information Type Recall (positive class, multi-type, macro): 0.3240384570338603
Low Importance Information Type F1 (positive class, multi-type, macro): 0.27875640231490195
Low Importance Information Type Accuracy (overall, multi-type, macro): 0.8758370506787354
# --------------------------------------------------
# TREC-IS 2021-A
# Information Type Categorization
# Per Information Type Performance
# --------------------------------------------------
# Per Category Classification Performance with confusion matrices
# Performance on the target class is what we care about here,
# primarily with respect to recall, as we want the user to
# see all of the information for a given category. A small
# amount of noise being added to the feed is an acceptable
# cost for good recall.
#
# Does not average across events (larger events have more impact)
from sklearn.metrics import classification_report
perTopicFile.write("--------------------------------------------------"+"\n")
perTopicFile.write("EVALUATON: Information Type Categorization (Multi-type)"+"\n")
perTopicFile.write("Per Information Type Performance"+"\n")
perTopicFile.write("--------------------------------------------------"+"\n")
for categoryId in informationTypes2Index.keys():
target_names = ['Other Classes', categoryId]
try:
print(categoryId)
print(classification_report(category2GroundTruth[categoryId], category2Predicted[categoryId], target_names=target_names))
perTopicFile.write(categoryId+"\n")
perTopicFile.write(classification_report(category2GroundTruth[categoryId], category2Predicted[categoryId], target_names=target_names)+"\n")
perTopicFile.write(""+"\n")
except ValueError:
print("Category "+categoryId+" score calculation failed, likely due the category not being used by the run")
perTopicFile.write(""+"\n")
CallToAction-Donations precision recall f1-score support Other Classes 0.99 0.99 0.99 55275 CallToAction-Donations 0.30 0.47 0.37 568 accuracy 0.98 55843 macro avg 0.65 0.73 0.68 55843 weighted avg 0.99 0.98 0.99 55843 CallToAction-MovePeople precision recall f1-score support Other Classes 0.98 0.99 0.99 54646 CallToAction-MovePeople 0.42 0.24 0.31 1197 accuracy 0.98 55843 macro avg 0.70 0.62 0.65 55843 weighted avg 0.97 0.98 0.97 55843 CallToAction-Volunteer precision recall f1-score support Other Classes 1.00 1.00 1.00 55543 CallToAction-Volunteer 0.25 0.18 0.21 300 accuracy 0.99 55843 macro avg 0.62 0.59 0.60 55843 weighted avg 0.99 0.99 0.99 55843 Other-Advice precision recall f1-score support Other Classes 0.96 0.97 0.96 52602 Other-Advice 0.38 0.33 0.35 3241 accuracy 0.93 55843 macro avg 0.67 0.65 0.66 55843 weighted avg 0.93 0.93 0.93 55843 Other-ContextualInformation precision recall f1-score support Other Classes 0.97 0.95 0.96 54346 Other-ContextualInformation 0.02 0.04 0.03 1497 accuracy 0.93 55843 macro avg 0.50 0.50 0.50 55843 weighted avg 0.95 0.93 0.94 55843 Other-Discussion precision recall f1-score support Other Classes 0.99 0.95 0.97 55263 Other-Discussion 0.03 0.14 0.05 580 accuracy 0.95 55843 macro avg 0.51 0.55 0.51 55843 weighted avg 0.98 0.95 0.96 55843 Other-Irrelevant precision recall f1-score support Other Classes 0.53 0.82 0.64 23267 Other-Irrelevant 0.79 0.48 0.60 32576 accuracy 0.62 55843 macro avg 0.66 0.65 0.62 55843 weighted avg 0.68 0.62 0.62 55843 Other-Sentiment precision recall f1-score support Other Classes 0.94 0.93 0.94 51270 Other-Sentiment 0.31 0.34 0.32 4573 accuracy 0.88 55843 macro avg 0.62 0.63 0.63 55843 weighted avg 0.89 0.88 0.89 55843 Report-CleanUp precision recall f1-score support Other Classes 1.00 0.99 0.99 55581 Report-CleanUp 0.13 0.30 0.18 262 accuracy 0.99 55843 macro avg 0.56 0.65 0.59 55843 weighted avg 0.99 0.99 0.99 55843 Report-EmergingThreats precision recall f1-score support Other Classes 0.96 0.92 0.94 52454 Report-EmergingThreats 0.23 0.36 0.28 3389 accuracy 0.89 55843 macro avg 0.59 0.64 0.61 55843 weighted avg 0.91 0.89 0.90 55843 Report-Factoid precision recall f1-score support Other Classes 0.93 0.94 0.94 49844 Report-Factoid 0.48 0.45 0.46 5999 accuracy 0.89 55843 macro avg 0.71 0.69 0.70 55843 weighted avg 0.88 0.89 0.89 55843 Report-FirstPartyObservation precision recall f1-score support Other Classes 0.97 0.96 0.97 54135 Report-FirstPartyObservation 0.10 0.14 0.11 1708 accuracy 0.93 55843 macro avg 0.53 0.55 0.54 55843 weighted avg 0.95 0.93 0.94 55843 Report-Hashtags precision recall f1-score support Other Classes 0.89 0.89 0.89 48407 Report-Hashtags 0.31 0.32 0.31 7436 accuracy 0.81 55843 macro avg 0.60 0.60 0.60 55843 weighted avg 0.82 0.81 0.82 55843 Report-Location precision recall f1-score support Other Classes 0.84 0.74 0.78 41325 Report-Location 0.44 0.59 0.50 14518 accuracy 0.70 55843 macro avg 0.64 0.66 0.64 55843 weighted avg 0.73 0.70 0.71 55843 Report-MultimediaShare precision recall f1-score support Other Classes 0.92 0.75 0.83 48784 Report-MultimediaShare 0.24 0.54 0.33 7059 accuracy 0.73 55843 macro avg 0.58 0.65 0.58 55843 weighted avg 0.83 0.73 0.77 55843 Report-News precision recall f1-score support Other Classes 0.93 0.82 0.87 50324 Report-News 0.23 0.47 0.31 5519 accuracy 0.79 55843 macro avg 0.58 0.65 0.59 55843 weighted avg 0.86 0.79 0.82 55843 Report-NewSubEvent precision recall f1-score support Other Classes 0.98 0.98 0.98 54728 Report-NewSubEvent 0.05 0.07 0.06 1115 accuracy 0.96 55843 macro 
avg 0.52 0.52 0.52 55843 weighted avg 0.96 0.96 0.96 55843 Report-Official precision recall f1-score support Other Classes 0.96 0.98 0.97 53203 Report-Official 0.20 0.11 0.14 2640 accuracy 0.94 55843 macro avg 0.58 0.54 0.55 55843 weighted avg 0.92 0.94 0.93 55843 Report-OriginalEvent precision recall f1-score support Other Classes 0.95 0.98 0.97 52838 Report-OriginalEvent 0.10 0.03 0.04 3005 accuracy 0.93 55843 macro avg 0.52 0.51 0.50 55843 weighted avg 0.90 0.93 0.92 55843 Report-ServiceAvailable precision recall f1-score support Other Classes 0.97 0.98 0.98 53834 Report-ServiceAvailable 0.40 0.32 0.35 2009 accuracy 0.96 55843 macro avg 0.68 0.65 0.66 55843 weighted avg 0.95 0.96 0.96 55843 Report-ThirdPartyObservation precision recall f1-score support Other Classes 0.91 0.84 0.87 50379 Report-ThirdPartyObservation 0.16 0.28 0.20 5464 accuracy 0.78 55843 macro avg 0.54 0.56 0.54 55843 weighted avg 0.84 0.78 0.81 55843 Report-Weather precision recall f1-score support Other Classes 0.97 0.90 0.93 50824 Report-Weather 0.41 0.70 0.52 5019 accuracy 0.88 55843 macro avg 0.69 0.80 0.73 55843 weighted avg 0.92 0.88 0.90 55843 Request-GoodsServices precision recall f1-score support Other Classes 0.99 1.00 1.00 55452 Request-GoodsServices 0.23 0.05 0.09 391 accuracy 0.99 55843 macro avg 0.61 0.53 0.54 55843 weighted avg 0.99 0.99 0.99 55843 Request-InformationWanted precision recall f1-score support Other Classes 0.99 0.99 0.99 55241 Request-InformationWanted 0.26 0.26 0.26 602 accuracy 0.98 55843 macro avg 0.63 0.63 0.63 55843 weighted avg 0.98 0.98 0.98 55843 Request-SearchAndRescue precision recall f1-score support Other Classes 1.00 1.00 1.00 55737 Request-SearchAndRescue 0.10 0.14 0.12 106 accuracy 1.00 55843 macro avg 0.55 0.57 0.56 55843 weighted avg 1.00 1.00 1.00 55843
# --------------------------------------------------
# TREC-IS 2021-A
# Information Type Categorization
# Per Information Type F1 Graph
# --------------------------------------------------
# Per Category Classification Performance
# F1 scores for each information type, graphed
# Does not average across events (larger events have more impact)
N = len(informationTypes2Index)
ind = np.arange(N)
scoresPerCategoryF1 = []
categoryLabels = []
for categoryId in informationTypes2Index.keys():
localF1Score = f1_score(category2GroundTruth[categoryId], category2Predicted[categoryId], average='binary')
print(categoryId, localF1Score)
scoresPerCategoryF1.append(localF1Score)
categoryLabels.append(categoryId)
width = 0.90 # the width of the bars: can also be len(x) sequence
p1 = plt.bar(ind, scoresPerCategoryF1, width)
plt.ylabel('F1 Scores')
plt.title('F1 Scores by Information Type')
plt.xticks(ind, categoryLabels, rotation='vertical')
plt.yticks(np.arange(0, 1, 0.1))
plt.show()
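# Optional (not in the original notebook): to keep a copy of this figure next
# to the text result files, one could call plt.savefig with the same naming
# convention before plt.show(), e.g.:
# plt.savefig(runName + ".results.v" + str(version) + "." + edition + ".f1_by_type.png", bbox_inches="tight")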
CallToAction-Donations 0.36598639455782306
CallToAction-MovePeople 0.3075302790942601
CallToAction-Volunteer 0.20662768031189085
Other-Advice 0.35401157981803144
Other-ContextualInformation 0.031118794764139292
Other-Discussion 0.05097917314267951
Other-Irrelevant 0.5961237648315593
Other-Sentiment 0.32074484778742546
Report-CleanUp 0.18036529680365296
Report-EmergingThreats 0.28058877644894203
Report-Factoid 0.46172434625010783
Report-FirstPartyObservation 0.11279097672186224
Report-Hashtags 0.3121103594641199
Report-Location 0.5031787143866259
Report-MultimediaShare 0.3339574171637567
Report-News 0.30569524032276923
Report-NewSubEvent 0.05973813420621931
Report-Official 0.1371007371007371
Report-OriginalEvent 0.04454685099846391
Report-ServiceAvailable 0.351575456053068
Report-ThirdPartyObservation 0.19955127359113106
Report-Weather 0.5173819583425988
Request-GoodsServices 0.08677685950413222
Request-InformationWanted 0.2623762376237624
Request-SearchAndRescue 0.11857707509881422
# --------------------------------------------------
# TREC-IS 2021-A
# Information Type Categorization
# Per Event Performance
# --------------------------------------------------
# Categorization performance for each event
# Precision, recall and F1 only consider the positive class
# Accuracy is an overall metric
# We report performance for all categories, high importance categories and low importance categories
# Macro average (categories have equal weight)
perEventFile.write("--------------------------------------------------"+"\n")
perEventFile.write("EVALUATON: Information Type Categorization (Multi-type)"+"\n")
perEventFile.write("Per Event Performance"+"\n")
perEventFile.write("--------------------------------------------------"+"\n")
for eventId in eventIdentifiers:
tavgPrecision = 0.0
tavgRecall = 0.0
tavgF1 = 0.0
tavgAccuracy = 0.0
categoryCount = 0
for categoryId in informationTypes2Index.keys():
if sum(event2groundtruth[eventId].get(categoryId)) == 0:
continue
categoryPrecision = precision_score(event2groundtruth[eventId].get(categoryId), event2prediction[eventId].get(categoryId), average='binary')
categoryRecall = recall_score(event2groundtruth[eventId].get(categoryId), event2prediction[eventId].get(categoryId), average='binary')
categoryF1 = f1_score(event2groundtruth[eventId].get(categoryId), event2prediction[eventId].get(categoryId), average='binary')
categoryAccuracy = accuracy_score(event2groundtruth[eventId].get(categoryId), event2prediction[eventId].get(categoryId))
tavgPrecision = tavgPrecision + categoryPrecision
tavgRecall = tavgRecall + categoryRecall
tavgF1 = tavgF1 + categoryF1
tavgAccuracy = tavgAccuracy + categoryAccuracy
categoryCount += 1
if categoryCount == 0:
print("No categories for event:", eventId)
continue
print(eventId)
print(" Information Type Precision (positive class, multi-type, macro): "+str(tavgPrecision/categoryCount))
print(" Information Type Recall (positive class, multi-type, macro): "+str(tavgRecall/categoryCount))
print(" Information Type F1 (positive class, multi-type, macro): "+str(tavgF1/categoryCount))
print(" Information Type Accuracy (overall, multi-type, macro): "+str(tavgAccuracy/categoryCount))
print("")
perEventFile.write(eventId+"\n")
perEventFile.write(" Information Type Precision (positive class, multi-type, macro): "+str(tavgPrecision/len(informationTypes2Index))+"\n")
perEventFile.write(" Information Type Recall (positive class, multi-type, macro): "+str(tavgRecall/len(informationTypes2Index))+"\n")
perEventFile.write(" Information Type F1 (positive class, multi-type, macro): "+str(tavgF1/len(informationTypes2Index))+"\n")
perEventFile.write(" Information Type Accuracy (overall, multi-type, macro): "+str(tavgAccuracy/len(informationTypes2Index))+"\n")
perEventFile.write("\n")
perEventFile.write("\n")
2020_01_27_houston_explosion.2020 Information Type Precision (positive class, multi-type, macro): 0.204055537406687 Information Type Recall (positive class, multi-type, macro): 0.3210144716177857 Information Type F1 (positive class, multi-type, macro): 0.18936419659538598 Information Type Accuracy (overall, multi-type, macro): 0.8869565217391304 2020_02_10_mideast_tornadoes.day1_mississipi.2020 Information Type Precision (positive class, multi-type, macro): 0.49828467432272816 Information Type Recall (positive class, multi-type, macro): 0.4532983008399848 Information Type F1 (positive class, multi-type, macro): 0.4467533682971848 Information Type Accuracy (overall, multi-type, macro): 0.8548654244306418 2020_02_10_mideast_tornadoes.day2_al.2020 Information Type Precision (positive class, multi-type, macro): 0.23451920821025266 Information Type Recall (positive class, multi-type, macro): 0.37318032667847345 Information Type F1 (positive class, multi-type, macro): 0.24924511371677371 Information Type Accuracy (overall, multi-type, macro): 0.9045637922339888 2020_02_10_mideast_tornadoes.day3_md.2019 Information Type Precision (positive class, multi-type, macro): 0.14021368058070272 Information Type Recall (positive class, multi-type, macro): 0.3578818927650236 Information Type F1 (positive class, multi-type, macro): 0.14866488671948233 Information Type Accuracy (overall, multi-type, macro): 0.8597727272727271 2020_05_06_tn_derecho.2020 Information Type Precision (positive class, multi-type, macro): 0.262083412087746 Information Type Recall (positive class, multi-type, macro): 0.28743792607418245 Information Type F1 (positive class, multi-type, macro): 0.22830622456979008 Information Type Accuracy (overall, multi-type, macro): 0.8869730123997084 brooklynblockparty_shooting.2019 Information Type Precision (positive class, multi-type, macro): 0.1849548140674396 Information Type Recall (positive class, multi-type, macro): 0.5215523539288426 Information Type F1 (positive class, multi-type, macro): 0.1994132458779106 Information Type Accuracy (overall, multi-type, macro): 0.904967387846007
/Users/cbuntain/Development/anaconda3/lib/python3.8/site-packages/sklearn/metrics/_classification.py:1245: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 due to no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result))
2016_puttingal_temple Information Type Precision (positive class, multi-type, macro): 0.19200162589055284 Information Type Recall (positive class, multi-type, macro): 0.17840512683604223 Information Type F1 (positive class, multi-type, macro): 0.15080794483820356 Information Type Accuracy (overall, multi-type, macro): 0.8997964228588046 2017_12_04_thomas_wildfire.2017 Information Type Precision (positive class, multi-type, macro): 0.29053720105892455 Information Type Recall (positive class, multi-type, macro): 0.26377421260195727 Information Type F1 (positive class, multi-type, macro): 0.2647530233269965 Information Type Accuracy (overall, multi-type, macro): 0.8708711493872141 2017_12_07_lilac_wildfire.2017 Information Type Precision (positive class, multi-type, macro): 0.3315237301602523 Information Type Recall (positive class, multi-type, macro): 0.30947258218054563 Information Type F1 (positive class, multi-type, macro): 0.28202537154286006 Information Type Accuracy (overall, multi-type, macro): 0.8773770491803279 2018_07_23_klamathon_wildfire.2018 Information Type Precision (positive class, multi-type, macro): 0.37602443169909827 Information Type Recall (positive class, multi-type, macro): 0.28500461329767446 Information Type F1 (positive class, multi-type, macro): 0.287830172745737 Information Type Accuracy (overall, multi-type, macro): 0.8758823367848608 2018_08_05_holy_wildfire.2018 Information Type Precision (positive class, multi-type, macro): 0.1442742018538389 Information Type Recall (positive class, multi-type, macro): 0.315235194655036 Information Type F1 (positive class, multi-type, macro): 0.1604442001757343 Information Type Accuracy (overall, multi-type, macro): 0.9248314737331474 2018_11_07_Woolsey_wildfire.2018 Information Type Precision (positive class, multi-type, macro): 0.14994282044555063 Information Type Recall (positive class, multi-type, macro): 0.18077980347727104 Information Type F1 (positive class, multi-type, macro): 0.14409704852065874 Information Type Accuracy (overall, multi-type, macro): 0.889176235302231 2018_maryland_flood Information Type Precision (positive class, multi-type, macro): 0.2890599571849312 Information Type Recall (positive class, multi-type, macro): 0.35828142264177787 Information Type F1 (positive class, multi-type, macro): 0.2889363265039347 Information Type Accuracy (overall, multi-type, macro): 0.870989255279733 2018_pittsburgh_synagogue_shooting Information Type Precision (positive class, multi-type, macro): 0.38269796573291887 Information Type Recall (positive class, multi-type, macro): 0.37861119945077965 Information Type F1 (positive class, multi-type, macro): 0.36519659440551944 Information Type Accuracy (overall, multi-type, macro): 0.7585470085470085
2019_03_01_alberta_wildfire.2019.v2 Information Type Precision (positive class, multi-type, macro): 0.09654626897370239 Information Type Recall (positive class, multi-type, macro): 0.21004749711822604 Information Type F1 (positive class, multi-type, macro): 0.06646235226323287 Information Type Accuracy (overall, multi-type, macro): 0.8445305770887166 2019_08_25_hurricane_dorian.2019 Information Type Precision (positive class, multi-type, macro): 0.24359516376996557 Information Type Recall (positive class, multi-type, macro): 0.21756261164054613 Information Type F1 (positive class, multi-type, macro): 0.1583621699056024 Information Type Accuracy (overall, multi-type, macro): 0.872609776304888
2019_10_10_saddleridge_wildfire.2019 Information Type Precision (positive class, multi-type, macro): 0.2402787803843353 Information Type Recall (positive class, multi-type, macro): 0.19828408163863132 Information Type F1 (positive class, multi-type, macro): 0.20236040475521797 Information Type Accuracy (overall, multi-type, macro): 0.9177093108122198 2019_10_25_kincade_wildfire.2019 Information Type Precision (positive class, multi-type, macro): 0.30803828740069045 Information Type Recall (positive class, multi-type, macro): 0.32259806643518274 Information Type F1 (positive class, multi-type, macro): 0.2922787310396954 Information Type Accuracy (overall, multi-type, macro): 0.9021026626406808 2019_durham_gas_explosion Information Type Precision (positive class, multi-type, macro): 0.2708436360893993 Information Type Recall (positive class, multi-type, macro): 0.3422003180855385 Information Type F1 (positive class, multi-type, macro): 0.2645028770611925 Information Type Accuracy (overall, multi-type, macro): 0.8879892246717861
2019_saugus_high_school_shooting
    Information Type Precision (positive class, multi-type, macro): 0.21224345239548537
    Information Type Recall (positive class, multi-type, macro): 0.23083768041753483
    Information Type F1 (positive class, multi-type, macro): 0.19356833564227258
    Information Type Accuracy (overall, multi-type, macro): 0.8977586351091413
2019_townsville_flood
    Information Type Precision (positive class, multi-type, macro): 0.31108754584759996
    Information Type Recall (positive class, multi-type, macro): 0.27241188523193804
    Information Type F1 (positive class, multi-type, macro): 0.26379223905651566
    Information Type Accuracy (overall, multi-type, macro): 0.8858574016838962
2020_easter_tornado_outbreak
    Information Type Precision (positive class, multi-type, macro): 0.16676434206145527
    Information Type Recall (positive class, multi-type, macro): 0.3398823874232521
    Information Type F1 (positive class, multi-type, macro): 0.19246864981825695
    Information Type Accuracy (overall, multi-type, macro): 0.8839572806614605
2020_tornado_outbreak_of_april
    Information Type Precision (positive class, multi-type, macro): 0.2798747139319698
    Information Type Recall (positive class, multi-type, macro): 0.3218971770439659
    Information Type F1 (positive class, multi-type, macro): 0.25272213678559763
    Information Type Accuracy (overall, multi-type, macro): 0.8819058754750074
2020_tornado_outbreak_of_march
    Information Type Precision (positive class, multi-type, macro): 0.18580102459752004
    Information Type Recall (positive class, multi-type, macro): 0.379194089171432
    Information Type F1 (positive class, multi-type, macro): 0.1810617461388319
    Information Type Accuracy (overall, multi-type, macro): 0.848917244665643
2020_visakhapatnam_gas_leak
    Information Type Precision (positive class, multi-type, macro): 0.2977215302267494
    Information Type Recall (positive class, multi-type, macro): 0.1792907700024519
    Information Type F1 (positive class, multi-type, macro): 0.17834116430123884
    Information Type Accuracy (overall, multi-type, macro): 0.8529924002533248
tornado_outbreak_of_november_30_december_2018
    Information Type Precision (positive class, multi-type, macro): 0.24095391722934387
    Information Type Recall (positive class, multi-type, macro): 0.38005881972560546
    Information Type F1 (positive class, multi-type, macro): 0.22856278723094206
    Information Type Accuracy (overall, multi-type, macro): 0.9002740881298755
# --------------------------------------------------
# TREC-IS 2021-B
# Information Type Categorization
# Per Event F1 Graph
# --------------------------------------------------
# Multi-type (1 vs All): Tweets have multiple information types, aim: predict all of them
# Macro average (categories have equal weight)
N = len(eventIdentifiers)
ind = np.arange(N)
scoresPerEventF1 = []
for eventId in eventIdentifiers:
    avgF1_ = 0.0
    for categoryId in informationTypes2Index.keys():
        avgF1_ = avgF1_ + f1_score(event2groundtruth[eventId].get(categoryId), event2prediction[eventId].get(categoryId), average='binary')
    scoresPerEventF1.append(avgF1_/len(informationTypes2Index))
width = 0.90 # the width of the bars: can also be len(x) sequence
p1 = plt.bar(ind, scoresPerEventF1, width)
plt.ylabel('F1 Scores')
plt.title('F1 Category Scores by Event')
plt.xticks(ind, eventIdentifiers, rotation='vertical')
plt.yticks(np.arange(0, 1, 0.1))
plt.show()
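When a category has no positive ground-truth or predicted tweets for an event, sklearn's f1_score is ill-defined and silently falls back to 0.0, which is where the repeated UndefinedMetricWarning messages in the original output came from. A minimal sketch (not part of the official script) of making that fallback explicit and silencing the warnings via the zero_division parameter, assuming the same eventIdentifiers, event2groundtruth, and event2prediction structures used above:

from sklearn.metrics import f1_score

# Sketch only: mirrors the per-event loop above; zero_division=0 returns 0.0 instead of
# warning when a category has no positive samples, so the averages are unchanged.
scoresPerEventF1 = []
for eventId in eventIdentifiers:
    perCategoryF1 = [
        f1_score(
            event2groundtruth[eventId].get(categoryId),
            event2prediction[eventId].get(categoryId),
            average='binary',
            zero_division=0,
        )
        for categoryId in informationTypes2Index.keys()
    ]
    scoresPerEventF1.append(sum(perCategoryF1) / len(perCategoryF1))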
# --------------------------------------------------
# TREC-IS 2021-B
# Information Priority Level
# Overall Performance
# --------------------------------------------------
# How divergent is the system from the human priority labels?
# F1 performance over information types, higher is better
# Macro average (categories have equal weight)
from sklearn.metrics import mean_squared_error
priorityAvgf1 = 0.0
priorityAvgf1High = 0.0
priorityAvgf1Low = 0.0
for categoryId in informationTypes2Index.keys():
    groundTruthPriorities = category2GroundTruthPriority[categoryId]
    predictedPriorities = category2PredictedPriority[categoryId]
    f1 = f1_score(groundTruthPriorities, predictedPriorities, average='macro')
    priorityAvgf1 = priorityAvgf1 + f1
    if any(categoryId in s for s in highImportCategories):
        priorityAvgf1High = priorityAvgf1High + f1
    else:
        priorityAvgf1Low = priorityAvgf1Low + f1
print("Priority Label Prediction (F1, macro): "+str(priorityAvgf1/len(informationTypes2Index)))
resultsFile.write("--------------------------------------------------"+"\n")
resultsFile.write("EVALUATION: Information Priority Level"+"\n")
resultsFile.write("Overall Performance"+"\n")
resultsFile.write("--------------------------------------------------"+"\n")
resultsFile.write("> Priority Label Prediction (F1, macro): "+str(priorityAvgf1/len(informationTypes2Index))+"\n")
resultsFile.write("\n")
Priority Label Prediction (F1, macro): 0.22878279179769898
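Note that the high/low split above relies on a substring test (any(categoryId in s for s in highImportCategories)) rather than direct membership; because every entry of highImportCategories is itself a full category name, the two behave identically for this label set. A quick sanity check (a sketch, not part of the official script) using the taskCategories and highImportCategories lists defined at the top of the notebook:

# Sketch only: list the categories counted toward the "high importance" averages
highGroup = [c for c in taskCategories if any(c in s for s in highImportCategories)]
lowGroup = [c for c in taskCategories if c not in highGroup]
print(len(highGroup), "high-importance categories:", highGroup)
print(len(lowGroup), "remaining categories")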
# --------------------------------------------------
# TREC-IS 2021-B
# Information Priority Score
# Correlational Performance
# --------------------------------------------------
# How divergent is the system from the human priority labels?
# Use Pearson correlation here to capture parallel increases
priorityAvgCorr = 0.0
priorityAvgCorrHigh = 0.0
priorityAvgCorrLow = 0.0
for categoryId in informationTypes2Index.keys():
    if categoryId == "Other-Irrelevant":
        continue
    groundTruthPriorities = [priorityScoreMap[x] for x in category2GroundTruthPriority[categoryId]]
    predictedPriorities = category2PredictedPriorityScore[categoryId]
    # Pathological case: Pearson correlation is undefined when the predicted scores have
    # no variation, so fall back to 0.0 for this category
    this_corr = 0.0
    if np.std(predictedPriorities) > 0.0:
        this_corr = np.corrcoef(groundTruthPriorities, predictedPriorities)[0,1]
    priorityAvgCorr = priorityAvgCorr + this_corr
    if any(categoryId in s for s in highImportCategories):
        priorityAvgCorrHigh = priorityAvgCorrHigh + this_corr
    else:
        priorityAvgCorrLow = priorityAvgCorrLow + this_corr
print("Priority Score Prediction (Pearson): "+str(priorityAvgCorr/(len(informationTypes2Index)-1)))
print("Priority Score Prediction, High (Pearson): "+str(priorityAvgCorrHigh/numHighInformationTypes))
print("Priority Score Prediction, Low (Pearson): "+str(priorityAvgCorrLow/(numLowInformationTypes-1)))
resultsFile.write("--------------------------------------------------"+"\n")
resultsFile.write("EVALUATION: Information Priority Score"+"\n")
resultsFile.write("Correlational Performance"+"\n")
resultsFile.write("--------------------------------------------------"+"\n")
resultsFile.write("> Priority Correlation (Pearson): "+str(priorityAvgCorr/(len(informationTypes2Index)-1))+"\n")
resultsFile.write("> Priority Correlation, High (Pearson): "+str(priorityAvgCorrHigh/numHighInformationTypes)+"\n")
resultsFile.write("> Priority Correlation, Low (Pearson): "+str(priorityAvgCorrLow/(numLowInformationTypes-1))+"\n")
resultsFile.write("\n")
Priority Score Prediction (Pearson): 0.12757159682294006
Priority Score Prediction, High (Pearson): 0.10584417475436199
Priority Score Prediction, Low (Pearson): 0.13481407084579938
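As a toy illustration of the correlation step above (hypothetical labels and scores, not taken from any run): ground-truth priority labels are first mapped to numeric values through the priorityScoreMap defined at the top of the notebook and then correlated against the run's predicted priority scores.

import numpy as np

# Hypothetical example values, purely for illustration
gt_labels = ["Critical", "High", "Low", "Medium", "Low"]
pred_scores = [0.9, 0.6, 0.3, 0.5, 0.1]

gt_scores = [priorityScoreMap[x] for x in gt_labels]
if np.std(pred_scores) > 0.0:  # guard against constant predictions, as in the cell above
    print(np.corrcoef(gt_scores, pred_scores)[0, 1])  # Pearson r, higher is better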
# --------------------------------------------------
# TREC-IS 2021-B
# Information Priority Level
# Per Information Type Performance
# --------------------------------------------------
# F1 per information type (macro averaged), higher is better
# Macro average (categories have equal weight)
N = len(informationTypes2Index)
ind = np.arange(N)
priorityCatF1Values = []
categoryLabels = []
for categoryId in informationTypes2Index.keys():
    groundTruthPriorities = category2GroundTruthPriority[categoryId]
    predictedPriorities = category2PredictedPriority[categoryId]
    priorityCatF1 = f1_score(groundTruthPriorities, predictedPriorities, average='macro')
    if math.isnan(priorityCatF1):
        priorityCatF1 = 0.0
    categoryLabels.append(categoryId)
    priorityCatF1Values.append(priorityCatF1)
width = 0.90 # the width of the bars: can also be len(x) sequence
p1 = plt.bar(ind, priorityCatF1Values, width)
plt.ylabel('Priority Label Prediction F1 (higher is better)')
plt.title('Priority Label Prediction F1 Per Information Type')
plt.xticks(ind, categoryLabels, rotation='vertical')
plt.yticks(np.arange(0, 1, 0.1))
plt.show()
resultLine = None
# Print the evaluation table row in latex
print("Run & NDCG & CF1-H & CF1-A & CAcc & PErr-H & PErr-A & PCorr-H & PCorr-A \\\\")
# Note: PCorr-A below averages over all categories (len(informationTypes2Index)),
# whereas the Pearson average printed above excludes Other-Irrelevant
resultLine = (str.format('{0:.4f}', system_ndcg_micro)+
              " & "+
              str.format('{0:.4f}', avgF1High/numHighInformationTypes)+
              " & "+
              str.format('{0:.4f}', avgF1/numInformationTypes)+
              " & "+
              str.format('{0:.4f}', avgAccuracy/numInformationTypes)+
              " & "+
              str.format('{0:.4f}', priorityAvgf1High/numHighInformationTypes)+
              " & "+
              str.format('{0:.4f}', priorityAvgf1/len(informationTypes2Index))+
              " & "+
              str.format('{0:.4f}', priorityAvgCorrHigh/numHighInformationTypes)+
              " & "+
              str.format('{0:.4f}', priorityAvgCorr/len(informationTypes2Index))+
              " \\\\")
print(runName+" & "+resultLine)
resultsFile.write("--------------------------------------------------"+"\n")
resultsFile.write("LATEX"+"\n")
resultsFile.write("--------------------------------------------------"+"\n")
resultsFile.write(runName+" & "+resultLine + "\n")
Run & NDCG & CF1-H & CF1-A & CAcc & PErr-H & PErr-A & PCorr-H & PCorr-A \\
njit_label_prop & 0.4215 & 0.2008 & 0.2600 & 0.8964 & 0.2268 & 0.2288 & 0.1058 & 0.1225 \\
# Done
resultsFile.close()
perTopicFile.close()
perEventFile.close()
# header = [
# "Run",
# "date",
# "team",
# "description",
# "paper",
# "code",
# "nDCG@100",
# "Info-Type F1 [Actionable]",
# "Info-Type F1 [All]",
# "Info-Type Accuracy",
# "Priority F1 [Actionable]",
# "Priority F1 [All]",
# "Priority R [Actionable]",
# "Priority R [All]",
# ]
import csv
import json  # json is likely imported earlier in the notebook; repeated here so this cell stands alone
if os.path.isfile("metadata.json"):
    this_cwd = os.getcwd()
    # Derive the submission date (YYYY/MM/DD) from the "submissions/<YYYYMMDD>-..." path component
    sub_date_ = this_cwd.partition("submissions/")[-1].partition("-")[0]
    sub_date = "%s/%s/%s" % (sub_date_[:4], sub_date_[4:6], sub_date_[6:])
    leaderboard_entry = None
    with open("metadata.json", "r") as in_file:
        metadata = json.load(in_file)
    leaderboard_entry = [
        runName,
        sub_date,
        metadata["organization"].lower(),
        metadata["model_description"],
        metadata["paper"] if metadata["paper"].startswith("http") else "",
        metadata["code"] if metadata["code"].startswith("http") else "",
        str.format('{0:.4f}', system_ndcg_micro),
        str.format('{0:.4f}', avgF1High/numHighInformationTypes),
        str.format('{0:.4f}', avgF1/numInformationTypes),
        str.format('{0:.4f}', avgAccuracy/numInformationTypes),
        str.format('{0:.4f}', priorityAvgf1High/numHighInformationTypes),
        str.format('{0:.4f}', priorityAvgf1/len(informationTypes2Index)),
        str.format('{0:.4f}', priorityAvgCorrHigh/numHighInformationTypes),
        str.format('{0:.4f}', priorityAvgCorr/len(informationTypes2Index)),
    ]
    with open(runName+".v"+str(version)+"."+edition+".leaderboard.csv", "w") as csvResultsFile:
        leader_writer = csv.writer(csvResultsFile)
        leader_writer.writerow(leaderboard_entry)
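Each run therefore leaves behind a single-row <runName>.v<version>.<edition>.leaderboard.csv file with no header row. A sketch (not part of the official script) of gathering those per-run rows into one table, reusing the commented-out header above; the output name leaderboard.all.csv is just a placeholder:

import csv
import glob

# Hypothetical aggregation of the single-row leaderboard files written above
header = [
    "Run", "date", "team", "description", "paper", "code",
    "nDCG@100",
    "Info-Type F1 [Actionable]", "Info-Type F1 [All]", "Info-Type Accuracy",
    "Priority F1 [Actionable]", "Priority F1 [All]",
    "Priority R [Actionable]", "Priority R [All]",
]
rows = []
for path in sorted(glob.glob("*.leaderboard.csv")):
    with open(path, newline="") as f:
        rows.extend(csv.reader(f))
with open("leaderboard.all.csv", "w", newline="") as out_file:
    writer = csv.writer(out_file)
    writer.writerow(header)
    writer.writerows(rows)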