# --------------------------------------------------
# TREC IS 2021b Evaluation Script
# Configured for 2021-B Events
# Used to evaluate TREC-IS runs
# --------------------------------------------------
version = 3.0 # Notebook Version Number
edition = "2021b.all"
import os
cwd = os.getcwd()
# Configuration Information
# Do we try to normalize the run priority scores?
enablePriorityNorm = True
# Do we try to normalize the category scores?
enableCategoryNorm = True
# Default score threshold
defaultScoreThreshold = 0.5
taskCategories = [
"CallToAction-Donations",
"CallToAction-MovePeople",
"CallToAction-Volunteer",
"Other-Advice",
"Other-ContextualInformation",
"Other-Discussion",
"Other-Irrelevant",
"Other-Sentiment",
"Report-CleanUp",
"Report-EmergingThreats",
"Report-Factoid",
"Report-FirstPartyObservation",
"Report-Hashtags",
"Report-Location",
"Report-MultimediaShare",
"Report-News",
"Report-NewSubEvent",
"Report-Official",
"Report-OriginalEvent",
"Report-ServiceAvailable",
"Report-ThirdPartyObservation",
"Report-Weather",
"Request-GoodsServices",
"Request-InformationWanted",
"Request-SearchAndRescue",
]
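# Note: the order of this list defines the category index used when decoding a
# run's info_type_labels / info_type_scores arrays below, so the decoding assumes
# it matches the ordering participants used when generating their runs.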
# What we consider to be highly important categories of information
highImportCategories = [
"Request-GoodsServices",
"Request-SearchAndRescue",
"CallToAction-MovePeople",
"Report-EmergingThreats",
"Report-NewSubEvent",
"Report-ServiceAvailable"
]
highImportCategoriesShort = [
"GoodsServices",
"SearchAndRescue",
"MovePeople",
"EmergingThreats",
"NewSubEvent",
"ServiceAvailable"
]
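# Sanity check (illustrative only, not part of the original evaluator): the short
# forms above should simply be the full high-importance names with their
# "Request-"/"CallToAction-"/"Report-" prefixes stripped.
assert [c.split("-")[1] for c in highImportCategories] == highImportCategoriesShort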
# Priority map
priorityScoreMap = {
"Critical": 1.0,
"High": 0.75,
"Medium": 0.5,
"Low": 0.25,
"Unknown": 0.25,
}
# Parameters
var_lambda = 0.75 # weight placed on actionable information categories relative to non-actionable categories
var_alpha = 0.3 # Flat gain for providing a correct alert, regardless of the categories selected
# Events with no data, so we should skip them
# Updated from 2021a and 2021b, so we use *all* data
skipEvents = [
# '2015_09_28_hurricane_joaquin.2015',
# '2017_03_23_cyclone_debbie.2017',
# '2018_02_24_anticyclone_hartmut.2018',
# '2018_07_13_ferguson_wildfire.2018',
# '2018_07_23_cranston_wildfire.2018',
# '2018_09_07_hurricane_florence.2018',
# '2018_10_07_hurricane_michael.2018',
# '2019_09_17_tropicalstorm_imelda.2019',
# '2019_karnataka_floods',
# '2019_spring_floods_in_ontario_quebec_and_new_brunswick',
# '2020_01_28_bar_shooting_nc.2020',
# '2020_02_07_rutherford_tn_floods.2020',
# '2020_05_26_edenville_dam_failure.2020.corrected',
# '2020_08_27_hurricane_laura.2020',
# '2020_09_11_hurricane_sally.2020',
# '2020_afghanistan_flood',
# '2020_hpakant_jade_mine_disaster',
# '2020_kerala_floods',
# 'T2020_02_03_texas_university_shooting.2020',
# 'UNASSIGNED',
# 'indonesia_earthquake.2019'
"2020_05_26_edenville_dam_failure.2020.corrected",
"2018_10_07_hurricane_michael.2018",
"2020_01_28_bar_shooting_nc.2020",
"T2020_02_03_texas_university_shooting.2020",
"2020_02_07_rutherford_tn_floods.2020",
"UNASSIGNED",
"indonesia_earthquake.2019",
"2015_09_28_hurricane_joaquin.2015",
"2017_03_23_cyclone_debbie.2017",
"2018_02_24_anticyclone_hartmut.2018",
"2018_07_13_ferguson_wildfire.2018",
"2018_07_23_cranston_wildfire.2018",
"2018_09_07_hurricane_florence.2018",
"2019_09_17_tropicalstorm_imelda.2019",
"2019_karnataka_floods",
"2019_spring_floods_in_ontario_quebec_and_new_brunswick",
"2020_08_27_hurricane_laura.2020",
"2020_09_11_hurricane_sally.2020",
"2020_afghanistan_flood",
"2020_hpakant_jade_mine_disaster",
"2020_kerala_floods",
]
import glob
runFile = None
for f in glob.glob("*.gz"):
runFile = f
print("Run File:", f)
Run File: run.json.gz
import gzip
import json
runName = None
with gzip.open(runFile, "r") as inRunFile:
for line in inRunFile:
line = line.decode("utf8")
# runName = line.rpartition("\t")[2].strip()
runName = json.loads(line)["runtag"]
break
print("Run Name:", runName)
Run Name: ens
# Do we try to normalize the run priority scores? (overrides the setting above)
enablePriorityNorm = False
dataDir = "../../data/2021b"
# The location of the topics file
topicsFile = "%s/2021a.topics" % dataDir
# The location of the ground truth data against which to compare the run
classificationLabelFiles = [
# "%s/TRECIS-2021A-crisis.labels.prelim.json" % dataDir,
# "%s/TRECIS-2021A-crisis.labels.prelim.pt2.json" % dataDir,
# "%s/TRECIS-crisis.labels.2021b.json" % dataDir,
"%s/TRECIS-crisis.labels.2021.all.json" % dataDir,
]
# The location of the ontology file
ontologyFile = "%s/TRECIS-2021A-ITypes.json" % dataDir
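# For reference, the parsing loop below assumes SGML-style topic entries roughly
# of the following shape (an illustrative sketch; the tag names come from the
# partition calls below and the values from the printed topicArray):
exampleTopicEntry = """
<top>
<num>TRECIS-CTIT-H-076</num>
<dataset>2020_01_27_houston_explosion.2020</dataset>
</top>
"""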
topicArray = []
with open(topicsFile, "r") as inTopicsFile:
topicNum = None
topicDataset = None
for line_ in inTopicsFile:
line = line_.strip()
if line == "</top>":
if topicDataset in skipEvents:
continue
topicArray.append((topicDataset, topicNum))
if line.startswith("<num>"):
topicNum = line.partition("<num>")[2].partition("</num>")[0]
if line.startswith("<dataset>"):
topicDataset = line.partition("<dataset>")[2].partition("</dataset>")[0]
for row in topicArray:
print(row)
('2020_01_27_houston_explosion.2020', 'TRECIS-CTIT-H-076')
('2020_02_10_mideast_tornadoes.day1_mississipi.2020', 'TRECIS-CTIT-H-080')
('2020_02_10_mideast_tornadoes.day2_al.2020', 'TRECIS-CTIT-H-081')
('2020_02_10_mideast_tornadoes.day3_md.2019', 'TRECIS-CTIT-H-082')
('2020_05_06_tn_derecho.2020', 'TRECIS-CTIT-H-083')
('brooklynblockparty_shooting.2019', 'TRECIS-CTIT-H-085')
('2016_puttingal_temple', 'TRECIS-CTIT-H-089')
('2017_12_04_thomas_wildfire.2017', 'TRECIS-CTIT-H-091')
('2017_12_07_lilac_wildfire.2017', 'TRECIS-CTIT-H-092')
('2018_07_23_klamathon_wildfire.2018', 'TRECIS-CTIT-H-096')
('2018_08_05_holy_wildfire.2018', 'TRECIS-CTIT-H-097')
('2018_11_07_Woolsey_wildfire.2018', 'TRECIS-CTIT-H-100')
('2018_maryland_flood', 'TRECIS-CTIT-H-101')
('2018_pittsburgh_synagogue_shooting', 'TRECIS-CTIT-H-102')
('2019_03_01_alberta_wildfire.2019.v2', 'TRECIS-CTIT-H-103')
('2019_08_25_hurricane_dorian.2019', 'TRECIS-CTIT-H-104')
('2019_10_10_saddleridge_wildfire.2019', 'TRECIS-CTIT-H-106')
('2019_10_25_kincade_wildfire.2019', 'TRECIS-CTIT-H-107')
('2019_durham_gas_explosion', 'TRECIS-CTIT-H-108')
('2019_saugus_high_school_shooting', 'TRECIS-CTIT-H-110')
('2019_townsville_flood', 'TRECIS-CTIT-H-112')
('2020_easter_tornado_outbreak', 'TRECIS-CTIT-H-116')
('2020_tornado_outbreak_of_april', 'TRECIS-CTIT-H-119')
('2020_tornado_outbreak_of_march', 'TRECIS-CTIT-H-120')
('2020_visakhapatnam_gas_leak', 'TRECIS-CTIT-H-121')
('tornado_outbreak_of_november_30_december_2018', 'TRECIS-CTIT-H-122')
# --------------------------------------------------
# Static data for the 2021 edition
# --------------------------------------------------
# Identifiers for the test events
eventidTopicidMap = dict(topicArray)
eventIdentifiers = list(eventidTopicidMap.keys())
resultsFile = open(runName+".results.v"+str(version)+"."+edition+".overall.txt","w+")
resultsFile.write("TREC-IS "+edition+" Notebook Evaluator v"+str(version)+"\n")
resultsFile.write("Run: "+runName+" ("+runFile+")"+"\n")
resultsFile.write(""+"\n")
perTopicFile = open(runName+".results.v"+str(version)+"."+edition+".pertopic.txt","w+")
perTopicFile.write("TREC-IS "+edition+" Notebook Evaluator v"+str(version)+"\n")
perTopicFile.write("Run: "+runName+" ("+runFile+")"+"\n")
perTopicFile.write(""+"\n")
perEventFile = open(runName+".results.v"+str(version)+"."+edition+".perevent.txt","w+")
perEventFile.write("TREC-IS "+edition+" Notebook Evaluator v"+str(version)+"\n")
perEventFile.write("Run: "+runName+" ("+runFile+")"+"\n")
perEventFile.write(""+"\n")
# --------------------------------------------------
# Processing Starts Here
# --------------------------------------------------
import json
import gzip
import math
import numpy as np
from pprint import pprint
import matplotlib.pyplot as plt
# --------------------------------------------------
# Stage 1: Load the ground truth dataset
# --------------------------------------------------
groundtruthJSON = []
for groundtruthFile in classificationLabelFiles:
print("Reading "+groundtruthFile)
with open(groundtruthFile, encoding='iso-8859-1') as groundtruthJSONFile:
groundtruthJSON.append(json.load(groundtruthJSONFile))
#pprint(groundtruthJSON["events"])
# --------------------------------------------------
# Stage 2: Load run file
# --------------------------------------------------
with gzip.open(runFile, "r") as openRunFile:
# runContents = [line.decode("utf8") for line in openRunFile.readlines()] # lines not yet decoded
runContents = [json.loads(line.decode("utf8")) for line in openRunFile.readlines()] # decode and parse each line as JSON
#pprint(runContents[0])
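# Each element of runContents is expected to be a JSON object with at least the
# fields accessed below (an illustrative sketch; the field names are taken from
# the code in this notebook, the values here are made up):
exampleRunLine = {
"runtag": "ens", # run name
"topic": "2020_01_27_houston_explosion.2020", # event identifier
"tweet_id": "1221234567890123456", # hypothetical tweet id
"info_type_labels": [0] * len(taskCategories), # one binary flag per category, in taskCategories order
"info_type_scores": [0.0] * len(taskCategories), # one probability per category, in taskCategories order
"priority": 0.5, # priority score
}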
Reading ../../data/2021b/TRECIS-crisis.labels.2021.all.json
# --------------------------------------------------
# Stage 3: Load the categories
# --------------------------------------------------
with open(ontologyFile, encoding='utf-8') as ontologyJSONFile:
ontologyJSON = json.load(ontologyJSONFile)
informationTypes2Index = {} # category -> numerical index
informationTypesShort2Index = {} # category short form (e.g. EmergingThreats rather than Report-EmergingThreats) -> numerical index
for informationTypeJSON in ontologyJSON["informationTypes"]:
informationTypeId = informationTypeJSON["id"]
informationTypeIndex = taskCategories.index(informationTypeId)
informationTypes2Index[informationTypeId] = informationTypeIndex
informationTypesShort2Index[informationTypeId.split("-")[1]] = informationTypeIndex
# -----------------------------------------------------------
# Stage 4: Produce ground truth maps between tweetIds and categories
# -----------------------------------------------------------
# Notes: The ground truth is used as the base; if a run includes tweets
# not in the ground truth, they will be ignored
# Assumptions: A tweet will not be returned for multiple events
tweetId2TRECInfoCategories = {} # tweet id -> Array of categories selected by assessors
tweetId2TRECHighImportInfoCategories = {} # tweet id -> Array of categories selected by assessors
tweetId2TRECLowImportInfoCategories = {} # tweet id -> Array of categories selected by assessors
tweetId2TRECPriorityCategory = {} # tweet id -> priority label (Critical,High,Medium,Low)
index2TweetId = {} # ordered tweets
event2tweetIds = {} # event -> tweet ids for tweets within that event
countHighCriticalImport = 0
countLowMediumImport = 0
tweetsSeen = []
invertedPriorityScoreMap = {
v:k for k,v in priorityScoreMap.items()
}
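# Note: "Low" and "Unknown" both map to 0.25 in priorityScoreMap, so the inverted
# map resolves that tie to whichever key appears last ("Unknown"); merged duplicate
# tweets whose best score is 0.25 therefore come back labelled "Unknown".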
tweetIndex = 0
for groundtruth in groundtruthJSON:
for eventJSON in groundtruth["events"]:
eventid = eventJSON["eventid"]
print(eventid)
if eventid in skipEvents:
continue
if not event2tweetIds.get(eventid):
event2tweetIds[eventid] = []
if any(eventid in s for s in eventIdentifiers):
# iterate over tweets in the event
for tweetJSON in eventJSON["tweets"]:
tweetid = tweetJSON["postID"]
categories = tweetJSON["postCategories"]
priority = tweetJSON["postPriority"]
if priority == "High" or priority == "Critical":
countHighCriticalImport = countHighCriticalImport + 1
if priority == "Low" or priority == "Medium":
countLowMediumImport = countLowMediumImport + 1
# check categories for name issues and correct if possible
cleanedCategories = []
highImportCats = []
lowImportCats = []
for categoryId in categories:
if not any(categoryId in s for s in informationTypesShort2Index.keys()):
# print("Found unknown category in ground truth "+categoryId+", ignoring...")
pass
else:
cleanedCategories.append(categoryId)
if any(categoryId in s for s in highImportCategoriesShort):
highImportCats.append(categoryId)
else:
lowImportCats.append(categoryId)
if tweetid not in tweetsSeen:
event2tweetIds[eventid].append(tweetid)
tweetId2TRECInfoCategories[tweetid] = cleanedCategories
tweetId2TRECHighImportInfoCategories[tweetid] = highImportCats
tweetId2TRECLowImportInfoCategories[tweetid] = lowImportCats
tweetId2TRECPriorityCategory[tweetid] = priority
index2TweetId[tweetIndex] = tweetid
tweetIndex = tweetIndex + 1
tweetsSeen.append(tweetid)
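# If we have already seen this tweet (e.g., it appears in more than one
# ground truth file), merge its category labels and keep the higher priority.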
else:
tweetId2TRECInfoCategories[tweetid] = list(set(
cleanedCategories + tweetId2TRECInfoCategories[tweetid]
))
prePriorityScore = priorityScoreMap[tweetId2TRECPriorityCategory[tweetid]]
thisPriorityScore = priorityScoreMap[priority]
tweetId2TRECPriorityCategory[tweetid] = invertedPriorityScoreMap[
max(prePriorityScore, thisPriorityScore)
]
else:
print("WARN: Found ground truth data for event not in the topic set "+eventid+", ignoring...")
2020_01_27_houston_explosion.2020
2020_01_28_bar_shooting_nc.2020
T2020_02_03_texas_university_shooting.2020
2020_02_07_rutherford_tn_floods.2020
2020_02_10_mideast_tornadoes.day1_mississipi.2020
2020_02_10_mideast_tornadoes.day2_al.2020
2020_02_10_mideast_tornadoes.day3_md.2019
2020_05_06_tn_derecho.2020
2020_05_26_edenville_dam_failure.2020.corrected
brooklynblockparty_shooting.2019
UNASSIGNED
indonesia_earthquake.2019
2015_09_28_hurricane_joaquin.2015
2016_puttingal_temple
2017_03_23_cyclone_debbie.2017
2017_12_04_thomas_wildfire.2017
2017_12_07_lilac_wildfire.2017
2018_02_24_anticyclone_hartmut.2018
2018_07_13_ferguson_wildfire.2018
2018_07_23_cranston_wildfire.2018
2018_07_23_klamathon_wildfire.2018
2018_08_05_holy_wildfire.2018
2018_09_07_hurricane_florence.2018
2018_10_07_hurricane_michael.2018
2018_11_07_Woolsey_wildfire.2018
2018_maryland_flood
2018_pittsburgh_synagogue_shooting
2019_03_01_alberta_wildfire.2019.v2
2019_08_25_hurricane_dorian.2019
2019_09_17_tropicalstorm_imelda.2019
2019_10_10_saddleridge_wildfire.2019
2019_10_25_kincade_wildfire.2019
2019_durham_gas_explosion
2019_karnataka_floods
2019_saugus_high_school_shooting
2019_spring_floods_in_ontario_quebec_and_new_brunswick
2019_townsville_flood
2020_08_27_hurricane_laura.2020
2020_09_11_hurricane_sally.2020
2020_afghanistan_flood
2020_easter_tornado_outbreak
2020_hpakant_jade_mine_disaster
2020_kerala_floods
2020_tornado_outbreak_of_april
2020_tornado_outbreak_of_march
2020_visakhapatnam_gas_leak
tornado_outbreak_of_november_30_december_2018
# -----------------------------------------------------------
# Stage 5: Produce run predicted maps between tweetIds and categories
# -----------------------------------------------------------
tweetId2RunInfoCategories = {} # tweet id -> categories predicted by the participant system
tweetId2RunHighImportInfoCategories = {} # tweet id -> predicted high-importance categories
tweetId2RunLowImportInfoCategories = {} # tweet id -> predicted low-importance categories
tweetId2RunInfoCategoriesProb = {} # tweet id -> predicted category probabilities
tweetId2RunInfoCategoriesProbNorm = {} # tweet id -> normalized predicted category probabilities
tweetId2RunPriorityScore = {} # tweet id -> importance score from the participant system
tweetId2RunPriorityCategory = {} # tweet id -> importance category (Critical, High, Medium, Low)
tweetId2RunPriorityScoreNorm = {} # tweet id -> normalized importance score
event2TweetIdRank = {} # event -> list of (tweetid, rank) tuples
maxPrediction = -999999
minPrediction = 999999
maxCategory = -999999
minCategory = 999999
for predictionParts in runContents:
#print(predictionParts)
if (len(predictionParts) < 6): # skip malformed entries missing the expected fields
print(predictionParts)
continue
else:
eventId = predictionParts["topic"]
if eventId in skipEvents:
continue
tweetId = predictionParts["tweet_id"]
rank = 0
#print(predictionParts[5])
category_scores = predictionParts["info_type_scores"]
category_labels = predictionParts["info_type_labels"]
priority = float(predictionParts["priority"])
if priority > maxPrediction:
maxPrediction = priority
if priority < minPrediction:
minPrediction = priority
cleanedCategories = []
cleanedCategoriesProbs = []
highImportCats = []
lowImportCats = []
# Handle category flags
for catIndex, categoryLabel in enumerate(category_labels):
# check if we have a binary flag for this label
if categoryLabel == 0:
# False flag, so skip
continue
categoryId = taskCategories[catIndex]
if not any(categoryId in s for s in informationTypes2Index.keys()):
print("Found unknown category in run "+categoryId+", ignoring...")
else:
cleanedCategories.append(categoryId)
if any(categoryId in s for s in highImportCategories):
highImportCats.append(categoryId)
else:
lowImportCats.append(categoryId)
# Process category probabilities
for categoryProbability in category_scores:
if categoryProbability > maxCategory:
maxCategory = categoryProbability
if categoryProbability < minCategory:
minCategory = categoryProbability
cleanedCategoriesProbs.append(categoryProbability)
tweetId2RunHighImportInfoCategories[tweetId] = highImportCats
tweetId2RunLowImportInfoCategories[tweetId] = lowImportCats
tweetId2RunInfoCategories[tweetId] = cleanedCategories
tweetId2RunInfoCategoriesProb[tweetId] = cleanedCategoriesProbs
tweetId2RunPriorityScore[tweetId] = priority
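# Bucket the numeric priority into a label using the priorityScoreMap thresholds:
# > 0.75 -> Critical, > 0.5 -> High, > 0.25 -> Medium, otherwise Low.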
if priority > priorityScoreMap["High"]:
tweetId2RunPriorityCategory[tweetId] = "Critical"
elif priority > priorityScoreMap["Medium"]:
tweetId2RunPriorityCategory[tweetId] = "High"
elif priority > priorityScoreMap["Low"]:
tweetId2RunPriorityCategory[tweetId] = "Medium"
else:
tweetId2RunPriorityCategory[tweetId] = "Low"
if not event2TweetIdRank.get(eventId):
event2TweetIdRank[eventId] = []
rankTuple = (tweetId,rank)
event2TweetIdRank.get(eventId).append(rankTuple)
for eventId in event2TweetIdRank.keys():
tweetsSorted = sorted(event2TweetIdRank.get(eventId), key=lambda tup: tup[1])
event2TweetIdRank[eventId] = tweetsSorted
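# Note: rank is hard-coded to 0 above, so this sort is effectively a no-op and
# tweets keep the order in which they appear in the run file.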
for i in range(len(index2TweetId)):
tweetId = index2TweetId[i]
if tweetId2RunPriorityScore.get(tweetId):
if enablePriorityNorm:
if (maxPrediction-minPrediction) == 0.0:
tweetId2RunPriorityScoreNorm[tweetId] = 0.0
else:
tweetId2RunPriorityScoreNorm[tweetId] = (tweetId2RunPriorityScore.get(tweetId)-minPrediction)/(maxPrediction-minPrediction)
else:
tweetId2RunPriorityScoreNorm[tweetId] = tweetId2RunPriorityScore.get(tweetId)
else:
tweetId2RunPriorityScoreNorm[tweetId] = 0.0
# --------------------------------------------------
# Stage 6: Create ground truth vectors per category
# --------------------------------------------------
category2GroundTruth = {} # category -> tweet vector with binary 1 vs all ground truth category labels
for categoryId in informationTypes2Index.keys():
categoryIdShort = categoryId.split("-")[1]
categoryVector = []
for i in range(len(index2TweetId)):
tweetId = index2TweetId[i]
categories = tweetId2TRECInfoCategories.get(tweetId)
#pprint(categories)
if any(categoryIdShort in s for s in categories):
categoryVector.append(1)
else:
categoryVector.append(0)
category2GroundTruth[categoryId] = categoryVector
#pprint(category2GroundTruth)
# --------------------------------------------------
# Stage 7: Create run vectors per category
# --------------------------------------------------
# Assumptions: If run misses a tweet, we assume it has
# no categories
category2Predicted = {} # category -> tweet vector with binary 1 vs all predicted by system labels
for categoryId in informationTypes2Index.keys():
categoryIdShort = categoryId.split("-")[1]
categoryVector = []
for i in range(len(index2TweetId)):
tweetId = index2TweetId[i]
if tweetId2RunInfoCategories.get(tweetId):
categories = tweetId2RunInfoCategories.get(tweetId)
if any(categoryIdShort in s for s in categories):
categoryVector.append(1)
else:
categoryVector.append(0)
else:
categoryVector.append(0)
category2Predicted[categoryId] = categoryVector
#pprint(category2Predicted)
# --------------------------------------------------
# Stage 8: Make event category vectors
# --------------------------------------------------
event2groundtruth = {} # event -> category -> tweet vector with binary 1 vs all ground truth category labels
for eventId in eventIdentifiers:
eventCategories = {}
for categoryId in informationTypes2Index.keys():
categoryIdShort = categoryId.split("-")[1]
categoryVector = []
# print(eventId)
for tweetId in event2tweetIds.get(eventId):
# print(tweetId)
categories = tweetId2TRECInfoCategories.get(tweetId)
if any(categoryIdShort in s for s in categories):
categoryVector.append(1)
else:
categoryVector.append(0)
eventCategories[categoryId] = categoryVector
event2groundtruth[eventId] = eventCategories
event2prediction = {} # event -> category -> tweet vector with binary 1 vs all predicted by system labels
for eventId in eventIdentifiers:
print(eventId)
eventCategories = {}
for categoryId in informationTypes2Index.keys():
categoryIdShort = categoryId.split("-")[1]
categoryVector = []
# print(tweetId)
for tweetId in event2tweetIds.get(eventId):
#print(tweetId)
categories = tweetId2RunInfoCategories.get(tweetId)
if categories is None:
categories = []
tweetId2RunInfoCategories[tweetId] = categories
if any(categoryId in s for s in categories):
categoryVector.append(1)
else:
categoryVector.append(0)
eventCategories[categoryId] = categoryVector
event2prediction[eventId] = eventCategories
2020_01_27_houston_explosion.2020
2020_02_10_mideast_tornadoes.day1_mississipi.2020
2020_02_10_mideast_tornadoes.day2_al.2020
2020_02_10_mideast_tornadoes.day3_md.2019
2020_05_06_tn_derecho.2020
brooklynblockparty_shooting.2019
2016_puttingal_temple
2017_12_04_thomas_wildfire.2017
2017_12_07_lilac_wildfire.2017
2018_07_23_klamathon_wildfire.2018
2018_08_05_holy_wildfire.2018
2018_11_07_Woolsey_wildfire.2018
2018_maryland_flood
2018_pittsburgh_synagogue_shooting
2019_03_01_alberta_wildfire.2019.v2
2019_08_25_hurricane_dorian.2019
2019_10_10_saddleridge_wildfire.2019
2019_10_25_kincade_wildfire.2019
2019_durham_gas_explosion
2019_saugus_high_school_shooting
2019_townsville_flood
2020_easter_tornado_outbreak
2020_tornado_outbreak_of_april
2020_tornado_outbreak_of_march
2020_visakhapatnam_gas_leak
tornado_outbreak_of_november_30_december_2018
# -----------------------------------------------------------
# Stage 9: Make priority classification vectors
# -----------------------------------------------------------
category2GroundTruthPriority = {} # category -> vector of ground truth priority labels (Critical/High/Medium/Low) for tweets in that category
for categoryId in informationTypes2Index.keys():
categoryIdShort = categoryId.split("-")[1]
priorityVector = []
for i in range(len(index2TweetId)):
tweetId = index2TweetId[i]
categories = tweetId2TRECInfoCategories.get(tweetId)
if any(categoryIdShort in s for s in categories):
priority = tweetId2TRECPriorityCategory.get(tweetId)
priorityVector.append(priority)
category2GroundTruthPriority[categoryId] = priorityVector
category2PredictedPriority = {} # category -> vector of system-predicted priority labels for tweets in that category
category2PredictedPriorityScore = {} # category -> vector of system-predicted priority scores for tweets in that category
for categoryId in informationTypes2Index.keys():
categoryIdShort = categoryId.split("-")[1]
categoryVector = []
categoryScoreVector = []
for i in range(len(index2TweetId)):
tweetId = index2TweetId[i]
categories = tweetId2TRECInfoCategories.get(tweetId)
if any(categoryIdShort in s for s in categories):
if tweetId2RunPriorityCategory.get(tweetId):
priority = tweetId2RunPriorityCategory.get(tweetId)
priorityScore = tweetId2RunPriorityScore.get(tweetId)
categoryVector.append(priority)
categoryScoreVector.append(priorityScore)
else:
categoryVector.append("Low") # default to low priority
categoryScoreVector.append(0.25)
category2PredictedPriority[categoryId] = categoryVector
category2PredictedPriorityScore[categoryId] = categoryScoreVector
# --------------------------------------------------
# Disable Warnings (comment this out when debugging!)
# --------------------------------------------------
import warnings
# warnings.filterwarnings("ignore") # ignore warnings about 0-score categories
# --------------------------------------------------
# TREC-IS 2021A
# Priority-Centric Discounted Cumulative Gain
# --------------------------------------------------
import pandas as pd
def calc_dcg(scores, at_k=100):
position = 1
accumulator = 0.0
for score in scores[:at_k]:
numerator = 2 ** score - 1
denom = np.log2(position + 1)
accumulator += numerator / denom
position += 1
return accumulator
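# Quick worked example of calc_dcg on a toy relevance list (illustrative only):
# calc_dcg([4, 3, 1]) = (2**4 - 1)/log2(2) + (2**3 - 1)/log2(3) + (2**1 - 1)/log2(4)
#                     = 15.0 + ~4.42 + 0.5, or about 19.92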
priority_map = {
"Unknown": 1,
"Low": 1,
"Medium": 2,
"High": 3,
"Critical": 4,
}
at_k = 100
tweetId2TRECPriorityCategory_score = {
k:priority_map[v] for k,v in tweetId2TRECPriorityCategory.items()
}
tweetId2TRECPriorityCategory_scores_sorted = sorted(
tweetId2TRECPriorityCategory_score.values(),
reverse=True
)
best_dcg_per_event = {}
for event, rel_tweets in event2tweetIds.items():
print(event)
tweetId2TRECPriorityCategory_scores_sorted = sorted(
[tweetId2TRECPriorityCategory_score[x] for x in rel_tweets],
reverse=True
)
ideal_dcg = calc_dcg(tweetId2TRECPriorityCategory_scores_sorted, at_k)
print("\tBest DCG:", ideal_dcg)
best_dcg_per_event[event] = ideal_dcg
print("Mean:", np.mean(list(best_dcg_per_event.values())))
print()
# Code below calculates the DCG for a system's
# ranked priority tweets. We have to do some
# sampling here to break ties among tweets with
# the same priority scores.
# Build a dataframe from the system's provided
# priority scores, so we can identify what the
# top-most priorities are and get a count of
# the number of tweets in each priority bin.
priority_df = pd.DataFrame(
[(k, priority_map[v]) for k, v in tweetId2RunPriorityCategory.items()],
columns=["tweet_id", "priority"]
)
# Build metrics for each event
system_dcg_per_event = {}
for event, rel_tweets in event2tweetIds.items():
print("Event:", event)
local_priority_df = priority_df[priority_df["tweet_id"].isin(set(rel_tweets))]
unique_scores = local_priority_df["priority"].value_counts()
# Find the top priority scores that would be included
# in the necessary at_k values.
total = 0
top_keys = []
candidates = {}
for top in sorted(unique_scores.index, reverse=True):
# We store this key, so we can go back and shuffle
#. tweets with this score.
top_keys.append(top)
local_restricted_df = local_priority_df[local_priority_df["priority"] == top]
candidates[top] = list(local_restricted_df["tweet_id"])
total += local_restricted_df.shape[0]
# Once we have enough samples, stop.
if ( total > at_k ):
break
# Now we generate a distribution over the DCG for this
# system, repeating the sampling a number of times to remove
# dependence on our selection of the top k tweets
random_dcgs = []
for i in range(100):
local_tweet_ids = []
for top in top_keys:
this_top_tweets = candidates[top][:]
np.random.shuffle(this_top_tweets)
needed = at_k - len(local_tweet_ids)
local_tweet_ids.extend(this_top_tweets[:needed])
local_scores = [tweetId2TRECPriorityCategory_score[x] for x in local_tweet_ids]
random_dcgs.append(calc_dcg(local_scores))
system_dcg = np.mean(random_dcgs)
system_ndcg_ = system_dcg / best_dcg_per_event[event]
print("\tnDCG:", system_ndcg_)
system_dcg_per_event[event] = system_ndcg_
print()
system_ndcg_micro = np.mean(list(system_dcg_per_event.values()))
print("System Event-Micro nDCG:", system_ndcg_micro)
resultsFile.write("--------------------------------------------------"+"\n")
resultsFile.write("EVALUATON: nDCG and Priority"+"\n")
resultsFile.write("Overall performance"+"\n")
resultsFile.write("--------------------------------------------------"+"\n")
resultsFile.write("> nDCG:"+"\t"+str(system_ndcg_micro)+"\n")
resultsFile.write(""+"\n")
2020_01_27_houston_explosion.2020  Best DCG: 176.99559032459564
2020_02_10_mideast_tornadoes.day1_mississipi.2020  Best DCG: 268.88459894996123
2020_02_10_mideast_tornadoes.day2_al.2020  Best DCG: 270.1716952398847
2020_02_10_mideast_tornadoes.day3_md.2019  Best DCG: 135.38775246204446
2020_05_06_tn_derecho.2020  Best DCG: 167.06354661312534
brooklynblockparty_shooting.2019  Best DCG: 179.1756130795261
2016_puttingal_temple  Best DCG: 314.08006311421406
2017_12_04_thomas_wildfire.2017  Best DCG: 300.71399384300895
2017_12_07_lilac_wildfire.2017  Best DCG: 314.08006311421406
2018_07_23_klamathon_wildfire.2018  Best DCG: 221.46334445469358
2018_08_05_holy_wildfire.2018  Best DCG: 153.96993418707177
2018_11_07_Woolsey_wildfire.2018  Best DCG: 175.67469323453255
2018_maryland_flood  Best DCG: 285.7119531591263
2018_pittsburgh_synagogue_shooting  Best DCG: 111.85075929877581
2019_03_01_alberta_wildfire.2019.v2  Best DCG: 62.88708564345522
2019_08_25_hurricane_dorian.2019  Best DCG: 146.57069611996656
2019_10_10_saddleridge_wildfire.2019  Best DCG: 173.00802656786584
2019_10_25_kincade_wildfire.2019  Best DCG: 314.08006311421406
2019_durham_gas_explosion  Best DCG: 201.07148118577902
2019_saugus_high_school_shooting  Best DCG: 314.08006311421406
2019_townsville_flood  Best DCG: 314.08006311421406
2020_easter_tornado_outbreak  Best DCG: 214.9714167256293
2020_tornado_outbreak_of_april  Best DCG: 314.08006311421406
2020_tornado_outbreak_of_march  Best DCG: 267.51977363880474
2020_visakhapatnam_gas_leak  Best DCG: 314.08006311421406
tornado_outbreak_of_november_30_december_2018  Best DCG: 314.08006311421406
Mean: 231.7589407554446

Event: 2020_01_27_houston_explosion.2020  nDCG: 0.24705265665210815
Event: 2020_02_10_mideast_tornadoes.day1_mississipi.2020  nDCG: 0.41181498699377
Event: 2020_02_10_mideast_tornadoes.day2_al.2020  nDCG: 0.4635021433763508
Event: 2020_02_10_mideast_tornadoes.day3_md.2019  nDCG: 0.35573799327240435
Event: 2020_05_06_tn_derecho.2020  nDCG: 0.5905942210943509
Event: brooklynblockparty_shooting.2019  nDCG: 0.14455035294463595
Event: 2016_puttingal_temple  nDCG: 0.29964101255295456
Event: 2017_12_04_thomas_wildfire.2017  nDCG: 0.40168381636144224
Event: 2017_12_07_lilac_wildfire.2017  nDCG: 0.37507821937955427
Event: 2018_07_23_klamathon_wildfire.2018  nDCG: 0.5582142123214363
Event: 2018_08_05_holy_wildfire.2018  nDCG: 0.41724774420245725
Event: 2018_11_07_Woolsey_wildfire.2018  nDCG: 0.35425413701660324
Event: 2018_maryland_flood  nDCG: 0.3567802480754506
Event: 2018_pittsburgh_synagogue_shooting  nDCG: 0.9038488929651307
Event: 2019_03_01_alberta_wildfire.2019.v2  nDCG: 0.332956610407976
Event: 2019_08_25_hurricane_dorian.2019  nDCG: 0.38732858346211035
Event: 2019_10_10_saddleridge_wildfire.2019  nDCG: 0.5684541510465869
Event: 2019_10_25_kincade_wildfire.2019  nDCG: 0.590088706667267
Event: 2019_durham_gas_explosion  nDCG: 0.22110283869962846
Event: 2019_saugus_high_school_shooting  nDCG: 0.4479446140589797
Event: 2019_townsville_flood  nDCG: 0.6733553385045296
Event: 2020_easter_tornado_outbreak  nDCG: 0.5258689964592105
Event: 2020_tornado_outbreak_of_april  nDCG: 0.6473238895392965
Event: 2020_tornado_outbreak_of_march  nDCG: 0.34489922509514587
Event: 2020_visakhapatnam_gas_leak  nDCG: 0.4643018131832162
Event: tornado_outbreak_of_november_30_december_2018  nDCG: 0.9209722784791589
System Event-Micro nDCG: 0.46171529549275986
# --------------------------------------------------
# TREC-IS 2021-A
# Information Type Categorization
# Overall performance
# --------------------------------------------------
# Average performance over information types
# Macro averaged (information types have equal weight)
# Does not average across events (larger events have more impact)
# Positive class is the target class
# Precision, recall and F1 only consider the positive class
# Accuracy is an overall metric
# We report performance for all categories, high importance categories and low importance categories
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
avgPrecision = 0.0
avgRecall = 0.0
avgF1 = 0.0
avgAccuracy = 0.0
avgPrecisionHigh = 0.0
avgRecallHigh = 0.0
avgF1High = 0.0
avgAccuracyHigh = 0.0
avgPrecisionLow = 0.0
avgRecallLow = 0.0
avgF1Low = 0.0
avgAccuracyLow = 0.0
for categoryId in informationTypes2Index.keys():
categoryPrecision = precision_score(category2GroundTruth[categoryId], category2Predicted[categoryId], average='binary')
categoryRecall = recall_score(category2GroundTruth[categoryId], category2Predicted[categoryId], average='binary')
categoryF1 = f1_score(category2GroundTruth[categoryId], category2Predicted[categoryId], average='binary')
categoryAccuracy = accuracy_score(category2GroundTruth[categoryId], category2Predicted[categoryId])
avgPrecision = avgPrecision + categoryPrecision
avgRecall = avgRecall + categoryRecall
avgF1 = avgF1 + categoryF1
avgAccuracy = avgAccuracy + categoryAccuracy
if any(categoryId in s for s in highImportCategories):
avgPrecisionHigh = avgPrecisionHigh + categoryPrecision
avgRecallHigh = avgRecallHigh + categoryRecall
avgF1High = avgF1High + categoryF1
avgAccuracyHigh = avgAccuracyHigh + categoryAccuracy
else:
avgPrecisionLow = avgPrecisionLow + categoryPrecision
avgRecallLow = avgRecallLow + categoryRecall
avgF1Low = avgF1Low + categoryF1
avgAccuracyLow = avgAccuracyLow + categoryAccuracy
numInformationTypes = len(informationTypes2Index)
numHighInformationTypes = len(highImportCategories)
numLowInformationTypes = numInformationTypes - numHighInformationTypes
print("Information Type Precision (positive class, multi-type, macro): "+str(avgPrecision/numInformationTypes))
print("Information Type Recall (positive class, multi-type, macro): "+str(avgRecall/numInformationTypes))
print("Information Type F1 (positive class, multi-type, macro): "+str(avgF1/numInformationTypes))
print("Information Type Accuracy (overall, multi-type, macro): "+str(avgAccuracy/numInformationTypes))
print("High Importance Information Type Precision (positive class, multi-type, macro): "+str(avgPrecisionHigh/numHighInformationTypes))
print("High Importance Information Type Recall (positive class, multi-type, macro): "+str(avgRecallHigh/numHighInformationTypes))
print("High Importance Information Type F1 (positive class, multi-type, macro): "+str(avgF1High/numHighInformationTypes))
print("High Importance Information Type Accuracy (overall, multi-type, macro): "+str(avgAccuracyHigh/numHighInformationTypes))
print("Low Importance Information Type Precision (positive class, multi-type, macro): "+str(avgPrecisionLow/numLowInformationTypes))
print("Low Importance Information Type Recall (positive class, multi-type, macro): "+str(avgRecallLow/numLowInformationTypes))
print("Low Importance Information Type F1 (positive class, multi-type, macro): "+str(avgF1Low/numLowInformationTypes))
print("Low Importance Information Type Accuracy (overall, multi-type, macro): "+str(avgAccuracyLow/numLowInformationTypes))
resultsFile.write("--------------------------------------------------"+"\n")
resultsFile.write("EVALUATON: Information Type Categorization"+"\n")
resultsFile.write("Overall performance"+"\n")
resultsFile.write("--------------------------------------------------"+"\n")
resultsFile.write("> Information Type Precision (positive class, multi-type, macro):"+"\t"+str(avgPrecision/len(informationTypes2Index))+"\n")
resultsFile.write("> Information Type Recall (positive class, multi-type, macro):"+"\t"+str(avgRecall/len(informationTypes2Index))+"\n")
resultsFile.write("> Information Type F1 (positive class, multi-type, macro):"+"\t"+str(avgF1/len(informationTypes2Index))+"\n")
resultsFile.write("> Information Type Accuracy (overall, multi-type, macro):"+"\t"+str(avgAccuracy/len(informationTypes2Index))+"\n")
resultsFile.write("> High Importance Information Type Precision (positive class, multi-type, macro):"+"\t"+str(avgPrecisionHigh/numHighInformationTypes)+"\n")
resultsFile.write("> High Importance Information Type Recall (positive class, multi-type, macro):"+"\t"+str(avgRecallHigh/numHighInformationTypes)+"\n")
resultsFile.write("> High Importance Information Type F1 (positive class, multi-type, macro):"+"\t"+str(avgF1High/numHighInformationTypes)+"\n")
resultsFile.write("> High Importance Information Type Accuracy (overall, multi-type, macro):"+"\t"+str(avgAccuracyHigh/numHighInformationTypes)+"\n")
resultsFile.write("> Low Importance Information Type Precision (positive class, multi-type, macro):"+"\t"+str(avgPrecisionLow/numLowInformationTypes)+"\n")
resultsFile.write("> Low Importance Information Type Recall (positive class, multi-type, macro):"+"\t"+str(avgRecallLow/numLowInformationTypes)+"\n")
resultsFile.write("> Low Importance Information Type F1 (positive class, multi-type, macro):"+"\t"+str(avgF1Low/numLowInformationTypes)+"\n")
resultsFile.write("> Low Importance Information Type Accuracy (overall, multi-type, macro):"+"\t"+str(avgAccuracyLow/numLowInformationTypes)+"\n")
resultsFile.write(""+"\n")
Information Type Precision (positive class, multi-type, macro): 0.24369271856320984
Information Type Recall (positive class, multi-type, macro): 0.4298112854341195
Information Type F1 (positive class, multi-type, macro): 0.2923145682196217
Information Type Accuracy (overall, multi-type, macro): 0.8685443117311035
High Importance Information Type Precision (positive class, multi-type, macro): 0.2473093410887588
High Importance Information Type Recall (positive class, multi-type, macro): 0.33132986454368046
High Importance Information Type F1 (positive class, multi-type, macro): 0.2670083076252519
High Importance Information Type Accuracy (overall, multi-type, macro): 0.9537900900739573
Low Importance Information Type Precision (positive class, multi-type, macro): 0.2425506272393523
Low Importance Information Type Recall (positive class, multi-type, macro): 0.46091068150478465
Low Importance Information Type F1 (positive class, multi-type, macro): 0.30030601893363323
Low Importance Information Type Accuracy (overall, multi-type, macro): 0.8416245922544124
# --------------------------------------------------
# TREC-IS 2021-A
# Information Type Categorization
# Per Information Type Performance
# --------------------------------------------------
# Per Category Classification Performance with confusion matrices
# Performance on the target class is what we care about here,
# primarily with respect to recall, as we want the user to
# see all of the information for a given category. A small
# amount of noise being added to the feed is an acceptable
# cost for good recall.
#
# Does not average across events (larger events have more impact)
from sklearn.metrics import classification_report
perTopicFile.write("--------------------------------------------------"+"\n")
perTopicFile.write("EVALUATON: Information Type Categorization (Multi-type)"+"\n")
perTopicFile.write("Per Information Type Performance"+"\n")
perTopicFile.write("--------------------------------------------------"+"\n")
for categoryId in informationTypes2Index.keys():
target_names = ['Other Classes', categoryId]
try:
print(categoryId)
print(classification_report(category2GroundTruth[categoryId], category2Predicted[categoryId], target_names=target_names))
perTopicFile.write(categoryId+"\n")
perTopicFile.write(classification_report(category2GroundTruth[categoryId], category2Predicted[categoryId], target_names=target_names)+"\n")
perTopicFile.write(""+"\n")
except ValueError:
print("Category "+categoryId+" score calculation failed, likely due the category not being used by the run")
perTopicFile.write(""+"\n")
CallToAction-Donations precision recall f1-score support Other Classes 1.00 0.98 0.99 55275 CallToAction-Donations 0.26 0.66 0.37 568 accuracy 0.98 55843 macro avg 0.63 0.82 0.68 55843 weighted avg 0.99 0.98 0.98 55843 CallToAction-MovePeople precision recall f1-score support Other Classes 0.98 0.99 0.99 54646 CallToAction-MovePeople 0.46 0.31 0.37 1197 accuracy 0.98 55843 macro avg 0.72 0.65 0.68 55843 weighted avg 0.97 0.98 0.98 55843 CallToAction-Volunteer precision recall f1-score support Other Classes 1.00 0.99 0.99 55543 CallToAction-Volunteer 0.21 0.37 0.27 300 accuracy 0.99 55843 macro avg 0.60 0.68 0.63 55843 weighted avg 0.99 0.99 0.99 55843 Other-Advice precision recall f1-score support Other Classes 0.97 0.94 0.95 52602 Other-Advice 0.33 0.44 0.38 3241 accuracy 0.92 55843 macro avg 0.65 0.69 0.67 55843 weighted avg 0.93 0.92 0.92 55843 Other-ContextualInformation precision recall f1-score support Other Classes 0.97 0.95 0.96 54346 Other-ContextualInformation 0.03 0.07 0.05 1497 accuracy 0.93 55843 macro avg 0.50 0.51 0.50 55843 weighted avg 0.95 0.93 0.94 55843 Other-Discussion precision recall f1-score support Other Classes 0.99 0.89 0.94 55263 Other-Discussion 0.02 0.27 0.05 580 accuracy 0.88 55843 macro avg 0.51 0.58 0.49 55843 weighted avg 0.98 0.88 0.93 55843 Other-Irrelevant precision recall f1-score support Other Classes 0.57 0.80 0.66 23267 Other-Irrelevant 0.80 0.56 0.66 32576 accuracy 0.66 55843 macro avg 0.68 0.68 0.66 55843 weighted avg 0.70 0.66 0.66 55843 Other-Sentiment precision recall f1-score support Other Classes 0.95 0.86 0.91 51270 Other-Sentiment 0.25 0.51 0.34 4573 accuracy 0.83 55843 macro avg 0.60 0.69 0.62 55843 weighted avg 0.89 0.83 0.86 55843 Report-CleanUp precision recall f1-score support Other Classes 1.00 0.98 0.99 55581 Report-CleanUp 0.10 0.39 0.17 262 accuracy 0.98 55843 macro avg 0.55 0.69 0.58 55843 weighted avg 0.99 0.98 0.99 55843 Report-EmergingThreats precision recall f1-score support Other Classes 0.96 0.89 0.92 52454 Report-EmergingThreats 0.21 0.45 0.28 3389 accuracy 0.86 55843 macro avg 0.58 0.67 0.60 55843 weighted avg 0.92 0.86 0.89 55843 Report-Factoid precision recall f1-score support Other Classes 0.95 0.89 0.92 49844 Report-Factoid 0.39 0.59 0.47 5999 accuracy 0.86 55843 macro avg 0.67 0.74 0.69 55843 weighted avg 0.89 0.86 0.87 55843 Report-FirstPartyObservation precision recall f1-score support Other Classes 0.97 0.92 0.95 54135 Report-FirstPartyObservation 0.08 0.23 0.12 1708 accuracy 0.90 55843 macro avg 0.53 0.57 0.53 55843 weighted avg 0.95 0.90 0.92 55843 Report-Hashtags precision recall f1-score support Other Classes 0.92 0.74 0.82 48407 Report-Hashtags 0.26 0.59 0.36 7436 accuracy 0.72 55843 macro avg 0.59 0.67 0.59 55843 weighted avg 0.83 0.72 0.76 55843 Report-Location precision recall f1-score support Other Classes 0.86 0.56 0.68 41325 Report-Location 0.38 0.75 0.50 14518 accuracy 0.61 55843 macro avg 0.62 0.66 0.59 55843 weighted avg 0.74 0.61 0.64 55843 Report-MultimediaShare precision recall f1-score support Other Classes 0.93 0.64 0.76 48784 Report-MultimediaShare 0.21 0.65 0.32 7059 accuracy 0.64 55843 macro avg 0.57 0.65 0.54 55843 weighted avg 0.84 0.64 0.70 55843 Report-News precision recall f1-score support Other Classes 0.95 0.74 0.83 50324 Report-News 0.21 0.64 0.32 5519 accuracy 0.73 55843 macro avg 0.58 0.69 0.57 55843 weighted avg 0.88 0.73 0.78 55843 Report-NewSubEvent precision recall f1-score support Other Classes 0.98 0.96 0.97 54728 Report-NewSubEvent 0.06 0.12 0.08 1115 accuracy 0.95 55843 macro 
avg 0.52 0.54 0.53 55843 weighted avg 0.96 0.95 0.96 55843 Report-Official precision recall f1-score support Other Classes 0.96 0.96 0.96 53203 Report-Official 0.20 0.18 0.19 2640 accuracy 0.93 55843 macro avg 0.58 0.57 0.58 55843 weighted avg 0.92 0.93 0.92 55843 Report-OriginalEvent precision recall f1-score support Other Classes 0.95 0.96 0.95 52838 Report-OriginalEvent 0.09 0.07 0.08 3005 accuracy 0.91 55843 macro avg 0.52 0.52 0.52 55843 weighted avg 0.90 0.91 0.91 55843 Report-ServiceAvailable precision recall f1-score support Other Classes 0.98 0.97 0.97 53834 Report-ServiceAvailable 0.37 0.50 0.42 2009 accuracy 0.95 55843 macro avg 0.67 0.73 0.70 55843 weighted avg 0.96 0.95 0.95 55843 Report-ThirdPartyObservation precision recall f1-score support Other Classes 0.93 0.71 0.81 50379 Report-ThirdPartyObservation 0.15 0.48 0.23 5464 accuracy 0.69 55843 macro avg 0.54 0.60 0.52 55843 weighted avg 0.85 0.69 0.75 55843 Report-Weather precision recall f1-score support Other Classes 0.98 0.86 0.92 50824 Report-Weather 0.36 0.79 0.49 5019 accuracy 0.86 55843 macro avg 0.67 0.82 0.70 55843 weighted avg 0.92 0.86 0.88 55843 Request-GoodsServices precision recall f1-score support Other Classes 1.00 0.99 0.99 55452 Request-GoodsServices 0.28 0.30 0.29 391 accuracy 0.99 55843 macro avg 0.64 0.65 0.64 55843 weighted avg 0.99 0.99 0.99 55843 Request-InformationWanted precision recall f1-score support Other Classes 0.99 0.98 0.99 55241 Request-InformationWanted 0.27 0.51 0.35 602 accuracy 0.98 55843 macro avg 0.63 0.75 0.67 55843 weighted avg 0.99 0.98 0.98 55843 Request-SearchAndRescue precision recall f1-score support Other Classes 1.00 0.99 1.00 55737 Request-SearchAndRescue 0.10 0.32 0.16 106 accuracy 0.99 55843 macro avg 0.55 0.66 0.58 55843 weighted avg 1.00 0.99 1.00 55843
# --------------------------------------------------
# TREC-IS 2021-A
# Information Type Categorization
# Per Information Type F1 Graph
# --------------------------------------------------
# Per Category Classification Performance
# F1 scores for each information type, graphed
# Does not average across events (larger events have more impact)
N = len(informationTypes2Index)
ind = np.arange(N)
scoresPerCategoryF1 = []
categoryLabels = []
for categoryId in informationTypes2Index.keys():
localF1Score = f1_score(category2GroundTruth[categoryId], category2Predicted[categoryId], average='binary')
print(categoryId, localF1Score)
scoresPerCategoryF1.append(localF1Score)
categoryLabels.append(categoryId)
width = 0.90 # the width of the bars: can also be len(x) sequence
p1 = plt.bar(ind, scoresPerCategoryF1, width)
plt.ylabel('F1 Scores')
plt.title('F1 Scores by Information Type')
plt.xticks(ind, categoryLabels, rotation='vertical')
plt.yticks(np.arange(0, 1, 0.1))
plt.show()
CallToAction-Donations 0.37132901941264307
CallToAction-MovePeople 0.36940110719677904
CallToAction-Volunteer 0.2668269230769231
Other-Advice 0.3787261982928431
Other-ContextualInformation 0.045829514207149404
Other-Discussion 0.04564991848228842
Other-Irrelevant 0.659798650452856
Other-Sentiment 0.33754486719310833
Report-CleanUp 0.16506410256410256
Report-EmergingThreats 0.2839459864966241
Report-Factoid 0.46656976744186046
Report-FirstPartyObservation 0.12065813528336382
Report-Hashtags 0.36028018129377837
Report-Location 0.5004152440712375
Report-MultimediaShare 0.3152376353728358
Report-News 0.31933905078353014
Report-NewSubEvent 0.0831758034026465
Report-Official 0.1890880441205436
Report-OriginalEvent 0.08060726549792156
Report-ServiceAvailable 0.42138630600169064
Report-ThirdPartyObservation 0.23370047965890922
Report-Weather 0.49418095357276937
Request-GoodsServices 0.2881773399014778
Request-InformationWanted 0.3549684089603676
Request-SearchAndRescue 0.15596330275229356
# --------------------------------------------------
# TREC-IS 2021-A
# Information Type Categorization
# Per Event Performance
# --------------------------------------------------
# Categorization performance for each event
# Precision, recall and F1 only consider the positive class
# Accuracy is an overall metric
# We report performance for all categories, high importance categories and low importance categories
# Macro average (categories have equal weight)
perEventFile.write("--------------------------------------------------"+"\n")
perEventFile.write("EVALUATON: Information Type Categorization (Multi-type)"+"\n")
perEventFile.write("Per Event Performance"+"\n")
perEventFile.write("--------------------------------------------------"+"\n")
for eventId in eventIdentifiers:
tavgPrecision = 0.0
tavgRecall = 0.0
tavgF1 = 0.0
tavgAccuracy = 0.0
categoryCount = 0
for categoryId in informationTypes2Index.keys():
if sum(event2groundtruth[eventId].get(categoryId)) == 0:
continue
categoryPrecision = precision_score(event2groundtruth[eventId].get(categoryId), event2prediction[eventId].get(categoryId), average='binary')
categoryRecall = recall_score(event2groundtruth[eventId].get(categoryId), event2prediction[eventId].get(categoryId), average='binary')
categoryF1 = f1_score(event2groundtruth[eventId].get(categoryId), event2prediction[eventId].get(categoryId), average='binary')
categoryAccuracy = accuracy_score(event2groundtruth[eventId].get(categoryId), event2prediction[eventId].get(categoryId))
tavgPrecision = tavgPrecision + categoryPrecision
tavgRecall = tavgRecall + categoryRecall
tavgF1 = tavgF1 + categoryF1
tavgAccuracy = tavgAccuracy + categoryAccuracy
categoryCount += 1
if categoryCount == 0:
print("No categories for event:", eventId)
continue
print(eventId)
print(" Information Type Precision (positive class, multi-type, macro): "+str(tavgPrecision/categoryCount))
print(" Information Type Recall (positive class, multi-type, macro): "+str(tavgRecall/categoryCount))
print(" Information Type F1 (positive class, multi-type, macro): "+str(tavgF1/categoryCount))
print(" Information Type Accuracy (overall, multi-type, macro): "+str(tavgAccuracy/categoryCount))
print("")
perEventFile.write(eventId+"\n")
perEventFile.write(" Information Type Precision (positive class, multi-type, macro): "+str(tavgPrecision/len(informationTypes2Index))+"\n")
perEventFile.write(" Information Type Recall (positive class, multi-type, macro): "+str(tavgRecall/len(informationTypes2Index))+"\n")
perEventFile.write(" Information Type F1 (positive class, multi-type, macro): "+str(tavgF1/len(informationTypes2Index))+"\n")
perEventFile.write(" Information Type Accuracy (overall, multi-type, macro): "+str(tavgAccuracy/len(informationTypes2Index))+"\n")
perEventFile.write("\n")
perEventFile.write("\n")
2020_01_27_houston_explosion.2020 Information Type Precision (positive class, multi-type, macro): 0.18906570906832088 Information Type Recall (positive class, multi-type, macro): 0.41431186478276216 Information Type F1 (positive class, multi-type, macro): 0.20007615731841197 Information Type Accuracy (overall, multi-type, macro): 0.8641207124523607 2020_02_10_mideast_tornadoes.day1_mississipi.2020 Information Type Precision (positive class, multi-type, macro): 0.4980527504231765 Information Type Recall (positive class, multi-type, macro): 0.5669158258733621 Information Type F1 (positive class, multi-type, macro): 0.49637216759451563 Information Type Accuracy (overall, multi-type, macro): 0.8583850931677018
/Users/cbuntain/Development/anaconda3/lib/python3.8/site-packages/sklearn/metrics/_classification.py:1245: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 due to no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result))
2020_02_10_mideast_tornadoes.day2_al.2020 Information Type Precision (positive class, multi-type, macro): 0.21715853791818426 Information Type Recall (positive class, multi-type, macro): 0.46853999618277214 Information Type F1 (positive class, multi-type, macro): 0.25221616643311434 Information Type Accuracy (overall, multi-type, macro): 0.8901758698940999 2020_02_10_mideast_tornadoes.day3_md.2019 Information Type Precision (positive class, multi-type, macro): 0.13667121077503622 Information Type Recall (positive class, multi-type, macro): 0.39924406549341135 Information Type F1 (positive class, multi-type, macro): 0.1593473222461694 Information Type Accuracy (overall, multi-type, macro): 0.8404545454545456 2020_05_06_tn_derecho.2020 Information Type Precision (positive class, multi-type, macro): 0.254035584297931 Information Type Recall (positive class, multi-type, macro): 0.3717208528274767 Information Type F1 (positive class, multi-type, macro): 0.2516933541540216 Information Type Accuracy (overall, multi-type, macro): 0.8723851203501094 brooklynblockparty_shooting.2019 Information Type Precision (positive class, multi-type, macro): 0.13140706423281348 Information Type Recall (positive class, multi-type, macro): 0.523210171864904 Information Type F1 (positive class, multi-type, macro): 0.14855776526727227 Information Type Accuracy (overall, multi-type, macro): 0.8576797645561565 2016_puttingal_temple Information Type Precision (positive class, multi-type, macro): 0.16730457731014606 Information Type Recall (positive class, multi-type, macro): 0.3523575596007596 Information Type F1 (positive class, multi-type, macro): 0.18071716752589534 Information Type Accuracy (overall, multi-type, macro): 0.8550094517958411 2017_12_04_thomas_wildfire.2017 Information Type Precision (positive class, multi-type, macro): 0.28142013098305957 Information Type Recall (positive class, multi-type, macro): 0.4283335292065214 Information Type F1 (positive class, multi-type, macro): 0.31799595539560727 Information Type Accuracy (overall, multi-type, macro): 0.8331632991056642 2017_12_07_lilac_wildfire.2017 Information Type Precision (positive class, multi-type, macro): 0.3214191183158574 Information Type Recall (positive class, multi-type, macro): 0.45893910956330997 Information Type F1 (positive class, multi-type, macro): 0.34377294905818445 Information Type Accuracy (overall, multi-type, macro): 0.846729939603106 2018_07_23_klamathon_wildfire.2018 Information Type Precision (positive class, multi-type, macro): 0.34755115335548836 Information Type Recall (positive class, multi-type, macro): 0.46128757913890067 Information Type F1 (positive class, multi-type, macro): 0.348689384712254 Information Type Accuracy (overall, multi-type, macro): 0.8447636573374716 2018_08_05_holy_wildfire.2018 Information Type Precision (positive class, multi-type, macro): 0.15176496180192312 Information Type Recall (positive class, multi-type, macro): 0.5028524233414902 Information Type F1 (positive class, multi-type, macro): 0.18010556462457053 Information Type Accuracy (overall, multi-type, macro): 0.8920269642026964 2018_11_07_Woolsey_wildfire.2018 Information Type Precision (positive class, multi-type, macro): 0.1634253593130204 Information Type Recall (positive class, multi-type, macro): 0.33118681032365216 Information Type F1 (positive class, multi-type, macro): 0.1870395808692137 Information Type Accuracy (overall, multi-type, macro): 0.8567980414468467 2018_maryland_flood Information Type Precision (positive class, 
multi-type, macro): 0.2940747770542361 Information Type Recall (positive class, multi-type, macro): 0.4904045622118472 Information Type F1 (positive class, multi-type, macro): 0.33620375729361734 Information Type Accuracy (overall, multi-type, macro): 0.8567024824008895 2018_pittsburgh_synagogue_shooting Information Type Precision (positive class, multi-type, macro): 0.47606152375889216 Information Type Recall (positive class, multi-type, macro): 0.533813901130243 Information Type F1 (positive class, multi-type, macro): 0.4208606076413792 Information Type Accuracy (overall, multi-type, macro): 0.7521367521367521 2019_03_01_alberta_wildfire.2019.v2 Information Type Precision (positive class, multi-type, macro): 0.09588696664517639 Information Type Recall (positive class, multi-type, macro): 0.3781742129210757 Information Type F1 (positive class, multi-type, macro): 0.07771503739515583 Information Type Accuracy (overall, multi-type, macro): 0.8090595881293553
/Users/cbuntain/Development/anaconda3/lib/python3.8/site-packages/sklearn/metrics/_classification.py:1245: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 due to no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result))
2019_08_25_hurricane_dorian.2019 Information Type Precision (positive class, multi-type, macro): 0.23850813091385678 Information Type Recall (positive class, multi-type, macro): 0.3292052386036212 Information Type F1 (positive class, multi-type, macro): 0.22305240629786038 Information Type Accuracy (overall, multi-type, macro): 0.8437779618889807
/Users/cbuntain/Development/anaconda3/lib/python3.8/site-packages/sklearn/metrics/_classification.py:1245: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 due to no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result))
2019_10_10_saddleridge_wildfire.2019 Information Type Precision (positive class, multi-type, macro): 0.2548569950523884 Information Type Recall (positive class, multi-type, macro): 0.4443337776629937 Information Type F1 (positive class, multi-type, macro): 0.28466342639620096 Information Type Accuracy (overall, multi-type, macro): 0.8854134084037119 2019_10_25_kincade_wildfire.2019 Information Type Precision (positive class, multi-type, macro): 0.2816872728730423 Information Type Recall (positive class, multi-type, macro): 0.49914309551344394 Information Type F1 (positive class, multi-type, macro): 0.33512903879204087 Information Type Accuracy (overall, multi-type, macro): 0.8734559429041997 2019_durham_gas_explosion Information Type Precision (positive class, multi-type, macro): 0.23401551567907963 Information Type Recall (positive class, multi-type, macro): 0.46522526692525107 Information Type F1 (positive class, multi-type, macro): 0.2870242633216318 Information Type Accuracy (overall, multi-type, macro): 0.8580164726281893 2019_saugus_high_school_shooting Information Type Precision (positive class, multi-type, macro): 0.19995187124936853 Information Type Recall (positive class, multi-type, macro): 0.3536031150627514 Information Type F1 (positive class, multi-type, macro): 0.2297368056857633 Information Type Accuracy (overall, multi-type, macro): 0.880112068244543 2019_townsville_flood Information Type Precision (positive class, multi-type, macro): 0.3089657332795816 Information Type Recall (positive class, multi-type, macro): 0.44869635982138445 Information Type F1 (positive class, multi-type, macro): 0.3231828861692738 Information Type Accuracy (overall, multi-type, macro): 0.8674198017691569 2020_easter_tornado_outbreak Information Type Precision (positive class, multi-type, macro): 0.1501071514205996 Information Type Recall (positive class, multi-type, macro): 0.485594776367891 Information Type F1 (positive class, multi-type, macro): 0.18798202703605815 Information Type Accuracy (overall, multi-type, macro): 0.8397642015005359 2020_tornado_outbreak_of_april Information Type Precision (positive class, multi-type, macro): 0.26145533978956415 Information Type Recall (positive class, multi-type, macro): 0.45054781645477116 Information Type F1 (positive class, multi-type, macro): 0.2792072453241433 Information Type Accuracy (overall, multi-type, macro): 0.8508506284712071 2020_tornado_outbreak_of_march Information Type Precision (positive class, multi-type, macro): 0.1797162665078718 Information Type Recall (positive class, multi-type, macro): 0.4510845422577032 Information Type F1 (positive class, multi-type, macro): 0.1969310547875892 Information Type Accuracy (overall, multi-type, macro): 0.8232646794091173
/Users/cbuntain/Development/anaconda3/lib/python3.8/site-packages/sklearn/metrics/_classification.py:1245: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 due to no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result))
2020_visakhapatnam_gas_leak Information Type Precision (positive class, multi-type, macro): 0.2950390566472158 Information Type Recall (positive class, multi-type, macro): 0.2435544536319234 Information Type F1 (positive class, multi-type, macro): 0.2208546764956771 Information Type Accuracy (overall, multi-type, macro): 0.8413157061431286
tornado_outbreak_of_november_30_december_2018 Information Type Precision (positive class, multi-type, macro): 0.17693214818173578 Information Type Recall (positive class, multi-type, macro): 0.5263029817517529 Information Type F1 (positive class, multi-type, macro): 0.22346950382195596 Information Type Accuracy (overall, multi-type, macro): 0.845751633986928
# --------------------------------------------------
# TREC-IS 2021-B
# Information Type Categorization
# Per Event F1 Graph
# --------------------------------------------------
# Multi-type (1 vs All): Tweets have multiple information types, aim: predict all of them
# Macro average (categories have equal weight)
N = len(eventIdentifiers)
ind = np.arange(N)
scoresPerEventF1 = []
for eventId in eventIdentifiers:
    avgF1_ = 0.0
    for categoryId in informationTypes2Index.keys():
        avgF1_ = avgF1_ + f1_score(event2groundtruth[eventId].get(categoryId), event2prediction[eventId].get(categoryId), average='binary')
    scoresPerEventF1.append(avgF1_ / len(informationTypes2Index))
width = 0.90  # the width of the bars: can also be len(x) sequence
p1 = plt.bar(ind, scoresPerEventF1, width)
plt.ylabel('F1 Scores')
plt.title('F1 Category Scores by Event')
plt.xticks(ind, eventIdentifiers, rotation='vertical')
plt.yticks(np.arange(0, 1, 0.1))
plt.show()
/Users/cbuntain/Development/anaconda3/lib/python3.8/site-packages/sklearn/metrics/_classification.py:1492: UndefinedMetricWarning: F-score is ill-defined and being set to 0.0 due to no true nor predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(
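# The UndefinedMetricWarning messages above are emitted whenever an event has no
# predicted (or no true) positives for some information type. If that noise is
# unwanted, sklearn's zero_division parameter makes the 0.0 fallback explicit and
# silences the warning. A minimal sketch of the same per-event loop, assuming the
# event2groundtruth/event2prediction structures used above (scores are unchanged):
scoresPerEventF1_quiet = []
for eventId in eventIdentifiers:
    avgF1_ = 0.0
    for categoryId in informationTypes2Index.keys():
        avgF1_ += f1_score(event2groundtruth[eventId].get(categoryId),
                           event2prediction[eventId].get(categoryId),
                           average='binary',
                           zero_division=0)  # same 0.0 fallback the warning reports
    scoresPerEventF1_quiet.append(avgF1_ / len(informationTypes2Index))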
# --------------------------------------------------
# TREC-IS 2021-B
# Information Priority Level
# Overall Performance
# --------------------------------------------------
# How divergent is the system from the human priority labels?
# F1 performance over information types, higher is better
# Macro average (categories have equal weight)
from sklearn.metrics import mean_squared_error
priorityAvgf1 = 0.0
priorityAvgf1High = 0.0
priorityAvgf1Low = 0.0
for categoryId in informationTypes2Index.keys():
    groundTruthPriorities = category2GroundTruthPriority[categoryId]
    predictedPriorities = category2PredictedPriority[categoryId]
    f1 = f1_score(groundTruthPriorities, predictedPriorities, average='macro')
    priorityAvgf1 = priorityAvgf1 + f1
    if any(categoryId in s for s in highImportCategories):
        priorityAvgf1High = priorityAvgf1High + f1
    else:
        priorityAvgf1Low = priorityAvgf1Low + f1
print("Priority Label Prediction (F1, macro): "+str(priorityAvgf1/len(informationTypes2Index)))
resultsFile.write("--------------------------------------------------"+"\n")
resultsFile.write("EVALUATION: Information Priority Level"+"\n")
resultsFile.write("Overall Performance"+"\n")
resultsFile.write("--------------------------------------------------"+"\n")
resultsFile.write("> Priority Label Prediction (F1, macro): "+str(priorityAvgf1/len(informationTypes2Index))+"\n")
resultsFile.write("\n")
Priority Label Prediction (F1, macro): 0.26227911153135275
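# The priority-label F1 above treats the assessor labels (Critical/High/Medium/
# Low/Unknown) as unordered classes and macro-averages F1 across them, so rare
# labels count as much as common ones. A toy illustration with invented labels
# (not taken from any run), reusing the f1_score import from above:
toy_gt   = ["High", "Low", "Medium", "Critical", "Low", "Critical"]
toy_pred = ["High", "Medium", "Medium", "Critical", "Low", "Low"]
f1_score(toy_gt, toy_pred, average='macro')  # per-label F1s of 1.0, 0.5, 2/3, 2/3 -> ~0.71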
# --------------------------------------------------
# TREC-IS 2021-B
# Information Priority Level
# Overall Performance
# --------------------------------------------------
# How divergent is the system from the human priority labels?
# Use Pearson correlation here to capture parallel increases
priorityAvgCorr = 0.0
priorityAvgCorrHigh = 0.0
priorityAvgCorrLow = 0.0
for categoryId in informationTypes2Index.keys():
    if categoryId == "Other-Irrelevant":
        continue
    groundTruthPriorities = [priorityScoreMap[x] for x in category2GroundTruthPriority[categoryId]]
    predictedPriorities = category2PredictedPriorityScore[categoryId]
    # Pathological case: when the predictions have no variation, the Pearson
    # correlation is undefined, so leave this category's correlation at 0.0
    this_corr = 0.0
    if np.std(predictedPriorities) > 0.0:
        this_corr = np.corrcoef(groundTruthPriorities, predictedPriorities)[0,1]
    priorityAvgCorr = priorityAvgCorr + this_corr
    if any(categoryId in s for s in highImportCategories):
        priorityAvgCorrHigh = priorityAvgCorrHigh + this_corr
    else:
        priorityAvgCorrLow = priorityAvgCorrLow + this_corr
print("Priority Score Prediction (Pearson): "+str(priorityAvgCorr/(len(informationTypes2Index)-1)))
print("Priority Score Prediction, High (Pearson): "+str(priorityAvgCorrHigh/numHighInformationTypes))
print("Priority Score Prediction, Low (Pearson): "+str(priorityAvgCorrLow/(numLowInformationTypes-1)))
resultsFile.write("--------------------------------------------------"+"\n")
resultsFile.write("EVALUATION: Information Priority Score"+"\n")
resultsFile.write("Correlational Performance"+"\n")
resultsFile.write("--------------------------------------------------"+"\n")
resultsFile.write("> Priority Correlation (Pearson): "+str(priorityAvgCorr/(len(informationTypes2Index)-1))+"\n")
resultsFile.write("> Priority Correlation, High (Pearson): "+str(priorityAvgCorrHigh/numHighInformationTypes)+"\n")
resultsFile.write("> Priority Correlation, Low (Pearson): "+str(priorityAvgCorrLow/(numLowInformationTypes-1))+"\n")
resultsFile.write("\n")
Priority Score Prediction (Pearson): 0.331435585391442
Priority Score Prediction, High (Pearson): 0.2886311718647097
Priority Score Prediction, Low (Pearson): 0.3457037232336862
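# Pearson correlation rewards predicted priority scores that rise and fall with
# the assessor scores even when the absolute values differ, and it is undefined
# when the predictions never vary, which is the pathological case the guard in
# the loop above skips. A toy illustration with invented scores (not from any run):
toy_truth = [1.0, 0.75, 0.25, 0.5]
toy_good  = [0.9, 0.7, 0.1, 0.4]    # tracks the ground truth -> correlation near 1
toy_flat  = [0.5, 0.5, 0.5, 0.5]    # constant predictions -> correlation undefined
print(np.corrcoef(toy_truth, toy_good)[0, 1])  # ~0.996
print(np.std(toy_flat) > 0.0)                  # False, so the loop above keeps 0.0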
# --------------------------------------------------
# TREC-IS 2021-B
# Information Priority Level
# Per Information Type Performance
# --------------------------------------------------
# F1 per information type (macro averaged), higher is better
# Macro average (categories have equal weight)
N = len(informationTypes2Index)
ind = np.arange(N)
priorityCatF1Values = []
categoryLabels = []
for categoryId in informationTypes2Index.keys():
    groundTruthPriorities = category2GroundTruthPriority[categoryId]
    predictedPriorities = category2PredictedPriority[categoryId]
    priorityCatF1 = f1_score(groundTruthPriorities, predictedPriorities, average='macro')
    if math.isnan(priorityCatF1):
        priorityCatF1 = 0.0
    categoryLabels.append(categoryId)
    priorityCatF1Values.append(priorityCatF1)
width = 0.90  # the width of the bars: can also be len(x) sequence
p1 = plt.bar(ind, priorityCatF1Values, width)
plt.ylabel('Priority Label Prediction F1 (higher is better)')
plt.title('Priority Label Prediction F1 Per Information Type')
plt.xticks(ind, categoryLabels, rotation='vertical')
plt.yticks(np.arange(0, 1, 0.1))
plt.show()
resultLine = None
# Print the evaluation table row in latex
print("Run & NDCG & CF1-H & CF1-A & CAcc & PErr-H & PErr-A & PCorr-H & PCorr-A \\\\")
resultLine = (str.format('{0:.4f}', system_ndcg_micro) +
              " & " +
              str.format('{0:.4f}', avgF1High/numHighInformationTypes) +
              " & " +
              str.format('{0:.4f}', avgF1/numInformationTypes) +
              " & " +
              str.format('{0:.4f}', avgAccuracy/numInformationTypes) +
              " & " +
              str.format('{0:.4f}', priorityAvgf1High/numHighInformationTypes) +
              " & " +
              str.format('{0:.4f}', priorityAvgf1/len(informationTypes2Index)) +
              " & " +
              str.format('{0:.4f}', priorityAvgCorrHigh/numHighInformationTypes) +
              " & " +
              str.format('{0:.4f}', priorityAvgCorr/len(informationTypes2Index)) +
              " \\\\")
print(runName+" & "+resultLine)
resultsFile.write("--------------------------------------------------"+"\n")
resultsFile.write("LATEX"+"\n")
resultsFile.write("--------------------------------------------------"+"\n")
resultsFile.write(runName+" & "+resultLine + "\n")
Run & NDCG & CF1-H & CF1-A & CAcc & PErr-H & PErr-A & PCorr-H & PCorr-A \\
ens & 0.4617 & 0.2670 & 0.2923 & 0.8685 & 0.2817 & 0.2623 & 0.2886 & 0.3182 \\
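# If the row above is to be dropped into a paper, it can be wrapped in a small
# tabular environment; an illustrative sketch (not part of the official
# evaluation output), reusing runName and resultLine from the cell above:
print("\\begin{tabular}{l" + "r" * 8 + "}")
print("Run & NDCG & CF1-H & CF1-A & CAcc & PErr-H & PErr-A & PCorr-H & PCorr-A \\\\")
print("\\hline")
print(runName + " & " + resultLine)
print("\\end{tabular}")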
# Done
resultsFile.close()
perTopicFile.close()
perEventFile.close()
# header = [
# "Run",
# "date",
# "team",
# "description",
# "paper",
# "code",
# "nDCG@100",
# "Info-Type F1 [Actionable]",
# "Info-Type F1 [All]",
# "Info-Type Accuracy",
# "Priority F1 [Actionable]",
# "Priority F1 [All]",
# "Priority R [Actionable]",
# "Priority R [All]",
# ]
import csv
import json
if os.path.isfile("metadata.json"):
    this_cwd = os.getcwd()
    # Submission date is encoded as YYYYMMDD in the submissions/ directory name
    sub_date_ = this_cwd.partition("submissions/")[-1].partition("-")[0]
    sub_date = "%s/%s/%s" % (sub_date_[:4], sub_date_[4:6], sub_date_[6:])
    leaderboard_entry = None
    with open("metadata.json", "r") as in_file:
        metadata = json.load(in_file)
        # Columns follow the commented-out header list above
        leaderboard_entry = [
            runName,
            sub_date,
            metadata["organization"].lower(),
            metadata["model_description"],
            metadata["paper"] if metadata["paper"].startswith("http") else "",
            metadata["code"] if metadata["code"].startswith("http") else "",
            str.format('{0:.4f}', system_ndcg_micro),
            str.format('{0:.4f}', avgF1High/numHighInformationTypes),
            str.format('{0:.4f}', avgF1/numInformationTypes),
            str.format('{0:.4f}', avgAccuracy/numInformationTypes),
            str.format('{0:.4f}', priorityAvgf1High/numHighInformationTypes),
            str.format('{0:.4f}', priorityAvgf1/len(informationTypes2Index)),
            str.format('{0:.4f}', priorityAvgCorrHigh/numHighInformationTypes),
            str.format('{0:.4f}', priorityAvgCorr/len(informationTypes2Index)),
        ]
    with open(runName+".v"+str(version)+"."+edition+".leaderboard.csv", "w") as csvResultsFile:
        leader_writer = csv.writer(csvResultsFile)
        leader_writer.writerow(leaderboard_entry)
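# For reference, the leaderboard cell above reads only four fields from
# metadata.json. An invented example of the shape it expects (placeholder
# values, not a real submission):
example_metadata = {
    "organization": "ExampleTeam",
    "model_description": "Example description of the submitted run",
    "paper": "https://example.org/paper.pdf",
    "code": "https://github.com/example/repo",
}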