Permalink
Cannot retrieve contributors at this time
Name already in use
A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
AligNarr/compareScriptSummary.py
Go to fileThis commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
169 lines (134 sloc)
4.57 KB
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import extractScriptEntities | |
import extractSummaryEntities | |
import spacy | |
import gensim | |
from nltk import ngrams | |
import processAlignment | |
#from nltk.sentiment.vader import SentimentIntensityAnalyzer | |
# Tuning knobs for the alignment below.
# similarity_threshold: minimum word2vec similarity for a summary word and a
#   scene word to count as a match (used by countWords).
# word_inclusion_threshold / entity_inclusion_threshold: minimum fraction of a
#   summary sentence's words / n-grams that must be matched before the
#   sentence is considered covered (used with checkInclusion).
similarity_threshold = 0.54
word_inclusion_threshold = 1.0
entity_inclusion_threshold = 0.43

# Result of the alignment loop: summary sentence index -> set of scene indices.
summarySceneMap = {}

# Pre-trained word vectors; handed to both extractors and used by countWords.
model = gensim.models.KeyedVectors.load_word2vec_format(
    'GoogleNews-vectors-negative300.bin', binary=True)

# Parsed inputs: the scenes of the script and the sentences of the summary.
scenes = extractScriptEntities.extractScriptEntities('script.xml', model)
summarySentences = extractSummaryEntities.extractSummaryEntities('wikiplot.txt', model)
def removeOne(lemma, dictionary):
    """Decrement the count stored for *lemma* in *dictionary*, in place.

    A missing lemma is ignored. When the stored count is exactly 1 the key is
    removed altogether; otherwise the count is lowered by one.
    """
    if lemma in dictionary:
        if dictionary[lemma] == 1:
            # Last occurrence: drop the key instead of leaving a zero behind.
            del dictionary[lemma]
        else:
            dictionary[lemma] -= 1
def getLength(collection):
    """Return ``len(collection)``, treating ``None`` as an empty collection."""
    return 0 if collection is None else len(collection)
def countEntities(sceneEntities, summaryEntities):
    """Count scene entities that also occur in the summary entities.

    Every matched entity is removed from *summaryEntities* in place, so the
    same summary entity is never counted twice, neither within this call nor
    by a later call that receives the same collection.

    Args:
        sceneEntities: iterable of entities found in a scene, or ``None``.
        summaryEntities: mutable collection supporting ``in`` and
            ``.remove()`` (e.g. a list or a set), or ``None``.

    Returns:
        The number of matches; 0 when either argument is ``None``.
    """
    if sceneEntities is None or summaryEntities is None:
        # The original returned the not-yet-assigned local `matched` here,
        # which raised UnboundLocalError instead of reporting zero matches.
        return 0

    matched = 0
    for sceneEntity in sceneEntities:
        if sceneEntity in summaryEntities:
            summaryEntities.remove(sceneEntity)
            matched += 1
    return matched
def countWords(sceneBOW, summarySentenceBOW):
    """Count summary words that have a sufficiently similar word in the scene.

    A summary word counts as matched as soon as one scene word reaches
    ``similarity_threshold`` under the module-level ``model``. Matched words
    are removed from *summarySentenceBOW* in place (``-=`` on a set mutates
    the caller's object), so they are not counted again by a later call that
    receives the same set.

    Args:
        sceneBOW: iterable of scene words, or ``None``.
        summarySentenceBOW: set of summary-sentence words, or ``None``.

    Returns:
        The number of summary words matched; 0 when either argument is
        ``None``.
    """
    if sceneBOW is None or summarySentenceBOW is None:
        return 0

    matched = 0
    matched_set = set()
    for summaryWord in summarySentenceBOW:
        for sceneWord in sceneBOW:
            try:
                if model.similarity(summaryWord, sceneWord) >= similarity_threshold:
                    matched += 1
                    matched_set.add(summaryWord)
                    break
            except Exception:
                # Best effort: a failing pair (for instance a word the model
                # does not know) is reported and skipped. Unlike a bare
                # `except:`, this lets KeyboardInterrupt/SystemExit through.
                print("Exception: ", summaryWord, ", ", sceneWord)

    # Removal is deferred until after the loop because a set must not change
    # size while it is being iterated.
    summarySentenceBOW -= matched_set
    return matched
def checkInclusion(matched, total, threshold):
    """Tell whether the matched fraction ``matched / total`` reaches *threshold*.

    A *total* of zero means there was nothing to match, which is treated as
    satisfied (and avoids dividing by zero).
    """
    if total == 0:
        return True
    ratio = matched / total
    return bool(ratio >= threshold)
def getF1Score(groundTruthMap, summarySceneMap):
    """Print and return precision, recall and F1 of a generated alignment.

    Both arguments map a key to a collection of aligned items. An item present
    under the same key in both maps is a true positive; one present only in
    *groundTruthMap* is a false negative; one present only in
    *summarySceneMap* is a false positive.
    NOTE(review): the parameter names used in the original loops suggest keys
    are scene numbers, while the caller in this file builds *summarySceneMap*
    keyed by summary sentence index - confirm both maps share the same keying.

    Args:
        groundTruthMap: reference alignment (mapping key -> collection).
        summarySceneMap: generated alignment (mapping key -> collection).

    Returns:
        A ``(precision, recall, F1)`` tuple. A metric whose denominator is
        zero is reported as 0, and F1 is 0 when precision or recall is 0.
        (The original only printed these values; returning them as well lets
        callers use the scores.)
    """
    print("Ground Truth: ", groundTruthMap, '\n', "Generated: ", summarySceneMap)

    truePositives = 0
    falsePositives = 0
    falseNegatives = 0

    # Pass 1: walk the reference alignment to find hits and misses.
    for key, expected in groundTruthMap.items():
        if key not in summarySceneMap:
            falseNegatives += len(expected)
            continue
        predicted = summarySceneMap[key]
        for item in expected:
            if item in predicted:
                truePositives += 1
            else:
                falseNegatives += 1

    # Pass 2: walk the generated alignment to find spurious entries.
    for key, predicted in summarySceneMap.items():
        if key not in groundTruthMap:
            falsePositives += len(predicted)
            continue
        expected = groundTruthMap[key]
        for item in predicted:
            if item not in expected:
                falsePositives += 1

    predictedTotal = truePositives + falsePositives
    precision = truePositives / predictedTotal if predictedTotal != 0 else 0

    expectedTotal = truePositives + falseNegatives
    recall = truePositives / expectedTotal if expectedTotal != 0 else 0

    if precision == 0 or recall == 0:
        F1 = 0
    else:
        F1 = 2 * precision * recall / (precision + recall)

    print("Precision: ", precision, "Recall: ", recall, "F1: ", F1)
    return precision, recall, F1
# Greedy, order-preserving alignment: for each summary sentence, consume
# consecutive scenes until enough of the sentence's words and n-grams have
# been matched, then record the consumed scene range for that sentence.
sceneID = 0
# Initialised up front so the wrap-around below cannot read an unbound name
# when `scenes` is empty (the original raised NameError in that case).
startSceneID = 0
for summarySentenceID, summarySentence in enumerate(summarySentences):
    summarySentenceBOW = summarySentence.bagOfWords
    summaryNgrams = summarySentence.Ngrams
    # Totals are taken before matching because countEntities/countWords
    # remove matched items from these collections as they go.
    total_Ngrams = getLength(summaryNgrams)
    total_summaryBOW = getLength(summarySentenceBOW)
    matched_summaryWords = 0
    matched_Ngrams = 0

    if sceneID >= len(scenes):
        # The previous sentence ran off the end of the script without being
        # covered; restart from where that sentence began.
        sceneID = startSceneID
    startSceneID = sceneID

    while sceneID < len(scenes):
        matched_Ngrams += countEntities(scenes[sceneID].Ngrams, summaryNgrams)
        matched_summaryWords += countWords(scenes[sceneID].bagOfWords, summarySentenceBOW)
        sceneID += 1
        if checkInclusion(matched_summaryWords, total_summaryBOW, word_inclusion_threshold) \
                and checkInclusion(matched_Ngrams, total_Ngrams, entity_inclusion_threshold):
            summarySceneMap[summarySentenceID] = set(range(startSceneID, sceneID))
            # Step back one scene so the next sentence starts at the last
            # scene consumed here.
            sceneID -= 1
            break

# Score the generated alignment against the reference alignment.
groundTruthMap = processAlignment.processAlignment()
getF1Score(groundTruthMap, summarySceneMap)
""" | |
summarySentences[0].output() | |
print("\n\n") | |
scenes[0].output() | |
nlp = spacy.load('en') | |
sid = SentimentIntensityAnalyzer() | |
summarySentenceNum = 0 | |
for summarySentence in summarySentences: | |
sentence = summarySentence.sentence | |
scriptSentenceNum = 0 | |
for scene in scenes: | |
sceneSentences = scene.sentences | |
for sceneSentence in sceneSentences: | |
similarity = nlp(sentence).similarity(nlp(sceneSentence)) | |
if similarity >= 0.8: | |
print (sentence, sid.polarity_scores(sentence), sceneSentence, sid.polarity_scores(sceneSentence), '\n', similarity, '\n') | |
#print (summarySentenceNum, scriptSentenceNum, '\n') | |
scriptSentenceNum = scriptSentenceNum + 1 | |
summarySentenceNum = summarySentenceNum + 1 | |
""" |