# compareScriptSummary.py

import extractScriptEntities
import extractSummaryEntities
import spacy
import gensim
from nltk import ngrams
import processAlignment

#from nltk.sentiment.vader import SentimentIntensityAnalyzer

# Alignment produced by this script: summary sentence index -> set of scene indices.
summarySceneMap = {}

# Word vectors in word2vec binary format; handed to both extractors and
# queried by countWords() through model.similarity().
model = gensim.models.KeyedVectors.load_word2vec_format(
	'GoogleNews-vectors-negative300.bin',
	binary=True,
)

scenes = extractScriptEntities.extractScriptEntities('script.xml', model)
summarySentences = extractSummaryEntities.extractSummaryEntities('wikiplot.txt', model)

# Minimum model.similarity() for a summary word to count as matched by a scene word.
similarity_threshold = 0.54
# Fraction of a summary sentence's words that must be matched (1.0 = all of them).
word_inclusion_threshold = 1.0
# Fraction of a summary sentence's n-grams that must be matched.
entity_inclusion_threshold = 0.43


def removeOne(lemma, dictionary):
	"""Decrement the count stored for ``lemma`` in ``dictionary``, in place.

	A lemma whose count is exactly 1 is deleted instead of being left at 0.
	A lemma that is not present is ignored. Returns None.
	"""
	if lemma in dictionary:
		if dictionary[lemma] != 1:
			dictionary[lemma] -= 1
		else:
			del dictionary[lemma]

def getLength(collection):
	"""Return ``len(collection)``, treating ``None`` as an empty collection."""
	return 0 if collection is None else len(collection)

def countEntities(sceneEntities, summaryEntities):
	"""Count the scene entities that also occur in the summary entities.

	Every matched entity is removed from ``summaryEntities`` in place, so the
	same summary entity is not counted again on a later call.

	Args:
		sceneEntities: iterable of entities found in a scene, or None.
		summaryEntities: mutable collection supporting ``in`` and
			``remove()``, or None. Modified in place.

	Returns:
		The number of matches; 0 when either argument is None.
	"""
	# Return a literal 0 here: the counter below is not assigned yet, and
	# returning it would raise UnboundLocalError for None input.
	if sceneEntities is None or summaryEntities is None:
		return 0
	matched = 0
	for sceneEntity in sceneEntities:
		if sceneEntity in summaryEntities:
			summaryEntities.remove(sceneEntity)
			matched += 1
	return matched


def countWords(sceneBOW, summarySentenceBOW):
	"""Count the summary words that have a similar-enough word in the scene.

	A summary word is matched when ``model.similarity`` between it and at
	least one scene word is >= ``similarity_threshold`` (both are
	module-level names). Matched words are subtracted from
	``summarySentenceBOW`` with ``-=``; for a set this updates the caller's
	object, so matched words are not counted again on a later call.

	Args:
		sceneBOW: iterable of scene words, or None.
		summarySentenceBOW: summary words, or None. Expected to be a set,
			since a set is subtracted from it with ``-=``.

	Returns:
		The number of matched summary words; 0 when either argument is None.
	"""
	if sceneBOW is None or summarySentenceBOW is None:
		return 0
	matched = 0
	matched_set = set()
	for summaryWord in summarySentenceBOW:
		for sceneWord in sceneBOW:
			try:
				similar = model.similarity(summaryWord, sceneWord) >= similarity_threshold
			except Exception:
				# Best effort: a pair the model cannot score is reported and
				# skipped. ``Exception`` (not a bare ``except``) lets
				# KeyboardInterrupt and SystemExit propagate.
				print("Exception: ", summaryWord, ", ", sceneWord)
				continue
			if similar:
				matched += 1
				matched_set.add(summaryWord)
				# One similar scene word is enough for this summary word.
				break
	summarySentenceBOW -= matched_set
	return matched
def checkInclusion(matched, total, threshold):
	"""Return True when the fraction ``matched / total`` reaches ``threshold``.

	A ``total`` of 0 counts as included (True), which also avoids a division
	by zero.
	"""
	if total == 0:
		return True
	ratio = matched / total
	return True if ratio >= threshold else False

def getF1Score(groundTruthMap, summarySceneMap):
	"""Print and return the precision/recall/F1 of a generated alignment.

	Both arguments map a key to a collection of values; every (key, value)
	pair is treated as one alignment link. A link present in both maps is a
	true positive, one found only in ``groundTruthMap`` is a false negative
	and one found only in ``summarySceneMap`` is a false positive.

	Args:
		groundTruthMap: reference alignment.
		summarySceneMap: generated alignment to evaluate.

	Returns:
		The F1 score, or 0 when precision or recall is 0. Both maps and the
		three scores are also printed.
	"""
	print("Ground Truth: ", groundTruthMap, '\n', "Generated: ", summarySceneMap)

	truePositives = 0
	falseNegatives = 0
	for key in groundTruthMap:
		expected = groundTruthMap[key]
		if key not in summarySceneMap:
			# Nothing was generated for this key: every expected link is missed.
			falseNegatives += len(expected)
			continue
		generated = summarySceneMap[key]
		for value in expected:
			if value in generated:
				truePositives += 1
			else:
				falseNegatives += 1

	falsePositives = 0
	for key in summarySceneMap:
		generated = summarySceneMap[key]
		if key not in groundTruthMap:
			# The key is absent from the reference: every generated link is spurious.
			falsePositives += len(generated)
			continue
		expected = groundTruthMap[key]
		falsePositives += sum(1 for value in generated if value not in expected)

	# Guard each ratio against an empty denominator.
	predicted = truePositives + falsePositives
	actual = truePositives + falseNegatives
	precision = truePositives / predicted if predicted != 0 else 0
	recall = truePositives / actual if actual != 0 else 0
	if precision == 0 or recall == 0:
		F1 = 0
	else:
		F1 = 2 * precision * recall / (precision + recall)
	print("Precision: ", precision, "Recall: ", recall, "F1: ", F1)
	return F1

# Greedy, order-preserving alignment: for each summary sentence, accumulate
# consecutive scenes (starting at the current scene cursor) until enough of the
# sentence's words and n-grams have been matched, then record that scene span.
sceneID = 0
# Initialised up front so the rewind below never reads an unassigned name,
# e.g. when ``scenes`` is empty and the first sentence already finds
# sceneID >= len(scenes).
startSceneID = 0
for summarySentenceID, summarySentence in enumerate(summarySentences):
	summarySentenceBOW = summarySentence.bagOfWords
	summaryNgrams = summarySentence.Ngrams

	# Totals are taken before matching because countWords/countEntities
	# remove matched items from these collections.
	total_Ngrams = getLength(summaryNgrams)
	total_summaryBOW = getLength(summarySentenceBOW)

	matched_summaryWords = 0
	matched_Ngrams = 0

	# The previous sentence ran past the last scene without a match:
	# rewind to the scene where that sentence started.
	if sceneID >= len(scenes):
		sceneID = startSceneID
	startSceneID = sceneID
	while sceneID < len(scenes):
		matched_Ngrams += countEntities(scenes[sceneID].Ngrams, summaryNgrams)
		matched_summaryWords += countWords(scenes[sceneID].bagOfWords, summarySentenceBOW)

		if (checkInclusion(matched_summaryWords, total_summaryBOW, word_inclusion_threshold)
				and checkInclusion(matched_Ngrams, total_Ngrams, entity_inclusion_threshold)):
			# Record scenes startSceneID..sceneID inclusive. The cursor stays on
			# the scene that completed the match, so the next sentence starts there.
			summarySceneMap[summarySentenceID] = set(range(startSceneID, sceneID + 1))
			break
		sceneID += 1

# Score the generated alignment against the reference alignment.
groundTruthMap = processAlignment.processAlignment()
getF1Score(groundTruthMap, summarySceneMap)


"""
summarySentences[0].output()

print("\n\n")
scenes[0].output()


nlp = spacy.load('en')
sid = SentimentIntensityAnalyzer()

summarySentenceNum = 0
for summarySentence in summarySentences:
	sentence = summarySentence.sentence
	scriptSentenceNum = 0
	for scene in scenes:
		sceneSentences = scene.sentences
		for sceneSentence in sceneSentences:
			similarity =  nlp(sentence).similarity(nlp(sceneSentence))
			if similarity >= 0.8:
				print (sentence, sid.polarity_scores(sentence), sceneSentence, sid.polarity_scores(sceneSentence), '\n', similarity, '\n')
				#print (summarySentenceNum, scriptSentenceNum, '\n')
			scriptSentenceNum = scriptSentenceNum + 1
	summarySentenceNum = summarySentenceNum + 1

"""
	import extractScriptEntities
	import extractSummaryEntities
	import spacy
	import gensim
	from nltk import ngrams
	import processAlignment

	#from nltk.sentiment.vader import SentimentIntensityAnalyzer

	# Alignment produced by this script: summary sentence index -> set of scene indices.
	summarySceneMap = {}

	# Word vectors in word2vec binary format; handed to both extractors and
	# queried by countWords() through model.similarity().
	model = gensim.models.KeyedVectors.load_word2vec_format(
		'GoogleNews-vectors-negative300.bin',
		binary=True,
	)

	scenes = extractScriptEntities.extractScriptEntities('script.xml', model)
	summarySentences = extractSummaryEntities.extractSummaryEntities('wikiplot.txt', model)

	# Minimum model.similarity() for a summary word to count as matched by a scene word.
	similarity_threshold = 0.54
	# Fraction of a summary sentence's words that must be matched (1.0 = all of them).
	word_inclusion_threshold = 1.0
	# Fraction of a summary sentence's n-grams that must be matched.
	entity_inclusion_threshold = 0.43


	def removeOne(lemma, dictionary):
		"""Decrement the count stored for ``lemma`` in ``dictionary``, in place.

		A lemma whose count is exactly 1 is deleted instead of being left at 0.
		A lemma that is not present is ignored. Returns None.
		"""
		if lemma in dictionary:
			if dictionary[lemma] != 1:
				dictionary[lemma] -= 1
			else:
				del dictionary[lemma]

	def getLength(collection):
		"""Return ``len(collection)``, treating ``None`` as an empty collection."""
		return 0 if collection is None else len(collection)

	def countEntities(sceneEntities, summaryEntities):
		"""Count the scene entities that also occur in the summary entities.

		Every matched entity is removed from ``summaryEntities`` in place, so
		the same summary entity is not counted again on a later call.

		Args:
			sceneEntities: iterable of entities found in a scene, or None.
			summaryEntities: mutable collection supporting ``in`` and
				``remove()``, or None. Modified in place.

		Returns:
			The number of matches; 0 when either argument is None.
		"""
		# Return a literal 0 here: the counter below is not assigned yet, and
		# returning it would raise UnboundLocalError for None input.
		if sceneEntities is None or summaryEntities is None:
			return 0
		matched = 0
		for sceneEntity in sceneEntities:
			if sceneEntity in summaryEntities:
				summaryEntities.remove(sceneEntity)
				matched += 1
		return matched


	def countWords(sceneBOW, summarySentenceBOW):
		"""Count the summary words that have a similar-enough word in the scene.

		A summary word is matched when ``model.similarity`` between it and at
		least one scene word is >= ``similarity_threshold`` (both are
		module-level names). Matched words are subtracted from
		``summarySentenceBOW`` with ``-=``; for a set this updates the caller's
		object, so matched words are not counted again on a later call.

		Args:
			sceneBOW: iterable of scene words, or None.
			summarySentenceBOW: summary words, or None. Expected to be a set,
				since a set is subtracted from it with ``-=``.

		Returns:
			The number of matched summary words; 0 when either argument is None.
		"""
		if sceneBOW is None or summarySentenceBOW is None:
			return 0
		matched = 0
		matched_set = set()
		for summaryWord in summarySentenceBOW:
			for sceneWord in sceneBOW:
				try:
					similar = model.similarity(summaryWord, sceneWord) >= similarity_threshold
				except Exception:
					# Best effort: a pair the model cannot score is reported and
					# skipped. ``Exception`` (not a bare ``except``) lets
					# KeyboardInterrupt and SystemExit propagate.
					print("Exception: ", summaryWord, ", ", sceneWord)
					continue
				if similar:
					matched += 1
					matched_set.add(summaryWord)
					# One similar scene word is enough for this summary word.
					break
		summarySentenceBOW -= matched_set
		return matched
	def checkInclusion(matched, total, threshold):
		"""Return True when the fraction ``matched / total`` reaches ``threshold``.

		A ``total`` of 0 counts as included (True), which also avoids a
		division by zero.
		"""
		if total == 0:
			return True
		ratio = matched / total
		return True if ratio >= threshold else False

	def getF1Score(groundTruthMap, summarySceneMap):
		"""Print and return the precision/recall/F1 of a generated alignment.

		Both arguments map a key to a collection of values; every (key, value)
		pair is treated as one alignment link. A link present in both maps is
		a true positive, one found only in ``groundTruthMap`` is a false
		negative and one found only in ``summarySceneMap`` is a false positive.

		Args:
			groundTruthMap: reference alignment.
			summarySceneMap: generated alignment to evaluate.

		Returns:
			The F1 score, or 0 when precision or recall is 0. Both maps and
			the three scores are also printed.
		"""
		print("Ground Truth: ", groundTruthMap, '\n', "Generated: ", summarySceneMap)

		truePositives = 0
		falseNegatives = 0
		for key in groundTruthMap:
			expected = groundTruthMap[key]
			if key not in summarySceneMap:
				# Nothing was generated for this key: every expected link is missed.
				falseNegatives += len(expected)
				continue
			generated = summarySceneMap[key]
			for value in expected:
				if value in generated:
					truePositives += 1
				else:
					falseNegatives += 1

		falsePositives = 0
		for key in summarySceneMap:
			generated = summarySceneMap[key]
			if key not in groundTruthMap:
				# The key is absent from the reference: every generated link is spurious.
				falsePositives += len(generated)
				continue
			expected = groundTruthMap[key]
			falsePositives += sum(1 for value in generated if value not in expected)

		# Guard each ratio against an empty denominator.
		predicted = truePositives + falsePositives
		actual = truePositives + falseNegatives
		precision = truePositives / predicted if predicted != 0 else 0
		recall = truePositives / actual if actual != 0 else 0
		if precision == 0 or recall == 0:
			F1 = 0
		else:
			F1 = 2 * precision * recall / (precision + recall)
		print("Precision: ", precision, "Recall: ", recall, "F1: ", F1)
		return F1

	# Greedy, order-preserving alignment: for each summary sentence, accumulate
	# consecutive scenes (starting at the current scene cursor) until enough of
	# the sentence's words and n-grams have been matched, then record that span.
	sceneID = 0
	# Initialised up front so the rewind below never reads an unassigned name,
	# e.g. when ``scenes`` is empty and the first sentence already finds
	# sceneID >= len(scenes).
	startSceneID = 0
	for summarySentenceID, summarySentence in enumerate(summarySentences):
		summarySentenceBOW = summarySentence.bagOfWords
		summaryNgrams = summarySentence.Ngrams

		# Totals are taken before matching because countWords/countEntities
		# remove matched items from these collections.
		total_Ngrams = getLength(summaryNgrams)
		total_summaryBOW = getLength(summarySentenceBOW)

		matched_summaryWords = 0
		matched_Ngrams = 0

		# The previous sentence ran past the last scene without a match:
		# rewind to the scene where that sentence started.
		if sceneID >= len(scenes):
			sceneID = startSceneID
		startSceneID = sceneID
		while sceneID < len(scenes):
			matched_Ngrams += countEntities(scenes[sceneID].Ngrams, summaryNgrams)
			matched_summaryWords += countWords(scenes[sceneID].bagOfWords, summarySentenceBOW)

			if (checkInclusion(matched_summaryWords, total_summaryBOW, word_inclusion_threshold)
					and checkInclusion(matched_Ngrams, total_Ngrams, entity_inclusion_threshold)):
				# Record scenes startSceneID..sceneID inclusive. The cursor stays
				# on the scene that completed the match, so the next sentence
				# starts there.
				summarySceneMap[summarySentenceID] = set(range(startSceneID, sceneID + 1))
				break
			sceneID += 1

	# Score the generated alignment against the reference alignment.
	groundTruthMap = processAlignment.processAlignment()
	getF1Score(groundTruthMap, summarySceneMap)




	"""
	summarySentences[0].output()

	print("\n\n")
	scenes[0].output()


	nlp = spacy.load('en')
	sid = SentimentIntensityAnalyzer()

	summarySentenceNum = 0
	for summarySentence in summarySentences:
	sentence = summarySentence.sentence
	scriptSentenceNum = 0
	for scene in scenes:
	sceneSentences = scene.sentences
	for sceneSentence in sceneSentences:
	similarity = nlp(sentence).similarity(nlp(sceneSentence))
	if similarity >= 0.8:
	print (sentence, sid.polarity_scores(sentence), sceneSentence, sid.polarity_scores(sceneSentence), '\n', similarity, '\n')
	#print (summarySentenceNum, scriptSentenceNum, '\n')
	scriptSentenceNum = scriptSentenceNum + 1
	summarySentenceNum = summarySentenceNum + 1

	"""