Skip to content
Permalink
master
Switch branches/tags

Name already in use

A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
Go to file
 
 
Cannot retrieve contributors at this time
import processAlignment
import ScriptProcessing
import spacy
import numpy
import math
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
# English stop words from NLTK, kept in a set for the membership test in
# getWordsFromSentence.
stopWords = set(stopwords.words('english'))
# spaCy pipeline shared by every function in this file that needs sentence or
# word vectors, or document similarity.
nlp = spacy.load('en')
#similarity_threshold = 0.7
# Only referenced from disabled (string-quoted) code further down this file.
dist_threshold = 6
# Opened in 'w' mode at import time, which truncates any existing output.txt.
# Only referenced from disabled code further down; never closed in this file.
output = open('output.txt', 'w')
def getSceneSentences(scenes):
    """Flatten scene texts into a single list of their non-empty lines.

    Args:
        scenes: iterable of scene strings; each one is split on newline
            characters.

    Returns:
        list of str: every line that is not the empty string, in the order
        encountered. Lines are kept verbatim (no stripping), so a line made
        only of spaces is still returned.
    """
    return [
        line
        for sceneText in scenes
        for line in sceneText.split('\n')
        if line != ''
    ]
def getAverage(sentences):
    """Return the element-wise mean of the spaCy vectors of `sentences`.

    Each sentence is run through the module-level `nlp` pipeline exactly
    once and its `.vector` is added into a numpy accumulator created with
    `numpy.zeros`.

    Args:
        sentences: non-empty sequence of sentence strings.

    Returns:
        numpy.ndarray: the summed vectors divided by the number of sentences.

    Raises:
        IndexError: if `sentences` is empty.
    """
    # Parse every sentence a single time; the vector length is taken from the
    # first result rather than by parsing the first sentence a second time.
    vectors = [nlp(sentence).vector for sentence in sentences]
    average = numpy.zeros(len(vectors[0]))
    for vector in vectors:
        average += vector
    average /= len(vectors)
    return average
def dist(v1, v2):
    """Return the Euclidean distance between two vectors.

    Args:
        v1: vector supporting element-wise subtraction (e.g. a numpy array).
        v2: vector of the same length as `v1`.

    Returns:
        float: square root of the sum of squared component differences;
        0.0 for zero-length vectors.
    """
    diff = v1 - v2
    # The builtin sum adds the squares left to right starting from 0, the
    # same order as an explicit accumulation loop, without shadowing `sum`.
    return math.sqrt(sum(component * component for component in diff))
def readSummary(path='wikiplot.txt'):
    """Read the summary file and return its non-blank lines.

    Args:
        path: file to read. Defaults to 'wikiplot.txt', resolved against the
            current working directory.

    Returns:
        list of str: the file's lines in order, each keeping its trailing
        newline when it has one. Lines that are empty or consist solely of
        a newline character are skipped; lines containing only other
        whitespace are kept.
    """
    # The context manager closes the handle even if reading raises.
    with open(path, 'r') as summaryFile:
        return [
            line for line in summaryFile
            if line != '' and line != '\n'
        ]
def rankSceneSentences(scenes):
    """Rank sentences by how close their vector is to the group average.

    Args:
        scenes: non-empty sequence of sentence strings; it is passed to
            getAverage and every sentence is also vectorised with `nlp`.

    Returns:
        tuple (rank, average): `rank` is a list of [distance, sentence]
        pairs sorted in ascending order (equal distances fall back to
        comparing the sentence text); `average` is the vector returned by
        getAverage.
    """
    average = getAverage(scenes)
    rank = sorted(
        [dist(average, nlp(sentence).vector), sentence]
        for sentence in scenes
    )
    return (rank, average)
"""
def summarizeScene(scenes):
average = getAverage(scenes)
minDist = 1000000000
summarySentence = ""
for sentence in scenes:
sceneSentence = nlp(sentence)
d = dist(average, sceneSentence.vector) #numpy.linalg.norm(average, sceneSentence.vector)
if d < minDist:
minDist = d
summarySentence = sentence
return (summarySentence, average)
"""
def rankScenesBySummarySentence(averages, summarySentence):
    """Order scene indices by distance from a summary sentence.

    Args:
        averages: sequence of per-scene vectors.
        summarySentence: text that is vectorised with `nlp`.

    Returns:
        list of (distance, index) tuples sorted in ascending order, where
        `index` is the position of the scene vector in `averages`.

    Side effects:
        Prints each index as it is processed.
    """
    summaryDoc = nlp(summarySentence)
    ranking = []
    for sceneIndex, sceneAverage in enumerate(averages):
        print(sceneIndex)
        distance = dist(sceneAverage, summaryDoc.vector)
        ranking.append((distance, sceneIndex))
    return sorted(ranking)
"""
def rankSceneSentencesBySummary(scenes, summary):
rank = [[nlp(sentence).similarity(nlp(summary)), sentence] for sentence in scenes]
rank.sort()
return rank
"""
"""
def alignSummarySentence(summarization, summarySentence):
alignment = []
for i in range(len(summarization)):
#summarizationSentence = summarization[i][0]
average = summarization[i][1]
#if nlp(summarySentence).similarity(nlp(summarizationSentence)) > inclusion_threshold:
d = dist(average, nlp(summarySentence).vector)
alignment.append([d, i])
#if d < dist_threshold:
#alignment.append(i)
alignment.sort()
return alignment
"""
def getWordsFromSentence(sentence):
    """Tokenise `sentence` and keep only its alphabetic non-stop-word tokens.

    Tokens come from NLTK's word_tokenize. A token is kept when it is purely
    alphabetic and is not a member of the module-level `stopWords` set; the
    membership test is done on the token as-is, without case folding.

    Returns:
        list of str: the kept tokens in their original order.
    """
    return [
        token
        for token in word_tokenize(sentence)
        if token.isalpha() and token not in stopWords
    ]
def getSceneSentenceWords(scenes):
    """Turn scene texts into a list of per-line word lists.

    Every scene is split on newline characters and each resulting line is
    reduced to its words with getWordsFromSentence. Lines that yield no
    words are left out.

    Returns:
        list of list of str: one non-empty word list per retained line, in
        the order the lines were encountered.
    """
    wordLists = (
        getWordsFromSentence(line)
        for sceneText in scenes
        for line in sceneText.split('\n')
    )
    return [words for words in wordLists if len(words) > 0]
def getAverageFromWords(sentences):
    """Return the mean spaCy vector over every word in `sentences`.

    Args:
        sentences: sequence of word lists (one list of word strings per
            sentence) containing at least one word overall.

    Returns:
        numpy.ndarray: the sum of all word vectors divided by the total
        number of words.

    Raises:
        IndexError: if `sentences` contains no words at all.
    """
    # Each word is parsed once; the vector length is read from the first
    # result instead of parsing the first word a second time.
    vectors = [
        nlp(word).vector
        for sentence in sentences
        for word in sentence
    ]
    average = numpy.zeros(len(vectors[0]))
    for vector in vectors:
        average += vector
    # One vector is added per word, so the divisor must be the word count;
    # dividing by the sentence count would scale the result by the average
    # sentence length instead of producing a mean.
    average /= len(vectors)
    return average
def rankWords(sentences):
    """Rank every word by its distance from the overall word average.

    Args:
        sentences: sequence of word lists; it is passed to
            getAverageFromWords and every word is vectorised with `nlp`.

    Returns:
        tuple (rank, average): `rank` is a list of [distance, word] pairs
        sorted in ascending order, with one entry per word occurrence;
        `average` is the vector returned by getAverageFromWords.
    """
    average = getAverageFromWords(sentences)
    rank = [
        [dist(average, nlp(word).vector), word]
        for sentence in sentences
        for word in sentence
    ]
    rank.sort()
    return (rank, average)
def rankSummaryWords(average, summarySentence):
    """Rank the words of a summary sentence by distance from `average`.

    Args:
        average: reference vector each word vector is compared against.
        summarySentence: text reduced to words with getWordsFromSentence.

    Returns:
        list of [distance, word] pairs sorted in ascending order.
    """
    summaryWords = getWordsFromSentence(summarySentence)
    return sorted(
        [dist(average, nlp(word).vector), word]
        for word in summaryWords
    )
def averageSceneSummaryPerSentence(sentences, summary):
    """Return the mean spaCy similarity between `summary` and each sentence.

    Args:
        sentences: list of sentence strings belonging to one scene.
        summary: summary text every sentence is compared against.

    Returns:
        float: the average of `nlp(summary).similarity(nlp(sentence))` over
        all sentences, or 0.0 when `sentences` is empty.
    """
    # A scene without any sentences has nothing to compare; report 0.0
    # rather than dividing by zero.
    if not sentences:
        return 0.0
    # The summary does not change between iterations, so parse it once.
    summaryDoc = nlp(summary)
    total = sum(summaryDoc.similarity(nlp(sentence)) for sentence in sentences)
    return total / len(sentences)
# Build the alignment via processAlignment and show it.
m = processAlignment.processAlignment()
print(m)

# Load the scenes from the script and the summary lines from disk.
SP = ScriptProcessing.ScriptProcessing()
scenes = SP.extractScenes()
summary = readSummary()
#sentences = getSceneSentences([scenes[6]])

# For every scene, print its index and the average similarity between its
# sentences and the second summary line (summary[1]).
for i in range(len(scenes)):
    sentences = getSceneSentences([scenes[i]])
    print(i, averageSceneSummaryPerSentence(sentences, summary[1]))
"""
#sentences = getSceneSentenceWords([scenes[6]])
ss = getSceneSentences([scenes[6]])
ss2 = getSceneSentences([scenes[12]])
#(rank, average) = rankWords(sentences)
#SceneAverage = getAverageFromWords(sentences)
SceneAverageWSW = getAverage(ss)
ss2A = getAverage(ss2)
summaryAverage = getAverage(summary)
#summaryAverage = getAverageFromWords([getWordsFromSentence(summary[4])])
summaryAverageWSW = getAverage([summary[4]])
#print(dist(SceneAverage, summaryAverage))
print(dist(SceneAverageWSW, summaryAverageWSW))
print(dist(ss2A, SceneAverageWSW))
print(dist(summaryAverage, SceneAverageWSW))
#print("Scene Words Rank:")
#print(rank)
#print("Summary Words Rank:")
#print(rankSummaryWords(average, summary[1]))
#print(rank)
#average = getAverageFromWords(sentences)
#for summarySentenceNumber in m:
#sentences = []
#for scene in scenes:
# sentences.append(getSceneSentences([scene]))
summarization = []
i = 0
"""
"""
averages = []
for i in range(len(sentences)):
averages.append(getAverage(sentences[i]))
print("Averages Done!")
rank = rankScenesBySummarySentence(averages, summary[0])
print(rank)
"""
"""
for scene in sentences:
(summarySentence, average) = summarizeScene(scene)
summarization.append([summarySentence, average])
output.write('\n')
for i in range(len(summary)):
output.write(i,': ', alignSummarySentence(summarization, summary[i]), '\n\n')
"""
#sentences = getSceneSentences([scenes[i] for i in m[summarySentenceNumber]])
#summarySentence = nlp(summary[summarySentenceNumber])
#(rank, average) = rankSceneSentences(sentences)
#rankBySummary = rankSceneSentencesBySummary(sentences, summary[summarySentenceNumber])
#print(rankBySummary)
#summarizationSentence = nlp(rank[0][1])
#print(summarySentence)
#print(dist(summarySentence.vector, average))
#print(summarizationSentence)
#print(dist(summarizationSentence.vector, average))
#print(summarySentence.similarity(summarizationSentence))
#print('\n\n')