# AligNarr/ExtractiveSummary.py
# Extractive-summarization experiments: scores screenplay scene sentences
# against plot-summary sentences using spaCy sentence/word vectors.
import math

import numpy
import spacy
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize

import processAlignment
import ScriptProcessing
# Shared module-level resources.
stopWords = set(stopwords.words('english'))  # English stopwords used to filter tokens
nlp = spacy.load('en')  # spaCy English model; provides .vector and .similarity
#similarity_threshold = 0.7
dist_threshold = 6  # distance cutoff for alignment — unused in the visible code
output = open('output.txt', 'w')  # NOTE(review): handle is never closed and only written in disabled code below — confirm
def getSceneSentences(scenes):
    """Flatten a list of scene texts into a list of their non-empty lines."""
    return [line
            for scene in scenes
            for line in scene.split('\n')
            if line != '']
def getAverage(sentences):
    """Return the mean spaCy document vector over *sentences*.

    sentences -- non-empty list of sentence strings; the first one fixes
    the vector dimensionality.
    """
    total = numpy.zeros(len(nlp(sentences[0]).vector))
    for text in sentences:
        total += nlp(text).vector
    return total / len(sentences)
def dist(v1, v2):
    """Return the Euclidean (L2) distance between two equal-length vectors.

    Accepts numpy arrays (what the rest of this module passes) and, as a
    backward-compatible generalization, any sequence numpy.asarray accepts.
    """
    diff = numpy.asarray(v1, dtype=float) - numpy.asarray(v2, dtype=float)
    # numpy.dot(diff, diff) is the sum of squares at C speed; this also
    # drops the original's shadowing of the builtin `sum` and its slow
    # per-element Python loop.
    return math.sqrt(numpy.dot(diff, diff))
def readSummary():
    """Read 'wikiplot.txt' and return its non-blank lines.

    Lines keep their trailing newline characters, matching the original
    readlines() behaviour.  Fix: the file handle is now closed via a
    context manager (the original never closed it).
    """
    with open('wikiplot.txt', 'r') as summaryFile:
        return [line for line in summaryFile
                if line != '' and line != '\n']
def rankSceneSentences(scenes):
    """Rank a scene's sentences by distance to the scene's mean vector.

    scenes -- list of sentence strings
    Returns (rank, average): rank is a list of [distance, sentence] pairs
    sorted ascending (most "central" sentence first); average is the mean
    sentence vector from getAverage().

    Cleanup: removed the unused local `summarySentence` and a large
    commented-out duplicate of the same loop.
    """
    average = getAverage(scenes)
    rank = [[dist(average, nlp(sentence).vector), sentence]
            for sentence in scenes]
    rank.sort()
    return (rank, average)
""" | |
def summarizeScene(scenes): | |
average = getAverage(scenes) | |
minDist = 1000000000 | |
summarySentence = "" | |
for sentence in scenes: | |
sceneSentence = nlp(sentence) | |
d = dist(average, sceneSentence.vector) #numpy.linalg.norm(average, sceneSentence.vector) | |
if d < minDist: | |
minDist = d | |
summarySentence = sentence | |
return (summarySentence, average) | |
""" | |
def rankScenesBySummarySentence(averages, summarySentence):
    """Rank scenes by distance between each scene-average vector and the
    summary sentence's vector.

    averages -- list of per-scene average vectors
    summarySentence -- raw summary-sentence text
    Returns a list of (distance, scene_index) tuples, closest scene first.

    Fix: removed the per-iteration debug `print(i)` leftover; the spaCy
    parse of the summary sentence is done once, outside the loop.
    """
    summaryVector = nlp(summarySentence).vector
    return sorted((dist(average, summaryVector), i)
                  for i, average in enumerate(averages))
""" | |
def rankSceneSentencesBySummary(scenes, summary): | |
rank = [[nlp(sentence).similarity(nlp(summary)), sentence] for sentence in scenes] | |
rank.sort() | |
return rank | |
""" | |
""" | |
def alignSummarySentence(summarization, summarySentence): | |
alignment = [] | |
for i in range(len(summarization)): | |
#summarizationSentence = summarization[i][0] | |
average = summarization[i][1] | |
#if nlp(summarySentence).similarity(nlp(summarizationSentence)) > inclusion_threshold: | |
d = dist(average, nlp(summarySentence).vector) | |
alignment.append([d, i]) | |
#if d < dist_threshold: | |
#alignment.append(i) | |
alignment.sort() | |
return alignment | |
""" | |
def getWordsFromSentence(sentence):
    """Tokenize *sentence*, keeping only alphabetic, non-stopword tokens."""
    return [token for token in word_tokenize(sentence)
            if token not in stopWords and token.isalpha()]
def getSceneSentenceWords(scenes):
    """Split scenes into lines and tokenize each line; drop lines that
    yield no usable words.  Returns a list of word-lists."""
    tokenized = []
    for sceneText in scenes:
        for line in sceneText.split('\n'):
            tokens = getWordsFromSentence(line)
            if tokens:
                tokenized.append(tokens)
    return tokenized
def getAverageFromWords(sentences):
    """Sum the word vectors of every word, divided by the sentence count.

    sentences -- non-empty list of word-lists (as from getSceneSentenceWords).
    NOTE(review): the divisor is len(sentences), not the total word count,
    so this is the mean of per-sentence vector sums — confirm that is the
    intended quantity.
    """
    total = numpy.zeros(len(nlp(sentences[0][0]).vector))
    for wordList in sentences:
        for token in wordList:
            total += nlp(token).vector
    return total / len(sentences)
def rankWords(sentences):
    """Rank every word by distance to the average-of-words vector.

    sentences -- list of word-lists
    Returns (rank, average): rank is [distance, word] pairs sorted
    ascending; average comes from getAverageFromWords().
    """
    average = getAverageFromWords(sentences)
    rank = [[dist(average, nlp(token).vector), token]
            for wordList in sentences
            for token in wordList]
    rank.sort()
    return (rank, average)
def rankSummaryWords(average, summarySentence):
    """Rank the content words of *summarySentence* by distance to *average*.

    Returns [distance, word] pairs sorted ascending.
    """
    scored = [[dist(average, nlp(token).vector), token]
              for token in getWordsFromSentence(summarySentence)]
    scored.sort()
    return scored
def averageSceneSummaryPerSentence(sentences, summary):
    """Return the mean spaCy similarity between *summary* and each sentence.

    sentences -- non-empty list of sentence strings
    summary -- summary-sentence text

    Fix: the original re-parsed *summary* with nlp() on every iteration;
    the parse is loop-invariant and is now hoisted out of the loop.
    """
    summaryDoc = nlp(summary)
    sim = 0
    for sentence in sentences:
        sim += summaryDoc.similarity(nlp(sentence))
    return sim / len(sentences)
# ---- Script entry point ----
m = processAlignment.processAlignment()  # precomputed scene<->summary alignment map
print(m)
SP = ScriptProcessing.ScriptProcessing()
scenes = SP.extractScenes()  # one string per screenplay scene
summary = readSummary()  # non-blank lines of wikiplot.txt
#sentences = getSceneSentences([scenes[6]])
# Print, for every scene, its average per-sentence similarity to summary line 1.
for i in range(len(scenes)):
    sentences = getSceneSentences([scenes[i]])
    print(i, averageSceneSummaryPerSentence(sentences, summary[1]))
""" | |
#sentences = getSceneSentenceWords([scenes[6]]) | |
ss = getSceneSentences([scenes[6]]) | |
ss2 = getSceneSentences([scenes[12]]) | |
#(rank, average) = rankWords(sentences) | |
#SceneAverage = getAverageFromWords(sentences) | |
SceneAverageWSW = getAverage(ss) | |
ss2A = getAverage(ss2) | |
summaryAverage = getAverage(summary) | |
#summaryAverage = getAverageFromWords([getWordsFromSentence(summary[4])]) | |
summaryAverageWSW = getAverage([summary[4]]) | |
#print(dist(SceneAverage, summaryAverage)) | |
print(dist(SceneAverageWSW, summaryAverageWSW)) | |
print(dist(ss2A, SceneAverageWSW)) | |
print(dist(summaryAverage, SceneAverageWSW)) | |
#print("Scene Words Rank:") | |
#print(rank) | |
#print("Summary Words Rank:") | |
#print(rankSummaryWords(average, summary[1])) | |
#print(rank) | |
#average = getAverageFromWords(sentences) | |
#for summarySentenceNumber in m: | |
#sentences = [] | |
#for scene in scenes: | |
# sentences.append(getSceneSentences([scene])) | |
summarization = [] | |
i = 0 | |
""" | |
""" | |
averages = [] | |
for i in range(len(sentences)): | |
averages.append(getAverage(sentences[i])) | |
print("Averages Done!") | |
rank = rankScenesBySummarySentence(averages, summary[0]) | |
print(rank) | |
""" | |
""" | |
for scene in sentences: | |
(summarySentence, average) = summarizeScene(scene) | |
summarization.append([summarySentence, average]) | |
output.write('\n') | |
for i in range(len(summary)): | |
output.write(i,': ', alignSummarySentence(summarization, summary[i]), '\n\n') | |
""" | |
#sentences = getSceneSentences([scenes[i] for i in m[summarySentenceNumber]]) | |
#summarySentence = nlp(summary[summarySentenceNumber]) | |
#(rank, average) = rankSceneSentences(sentences) | |
#rankBySummary = rankSceneSentencesBySummary(sentences, summary[summarySentenceNumber]) | |
#print(rankBySummary) | |
#summarizationSentence = nlp(rank[0][1]) | |
#print(summarySentence) | |
#print(dist(summarySentence.vector, average)) | |
#print(summarizationSentence) | |
#print(dist(summarizationSentence.vector, average)) | |
#print(summarySentence.similarity(summarizationSentence)) | |
#print('\n\n') |