Permalink
Cannot retrieve contributors at this time
Name already in use
A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
AligNarr/ScriptScene.py
Go to fileThis commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
140 lines (121 sloc)
3.75 KB
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import xml.etree.ElementTree as ET
import spacy
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem.snowball import SnowballStemmer
from nltk import ngrams

# Shared NLP resources, initialized once at import time and used by
# the ScriptScene methods below.
# NOTE(review): the 'en' model shortcut was removed in spaCy 3; on modern
# spaCy this load needs 'en_core_web_sm' — confirm the pinned version.
# `nlp` is not referenced anywhere in this file's visible code.
nlp = spacy.load('en')
lemmatizer = WordNetLemmatizer()
stemmer = SnowballStemmer("english", ignore_stopwords=True)
stopWords = set(stopwords.words('english'))
# N-gram order for the multi-word grams built in ScriptScene.makeNgrams.
N = 2
class ScriptScene:
    """Feature container for one movie-script scene.

    Collects named entities (persons, organizations, locations), speakers,
    raw sentences, entity/speaker n-grams, and a bag of words, to support
    downstream scene alignment.
    """

    def __init__(self):
        self.sentences = []              # raw sentence strings, one per script line
        self.speakers = dict()           # speaker name (lowercased) -> occurrence count
        self.speakersNgrams = set()
        self.persons = dict()            # entity lemma -> occurrence count
        self.personsNgrams = set()
        self.organizations = dict()
        self.organizationsNgrams = set()
        self.locations = dict()
        self.locationsNgrams = set()
        self.bagOfWords = set()
        self.stemmed = False             # True -> getBagOfWords stems; False -> lemmatizes
        self.Ngrams = set()              # union of all entity/speaker n-grams

    def output(self):
        """Print the collected features (debugging aid)."""
        print(self.persons)
        print(self.organizations)
        print(self.locations)
        print(self.speakers)
        print(self.sentences)
        print('\n')
        #print(self.bagOfWords)
        print(self.Ngrams)
        print('\n')

    def _addOne(self, lemma, dictionary):
        """Increment the count for `lemma`, inserting it with count 1 if absent."""
        dictionary[lemma] = dictionary.get(lemma, 0) + 1

    def _removeOne(self, lemma, dictionary):
        """Decrement the count for `lemma`; drop the entry at zero; no-op if absent."""
        if lemma not in dictionary:
            return
        if dictionary[lemma] == 1:
            del dictionary[lemma]
        else:
            dictionary[lemma] -= 1

    def makeNgrams(self, dictionary):
        """Build n-grams over the entity strings (the dict's keys).

        Unigrams are taken after dropping stopwords, while the N-grams
        (N == 2, the module constant) keep the full token sequence.
        NOTE(review): that asymmetry is preserved from the original code —
        presumably intentional, confirm.
        """
        grams = set()
        for entity in dictionary:
            tokens = entity.split()
            content_tokens = [tok for tok in tokens if tok not in stopWords]
            grams.update(ngrams(content_tokens, 1))
            grams.update(ngrams(tokens, N))  # was a hard-coded 2; N == 2 at module level
        return grams

    def getEntities(self, scene):
        """Collect PERSON/ORGANIZATION/LOCATION entities from `<word>` elements.

        Consecutive words carrying the same NE label are merged into one
        multi-word entity: the partial entity counted so far is retracted
        and the extended lemma is counted in its place. Finally the
        per-category n-gram sets are rebuilt and folded into self.Ngrams.
        """
        old_ne = ''
        lemma = ""
        # label -> count dictionary; replaces the duplicated if/elif chains
        targets = {
            'ORGANIZATION': self.organizations,
            'PERSON': self.persons,
            'LOCATION': self.locations,
        }
        for word in scene.iter('word'):
            ne = word.get('ne')
            if ne not in targets:
                old_ne = ne
                continue
            counts = targets[ne]
            if ne == old_ne:
                # Same label as the previous word: grow the current entity.
                self._removeOne(lemma, counts)
                lemma += ' ' + word.get('lemma').lower()
            else:
                lemma = word.get('lemma').lower()
            self._addOne(lemma, counts)
            old_ne = ne
        self.personsNgrams = self.makeNgrams(self.persons)
        self.locationsNgrams = self.makeNgrams(self.locations)
        self.organizationsNgrams = self.makeNgrams(self.organizations)
        self.Ngrams |= self.personsNgrams
        self.Ngrams |= self.locationsNgrams
        self.Ngrams |= self.organizationsNgrams

    def getspeakers(self, scene):
        """Count `<speech>` speaker attributes (lowercased) and fold their n-grams in."""
        for speech in scene.iter('speech'):
            self._addOne(speech.get('speaker').lower(), self.speakers)
        self.speakersNgrams = self.makeNgrams(self.speakers)
        self.Ngrams |= self.speakersNgrams

    def getsentences(self, text):
        """Split the raw scene text into sentences, one per line."""
        self.sentences = text.split("\n")

    def getBagOfWords(self, model):
        """Fill self.bagOfWords with normalized, non-stopword tokens known to `model`.

        Tokens are stemmed when self.stemmed is True, lemmatized otherwise,
        and kept only if present in the embedding model's vocabulary.

        Bug fix: sentences are joined with a space — the previous ""-join
        glued the last word of one sentence onto the first word of the
        next, producing bogus tokens.
        NOTE(review): `model.wv.vocab` is the gensim <4.0 API
        (gensim 4 uses `key_to_index`) — confirm the pinned version.
        """
        text = " ".join(self.sentences)
        words = word_tokenize(text)
        normalize = stemmer.stem if self.stemmed else lemmatizer.lemmatize
        for token in map(normalize, words):
            if token not in stopWords and token in model.wv.vocab:
                self.bagOfWords.add(token)

    def processScene(self, scene, sentences, model):
        """Run the full per-scene extraction pipeline."""
        self.getEntities(scene)
        self.getspeakers(scene)
        self.getsentences(sentences)
        self.getBagOfWords(model)