# NOTE: GitHub web-page navigation text that preceded the source here was scrape residue, not part of this module.
import xml.etree.ElementTree as ET
import spacy
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem.snowball import SnowballStemmer
from nltk import ngrams
# Shared NLP resources, created once when the module is imported.

# spaCy English pipeline.
# NOTE(review): `nlp` is not referenced in the visible part of this file -
# confirm it is used elsewhere before keeping this (potentially slow) load.
nlp = spacy.load('en')
# WordNet lemmatizer; ScriptScene.getBagOfWords uses it when stemming is off.
lemmatizer = WordNetLemmatizer()
# English Snowball stemmer; ScriptScene.getBagOfWords uses it when stemming is
# on. Per the NLTK documentation, ignore_stopwords=True leaves stop words
# unstemmed.
stemmer = SnowballStemmer("english", ignore_stopwords=True)
# English stop words as a set, used for membership tests in ScriptScene.
stopWords = set(stopwords.words('english'))
# NOTE(review): N is not referenced in the visible code; presumably the
# intended n-gram order (makeNgrams hard-codes 2) - confirm.
N = 2
class ScriptScene:
    """Entity, speaker and vocabulary statistics gathered for one scene.

    An instance starts empty and is filled by processScene(), which reads
    named entities and speakers from an XML scene element and builds a bag
    of words from the scene's raw text.

    The ``persons``, ``organizations``, ``locations`` and ``speakers``
    dictionaries map a lower-cased name to its occurrence count.  Each has a
    matching ``*Ngrams`` set produced by makeNgrams(); ``Ngrams`` is the
    union of all of them.
    """

    def __init__(self):
        # Raw text lines of the scene (filled by getsentences).
        self.sentences = []
        # Occurrence counts and derived n-gram sets, one pair per category.
        self.speakers = dict()
        self.speakersNgrams = set()
        self.persons = dict()
        self.personsNgrams = set()
        self.organizations = dict()
        self.organizationsNgrams = set()
        self.locations = dict()
        self.locationsNgrams = set()
        # Normalised, non-stop-word tokens that are known to the model.
        self.bagOfWords = set()
        # When True, getBagOfWords stems tokens instead of lemmatizing them.
        self.stemmed = False
        # Union of every category's n-gram set.
        self.Ngrams = set()

    def output(self):
        """Print the collected entities, speakers, sentences and n-grams."""
        print(self.persons)
        print(self.organizations)
        print(self.locations)
        print(self.speakers)
        print(self.sentences)
        print('\n')
        print(self.Ngrams)
        print('\n')

    def _addOne(self, lemma, dictionary):
        """Increment the count stored for ``lemma`` in ``dictionary``.

        A missing key is created with count 1.  An existing count that is
        not positive is left untouched.
        """
        count = dictionary.get(lemma)
        if count is None:
            dictionary[lemma] = 1
        elif count > 0:
            dictionary[lemma] = count + 1

    def _removeOne(self, lemma, dictionary):
        """Decrement the count for ``lemma``, deleting the key at count 1.

        Does nothing when ``lemma`` is not in ``dictionary``.
        """
        if lemma not in dictionary:
            return
        if dictionary[lemma] == 1:
            del dictionary[lemma]
        else:
            dictionary[lemma] -= 1

    def makeNgrams(self, dictionary):
        """Return the unigrams and bigrams of every key in ``dictionary``.

        Each key is split on whitespace.  Unigrams are taken from the words
        that are not stop words; bigrams are taken from the full word list,
        stop words included.  All n-grams are tuples.
        """
        grams = set()
        for entity in dictionary:
            tokens = entity.split()
            contentTokens = [token for token in tokens
                             if token not in stopWords]
            grams.update(ngrams(contentTokens, 1))
            grams.update(ngrams(tokens, 2))
        return grams

    def _collectEntities(self, scene):
        """Count PERSON / ORGANIZATION / LOCATION mentions found in ``scene``.

        Iterates over the ``word`` elements of ``scene`` and reads their
        ``ne`` and ``lemma`` attributes.  Consecutive words carrying the same
        entity tag are merged into one space-separated, lower-cased name.

        Raises:
            AttributeError: if a word with one of the three entity tags has
                no ``lemma`` attribute.
        """
        counters = {
            'ORGANIZATION': self.organizations,
            'PERSON': self.persons,
            'LOCATION': self.locations,
        }
        old_ne = ''
        lemma = ''
        for word in scene.iter('word'):
            ne = word.get('ne')
            counter = counters.get(ne)
            if counter is None:
                # Not an entity of interest; it still ends the current run.
                old_ne = ne
                continue
            if ne == old_ne:
                # Same tag as the previous word: the name continues, so the
                # shorter name counted one step earlier is taken back before
                # the extended name is counted.
                self._removeOne(lemma, counter)
                lemma += ' ' + word.get('lemma').lower()
            else:
                lemma = word.get('lemma').lower()
            self._addOne(lemma, counter)
            old_ne = ne

    def getEntities(self, scene):
        """Collect named entities from ``scene`` and refresh their n-grams.

        Updates ``persons``, ``organizations`` and ``locations``, rebuilds
        the three matching ``*Ngrams`` sets and adds them to ``Ngrams``.
        """
        self._collectEntities(scene)
        self.personsNgrams = self.makeNgrams(self.persons)
        self.locationsNgrams = self.makeNgrams(self.locations)
        self.organizationsNgrams = self.makeNgrams(self.organizations)
        self.Ngrams |= self.personsNgrams
        self.Ngrams |= self.locationsNgrams
        self.Ngrams |= self.organizationsNgrams

    def getspeakers(self, scene):
        """Count the speakers of every ``speech`` element in ``scene``.

        Speaker names are lower-cased.  A ``speech`` element without a
        ``speaker`` attribute is skipped rather than raising.  Afterwards
        ``speakersNgrams`` is rebuilt and added to ``Ngrams``.
        """
        for speech in scene.iter('speech'):
            speaker = speech.get('speaker')
            if speaker is None:
                continue
            self._addOne(speaker.lower(), self.speakers)
        self.speakersNgrams = self.makeNgrams(self.speakers)
        self.Ngrams |= self.speakersNgrams

    def getsentences(self, text):
        """Store ``text`` split on newlines in ``sentences``."""
        self.sentences = text.split("\n")

    def _sceneText(self):
        """Return the stored lines joined back into one string.

        The lines are joined with a newline: joining them with nothing would
        glue the last word of a line to the first word of the next one.
        """
        return "\n".join(self.sentences)

    def getBagOfWords(self, model):
        """Add the scene's known, non-stop-word tokens to ``bagOfWords``.

        The scene text is tokenized, every token is stemmed (when
        ``self.stemmed`` is true) or lemmatized, and the result is kept if it
        is not a stop word and is part of the vocabulary of ``model.wv``.

        Args:
            model: object whose ``wv`` attribute exposes the vocabulary as
                ``key_to_index`` or, failing that, ``vocab``.
                NOTE(review): presumably a gensim word-vector model, where
                ``key_to_index`` is the gensim >= 4 name - confirm.
        """
        normalize = stemmer.stem if self.stemmed else lemmatizer.lemmatize
        wordVectors = model.wv
        vocabulary = getattr(wordVectors, 'key_to_index', None)
        if vocabulary is None:
            vocabulary = wordVectors.vocab
        for word in word_tokenize(self._sceneText()):
            token = normalize(word)
            if token not in stopWords and token in vocabulary:
                self.bagOfWords.add(token)

    def processScene(self, scene, sentences, model):
        """Run every extraction step for one scene.

        Args:
            scene: XML element containing the scene's ``word`` and ``speech``
                elements.
            sentences: raw scene text, one sentence per line.
            model: word-vector model passed on to getBagOfWords.
        """
        self.getEntities(scene)
        self.getspeakers(scene)
        self.getsentences(sentences)
        self.getBagOfWords(model)