Skip to content
Permalink
master
Switch branches/tags

Name already in use

A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
Go to file
 
 
Cannot retrieve contributors at this time
import spacy
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem.snowball import SnowballStemmer
from nltk import ngrams
# Module-level NLP resources shared by every SummarySentence instance.
# They are created once at import time.

# spaCy pipeline used for named-entity recognition in extractEntities().
# NOTE(review): the 'en' shortcut name is only accepted by older spaCy
# releases; newer ones expect a full package name such as
# 'en_core_web_sm' -- confirm against the installed spaCy version.
nlp = spacy.load('en')
# WordNet lemmatizer applied to tokens when SummarySentence.stemmed is False.
lemmatizer = WordNetLemmatizer()
# Snowball stemmer applied to tokens when SummarySentence.stemmed is True.
stemmer = SnowballStemmer("english", ignore_stopwords=True)
# English stop words as a set, used for membership tests.
stopWords = set(stopwords.words('english'))
# Not referenced anywhere in the visible code; presumably the intended
# n-gram size -- TODO confirm before relying on it.
N = 2
class SummarySentence:
    """Hold one sentence together with the entities and words extracted from it.

    Attributes:
        sentence: The raw sentence text (set by ``extractEntities``).
        persons / organizations / locations: Maps of lower-cased entity
            text to the number of times it was seen.
        personsNgrams / organizationsNgrams / locationsNgrams: Sets of
            unigram and bigram tuples built from the matching entity map.
        Ngrams: Union of the three n-gram sets above.
        bagOfWords: Normalized tokens of the sentence that are not stop
            words and are present in the model vocabulary.
        stemmed: When True tokens are stemmed, otherwise lemmatized.
    """

    def __init__(self):
        self.sentence = ""
        self.persons = dict()
        self.personsNgrams = set()
        self.organizations = dict()
        self.organizationsNgrams = set()
        self.locations = dict()
        self.locationsNgrams = set()
        self.bagOfWords = set()
        self.stemmed = False
        self.Ngrams = set()

    def _addOne(self, lemma, dictionary):
        """Count one more occurrence of ``lemma`` in ``dictionary``.

        A missing key is created with a count of 1. An existing key is
        incremented only while its count is positive; non-positive counts
        are left untouched.
        """
        if lemma not in dictionary:
            dictionary[lemma] = 1
        elif dictionary[lemma] > 0:
            dictionary[lemma] += 1

    def _dictionaryForLabel(self, label):
        """Return the count dictionary for an entity label string.

        ``'PERSON'`` maps to ``persons``, ``'ORG'`` to ``organizations`` and
        both ``'GPE'`` and ``'LOC'`` to ``locations``. Any other label
        returns ``None``.
        """
        if label == 'PERSON':
            return self.persons
        if label == 'ORG':
            return self.organizations
        if label in ('GPE', 'LOC'):
            return self.locations
        return None

    def getBagOfWords(self, model):
        """Add the sentence's in-vocabulary, non-stop-word tokens to ``bagOfWords``.

        Tokens are stemmed when ``self.stemmed`` is True and lemmatized
        otherwise. The stop-word and vocabulary checks are applied to the
        normalized token.

        Args:
            model: An object exposing ``model.wv`` with either a
                ``key_to_index`` mapping (gensim >= 4) or a ``vocab``
                mapping (older gensim).
        """
        wordVectors = model.wv
        # Prefer the gensim 4 attribute; fall back to the pre-4 name so
        # models loaded with either generation of the library keep working.
        vocabulary = getattr(wordVectors, 'key_to_index', None)
        if vocabulary is None:
            vocabulary = wordVectors.vocab

        normalize = stemmer.stem if self.stemmed else lemmatizer.lemmatize
        for word in word_tokenize(self.sentence):
            token = normalize(word)
            if token not in stopWords and token in vocabulary:
                self.bagOfWords.add(token)

    def makeNgrams(self, dictionary):
        """Build the set of unigrams and bigrams for the entity keys.

        Unigrams are taken from each entity with stop words removed;
        bigrams are taken from the entity's full whitespace-split tokens
        (stop words included).

        Args:
            dictionary: Mapping whose keys are entity strings.

        Returns:
            A set of tuples as produced by ``nltk.ngrams``.
        """
        grams = set()
        for entity in dictionary:
            tokens = entity.split()
            contentTokens = [tok for tok in tokens if tok not in stopWords]
            grams.update(ngrams(contentTokens, 1))
            grams.update(ngrams(tokens, 2))
        return grams

    def extractEntities(self, text, model):
        """Populate this object from ``text``.

        Stores the sentence, fills ``bagOfWords`` via ``getBagOfWords``,
        counts PERSON / ORG / GPE / LOC entities found by the spaCy
        pipeline, and rebuilds the per-category and combined n-gram sets.

        Args:
            text: The sentence to analyse.
            model: Word-vector model forwarded to ``getBagOfWords``.
        """
        self.sentence = text
        self.getBagOfWords(model)

        for ent in nlp(text).ents:
            if ent.text == "\n":
                continue
            # Use the string label ``label_``; ``label`` is spaCy's integer
            # ID and never equals a string such as 'LOC'.
            counts = self._dictionaryForLabel(ent.label_)
            if counts is not None:
                self._addOne(ent.text.lower(), counts)

        self.personsNgrams = self.makeNgrams(self.persons)
        self.locationsNgrams = self.makeNgrams(self.locations)
        self.organizationsNgrams = self.makeNgrams(self.organizations)
        self.Ngrams |= self.personsNgrams
        self.Ngrams |= self.locationsNgrams
        self.Ngrams |= self.organizationsNgrams

    def output(self):
        """Print the sentence, each entity map with its n-grams, and the bag of words."""
        print(self.sentence)
        print(self.persons)
        print(self.personsNgrams)
        print(self.organizations)
        print(self.organizationsNgrams)
        print(self.locations)
        print(self.locationsNgrams)
        print('\n')
        print(self.bagOfWords)