Permalink
Cannot retrieve contributors at this time
Name already in use
A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
AligNarr/SummarySentence.py
Go to fileThis commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
94 lines (81 sloc)
2.64 KB
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# NLP toolchain shared by SummarySentence: spaCy for NER, NLTK for
# tokenization, stemming/lemmatization, stopwords, and n-grams.
import spacy
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem.snowball import SnowballStemmer
from nltk import ngrams

# NOTE(review): the 'en' shortcut model name was deprecated and then removed
# in spaCy 3.x; newer installs need spacy.load('en_core_web_sm') — confirm
# which spaCy version this project pins.
nlp = spacy.load('en')
lemmatizer = WordNetLemmatizer()
# ignore_stopwords=True: the stemmer leaves English stopwords unstemmed.
stemmer = SnowballStemmer("english", ignore_stopwords=True)
stopWords = set(stopwords.words('english'))
# NOTE(review): N is never read below — SummarySentence.makeNgrams hard-codes
# n-gram sizes 1 and 2. Presumably meant to parameterize them; verify intent.
N = 2
class SummarySentence:
    """One summary sentence plus the features extracted from it: named-entity
    counts (persons / organizations / locations), entity n-gram sets, and a
    bag of content words restricted to an embedding model's vocabulary.

    Relies on module-level globals: ``nlp`` (spaCy pipeline), ``stemmer``,
    ``lemmatizer``, ``stopWords``, ``word_tokenize`` and ``ngrams``.
    """

    def __init__(self):
        self.sentence = ""  # raw sentence text, set by extractEntities()
        self.persons = dict()  # lowercased PERSON entity text -> count
        self.personsNgrams = set()
        self.organizations = dict()  # lowercased ORG entity text -> count
        self.organizationsNgrams = set()
        self.locations = dict()  # lowercased GPE/LOC entity text -> count
        self.locationsNgrams = set()
        self.bagOfWords = set()  # non-stopword tokens found in the model vocab
        self.stemmed = False  # True -> stem tokens; False -> lemmatize them
        self.Ngrams = set()  # union of all three entity n-gram sets

    def _addOne(self, lemma, dictionary):
        """Increment ``dictionary[lemma]``, inserting it at 1 if absent.

        (Original code guarded the increment with ``> 0``, but counts start
        at 1 and only grow, so the guard was always true.)
        """
        dictionary[lemma] = dictionary.get(lemma, 0) + 1

    def getBagOfWords(self, model):
        """Populate ``self.bagOfWords`` from ``self.sentence``.

        Tokens are stemmed when ``self.stemmed`` is set, otherwise
        lemmatized; only non-stopword tokens present in the embedding
        model's vocabulary are kept.

        NOTE(review): ``model.wv.vocab`` is the gensim < 4.0 API; gensim 4
        replaced it with ``key_to_index`` — confirm the pinned version.
        """
        words = word_tokenize(self.sentence)
        # Both branches of the original did the same filtering; only the
        # normalizer differs, so pick it once.
        normalize = stemmer.stem if self.stemmed else lemmatizer.lemmatize
        for token in (normalize(word) for word in words):
            if token not in stopWords and token in model.wv.vocab:
                self.bagOfWords.add(token)

    def makeNgrams(self, dictionary):
        """Return unigrams (stopwords removed) and bigrams (stopwords kept)
        built from the entity strings that key ``dictionary``."""
        grams = set()
        for entity in dictionary:
            splitEntity = entity.split()
            withoutStopWords = [w for w in splitEntity if w not in stopWords]
            grams |= set(ngrams(withoutStopWords, 1))
            # Bigrams keep stopwords so multi-word names stay contiguous.
            grams |= set(ngrams(splitEntity, 2))
        return grams

    def extractEntities(self, text, model):
        """Set the sentence to ``text``, build its bag of words, tally its
        PERSON/ORG/GPE/LOC entities, and derive the entity n-gram sets."""
        self.sentence = text
        self.getBagOfWords(model)
        doc = nlp(text)
        for ent in doc.ents:
            if ent.text != "\n":
                if ent.label_ == 'PERSON':
                    self._addOne(ent.text.lower(), self.persons)
                elif ent.label_ == 'ORG':
                    self._addOne(ent.text.lower(), self.organizations)
                # BUG FIX: original compared ``ent.label`` (spaCy's integer
                # label id) against the string 'LOC', which is never equal,
                # so LOC entities were silently dropped. ``ent.label_`` is
                # the string form.
                elif ent.label_ == 'GPE' or ent.label_ == 'LOC':
                    self._addOne(ent.text.lower(), self.locations)
        self.personsNgrams = self.makeNgrams(self.persons)
        self.locationsNgrams = self.makeNgrams(self.locations)
        self.organizationsNgrams = self.makeNgrams(self.organizations)
        self.Ngrams |= self.personsNgrams
        self.Ngrams |= self.locationsNgrams
        self.Ngrams |= self.organizationsNgrams

    def output(self):
        """Print the sentence, each entity dict and n-gram set, and the bag
        of words (debugging aid)."""
        print(self.sentence)
        print(self.persons)
        print(self.personsNgrams)
        print(self.organizations)
        print(self.organizationsNgrams)
        print(self.locations)
        print(self.locationsNgrams)
        print('\n')
        print(self.bagOfWords)