# Source: file view of the repository's "master" branch.
"""
@Author: Arunav Mishra, Supratim Das
"""
from gensim import *
import pickle
import nltk.data
from nltk.tokenize import *
from nltk.corpus import stopwords
from nltk.stem.porter import *
import StaticFunctions as sf
from collections import defaultdict
import sys
from scipy import spatial
from collections import OrderedDict
from operator import itemgetter
class Similarity(object):
    """Rank candidate years by LDA topic similarity to an event query.

    The constructor loads a pre-trained gensim LDA model together with its
    dictionary, corpus and similarity index, plus the event queries and the
    list of candidate years.  ``main`` projects one query into topic space,
    compares it with every year by cosine similarity and writes the ranking
    to a tab-separated file.
    """

    # Defaults reproduce the locations and query used by the original script.
    INPUT_DIR = '/Users/Supra/PycharmProjects/LDA/Input'
    DEFAULT_QUERY_ID = 'Q53'

    # Initialize
    def __init__(self, input_dir=INPUT_DIR):
        """Load the NLP helpers, the LDA artefacts and the task inputs.

        :param input_dir: directory holding ``EventQrel.txt`` and
            ``Unique_Years.txt``; the LDA artefacts are read from its ``OP``
            sub-directory.
        """
        self.input_dir = input_dir
        model_dir = input_dir + '/OP'

        # NLP inputs
        # NOTE: sent_detector is loaded but not used by any method of this class.
        self.sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
        self.stemmer = PorterStemmer()
        self.tokenizer = RegexpTokenizer(r'\w+').tokenize

        # LDA Specific Inputs
        # NOTE: corpus, index and obj are loaded but not used by any method of
        # this class; they are kept so existing attribute users keep working.
        self.corpus = corpora.MmCorpus(model_dir + '/corpus.mm')
        self.dictionary = corpora.Dictionary.load(model_dir + '/words.dict')
        self.lda = models.ldamodel.LdaModel.load(model_dir + '/lda')
        self.index = similarities.MatrixSimilarity.load(model_dir + '/lda.index')
        # SECURITY: unpickling can execute arbitrary code -- only point this
        # at a mapping file you produced yourself.
        with open(model_dir + '/docIDMapping.txt', 'rb') as mapping_file:
            self.obj = pickle.load(mapping_file)

        # Objective Specific Inputs
        self.stopset = set(stopwords.words('english'))
        self.docs = sf.StaticFunctions.load_queries(input_dir + "/EventQrel.txt")
        self.years = sf.StaticFunctions.load_years(input_dir + "/Unique_Years.txt")

    def _to_lda_vector(self, text):
        """Tokenise *text*, drop stop words, stem, and project it through the LDA model."""
        # Stop words are filtered on the raw token, before stemming.
        stems = [self.stemmer.stem(word)
                 for word in self.tokenizer(text)
                 if word not in self.stopset]
        bow = self.dictionary.doc2bow(stems, allow_update=False, return_missing=False)
        return self.lda[bow]

    # Computes the event vector
    def create_event_vector(self, query_id=DEFAULT_QUERY_ID):
        """Return the LDA representation of one event query.

        :param query_id: key into ``self.docs``; defaults to ``'Q53'``, the
            query the original script was hard-wired to.
        :return: whatever ``self.lda[bow]`` yields for the query text.
        :raises KeyError: if *query_id* is not present in ``self.docs``.
        """
        # The query text is lower-cased before tokenising.
        return self._to_lda_vector(self.docs[query_id].lower())

    # Computes similarity between event vector and year
    def compute_similarity(self, dense1):
        """Score every entry of ``self.years`` against a dense event vector.

        :param dense1: dense topic vector of the event query (``main`` builds
            it with ``matutils.sparse2full``).
        :return: dict mapping each year to ``1 - cosine distance`` between
            *dense1* and the year's dense topic vector.
        """
        num_topics = self.lda.num_topics
        similarity = {}
        for year in self.years:
            dense2 = matutils.sparse2full(self._to_lda_vector(year), num_topics)
            similarity[year] = 1 - spatial.distance.cosine(dense1, dense2)
        return similarity

    # Main--------------------------------------------------------------------------------------------------------------
    def main(self, query_id=DEFAULT_QUERY_ID, output_path=None):
        """Rank all years for one query and write the ranking to disk.

        :param query_id: query to rank; defaults to ``'Q53'``.
        :param output_path: destination file; defaults to ``OP/Q1.txt`` under
            the input directory given to the constructor.

        Each output line is ``1<TAB>Q0<TAB>year<TAB>rank<TAB>score<TAB>LDA``
        (this appears to follow the TREC run-file layout).
        """
        if output_path is None:
            output_path = self.input_dir + '/OP/Q1.txt'

        # Create event vector
        lda_vec_query = self.create_event_vector(query_id)
        print(lda_vec_query)
        dense1 = matutils.sparse2full(lda_vec_query, self.lda.num_topics)

        # Compute similarity with years
        scores = self.compute_similarity(dense1)

        # Sort the result with respect to magnitude.
        # NOTE(review): the order is ascending, so rank 1 is the LEAST similar
        # year.  Kept as in the original; confirm whether descending was meant.
        ranked = sorted(scores.items(), key=itemgetter(1))

        # Write the output to destination; the context manager closes the file
        # even if a write fails part-way through.
        with open(output_path, 'w') as target:
            for rank, (year, score) in enumerate(ranked, 1):
                # NOTE(review): the first column is the constant "1" regardless
                # of query_id, as in the original -- confirm this is intended.
                fields = [str(1), "Q0", str(year), str(rank), str(score), "LDA"]
                target.write("\t".join(fields) + "\n")
if __name__ == '__main__':
    # Script entry point: build the ranker (this loads every model and input
    # file) and run the ranking with its default settings.
    ranker = Similarity()
    ranker.main()