ENTYFI/attentionNER/top_class_predict.py
'''
Created on Jan 14, 2019
@author: cxchu
'''
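# Predicts top-level entity types for mentions using a pre-trained model
# (TensorFlow 1.x checkpoint) and filters out overly popular classes.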
import codecs
import sys, os
import optparse

try:
    import joblib
except ImportError:
    # older scikit-learn versions shipped a bundled copy of joblib
    from sklearn.externals import joblib

from create_dataset import create_raw_dataset
from src.batcher import Batcher
from src.hook import acc_hook, save_predictions, evaluate_perclass
from src.model.nn_model import Model
import tensorflow as tf
import numpy as np
import pandas as pd
optparser = optparse.OptionParser()
optparser.add_option(
    "-b", "--basedir", default="/var/tmp/wikia/entity-typing/deep-learning/",
    help="directory of the top-class prediction model"
)
optparser.add_option(
    "-u", "--universe", default="aoc",
    help="reference universe"
)
opts = optparser.parse_args()[0]
refuniverse = opts.universe
basedir = opts.basedir
dict_file = basedir + refuniverse + "/data/dicts_gillick.pkl"  # renamed from `dict` to avoid shadowing the built-in
# inuniverse = "onion"
# raw_data = "/var/tmp/wikia/entity-typing/input-data/" + inuniverse + "/" + inuniverse + "-3-supervised"
# save_data = "/var/tmp/wikia/entity-typing/deep-learning/got/got_test.pkl"
dicts = joblib.load(dict_file)
label2id = dicts["label2id"]
id2label = dicts["id2label"]
word2id = dicts["word2id"]
feature2id = dicts["feature2id"]
if "unknown" not in word2id:
    # fall back to an arbitrary in-vocabulary word ID when no explicit "unknown" token exists
    word2id["unknown"] = list(word2id.values())[0]
storage, data, sentences, mentions = create_raw_dataset(label2id, word2id, feature2id)
test_dataset = {"storage": storage, "data": data}
# joblib.dump(dataset,save_data)
print("Loading the dataset")
# test_dataset = joblib.load(save_data)
print("test_size: ", test_dataset["data"].shape[0])

print("Creating batchers")
# batch_size : 1000, context_length : 10
test_batcher = Batcher(test_dataset["storage"], test_dataset["data"], test_dataset["data"].shape[0], 10, dicts["id2vec"])
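# Restore the trained TensorFlow 1.x graph from its checkpoint and look up the
# input placeholders and the output tensor by name.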
print('Loading the model..............')
save_dir = basedir + refuniverse
model_name = 'model'
checkpoint_file = os.path.join(save_dir, model_name)

graph = tf.Graph()
with graph.as_default():
    sess = tf.Session()
    saver = tf.train.import_meta_graph("{}.meta".format(checkpoint_file))
    saver.restore(sess, checkpoint_file)

    keep_prob = graph.get_operation_by_name("keep_prob").outputs[0]
    mention_representation = graph.get_operation_by_name("mention_representation").outputs[0]
    # the restored graph expects 2*8+1 = 17 context placeholders
    context_length = 8
    context = [graph.get_operation_by_name("context" + str(i)).outputs[0]
               for i in range(context_length * 2 + 1)]
    distribution = graph.get_operation_by_name("distribution").outputs[0]

    # one batch covering the whole test set (batch size == number of mentions)
    context_data, mention_representation_data, target_data, feature_data = test_batcher.next()
    feed = {mention_representation: mention_representation_data,
            keep_prob: [1.0]}
    # if self.feature == True and feature_data is not None:
    #     feed[self.features] = feature_data
    for i in range(context_length * 2 + 1):
        feed[context[i]] = context_data[:, i, :]
    scores = sess.run(distribution, feed_dict=feed)
#################
# drop predicted classes whose corpus frequency exceeds a popularity threshold =====
populardist = []
type2freq = {}
classFile = basedir + refuniverse + '/resource/label2id_gillick.txt'
# tab-separated file with the type label in column 2 and its frequency in column 3
for line in codecs.open(classFile, "r", 'utf8'):
    tmp = line.split('\t')
    populardist.append(float(tmp[2]))
    type2freq[tmp[1]] = float(tmp[2])
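# Note: quantile(1.0) is simply the maximum frequency, so the popularity filter
# below keeps every type by default; lowering the quantile (e.g. 0.9) would
# actually prune the most frequent classes.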
threshold = float(pd.Series(populardist).quantile(1.0))
print(threshold)
# write results (one mention per line) to stdout
print('results')
sys.stdout.flush()
for sent, score in zip(mentions, scores):
    res = []
    for label_id, s in enumerate(list(score)):
        # keep types scoring >= 0.5 that do not exceed the popularity threshold
        if s >= 0.5 and type2freq[id2label[label_id]] <= threshold:
            res.append(id2label[label_id] + "\t" + str(s))
    if len(res) > 0:
        print(sent + "=====[" + ", ".join(res) + "]")
print('end')
sys.stdout.flush()
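# Example invocation (paths are illustrative; defaults are shown in the options above):
#   python top_class_predict.py -b /var/tmp/wikia/entity-typing/deep-learning/ -u got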