# TypeConsolidation.py

'''
Created on Jul 19, 2019

@author: cxchu
'''

'''
ILP for each mention
- mention: string
- candidateTypes: set
- type2score, type2id, id2type: dictionary - like map
- disjointContr: list of pairs
- hierarchicalContr: list of pairs, each item is parent - child
- isLimit: boolean
- numLimit: integer
'''

import codecs
import optparse
import os
import sys
import time
import warnings
from ast import literal_eval

from pulp import *
from scipy.stats import pearsonr as p

def optimization(mention, candidateTypes, type2score, disjointContr, hierarchicalContr, pearsonValues, isLimit, numLimit, alpha):
    '''
    Build and solve the type-consolidation problem for a single mention.

    One variable T[t] in [0, 1] (continuous) is created per key of type2score
    and one binary variable Y[key] per entry of pearsonValues whose two types
    are both candidates. The maximised objective is
        alpha * sum(T[t] * score) + (1 - alpha) * sum(Y[key] * value)

    - mention: string, appended to the problem name
    - candidateTypes: set of type names considered for this mention; every
      member referenced by a constraint must also be a key of type2score
    - type2score: dictionary, type name -> numeric score
    - disjointContr: pairs (t1, t2), each giving T[t1] + T[t2] <= 1
    - hierarchicalContr: pairs (parent, child), each giving T[child] <= T[parent]
    - pearsonValues: dictionary, "t1 - t2" -> numeric value; every active pair
      gives T[t1] + T[t2] <= 1 + Y[key]
    - isLimit: when true, the sum of all T variables is bounded by numLimit
    - numLimit: integer bound used when isLimit is true
    - alpha: weight of the type scores; pair values are weighted by 1 - alpha

    Returns (prob, numCont): the solved LpProblem and the number of
    constraints that were added to it.
    '''
    prob = LpProblem("consolidation" + mention, LpMaximize)

    # One relaxed indicator per scored type.
    T = {typeName: LpVariable(typeName, 0, 1, LpContinuous) for typeName in type2score}

    # Parse the "t1 - t2" keys once and keep only the pairs whose two types
    # are both candidates; variables, objective and constraints all reuse it.
    activePairs = []
    for key, value in pearsonValues.items():
        types = key.split(" - ")
        t1, t2 = types[0], types[1]
        if t1 in candidateTypes and t2 in candidateTypes:
            activePairs.append((key, t1, t2, value))

    Y = {key: LpVariable(key, 0, 1, LpBinary) for key, _, _, _ in activePairs}

    # Objective. lpSum gives an empty expression for empty input, so neither
    # term is ever None (the previous None accumulators broke on "*= alpha"
    # when there were no scores).
    scoreTerm = lpSum([T[typeName] * score for typeName, score in type2score.items()]) * alpha
    pairTerm = lpSum([(1 - alpha) * Y[key] * value for key, _, _, value in activePairs])
    prob += scoreTerm + pairTerm, "obj"

    # total number of constraints
    numCont = 0

    # Upper bound on the (relaxed) number of selected types.
    if isLimit:
        prob += lpSum(list(T.values())) <= numLimit, "c1"
        numCont += 1

    # Disjointness: two disjoint types cannot both be selected.
    # A repeated pair would reuse the constraint name, which PuLP rejects,
    # so each pair is added only once.
    seenDisjoint = set()
    for t1, t2 in disjointContr:
        if (t1, t2) in seenDisjoint:
            continue
        if t1 in candidateTypes and t2 in candidateTypes:
            seenDisjoint.add((t1, t2))
            prob += T[t1] + T[t2] <= 1, "c" + t1 + "-" + t2
            numCont += 1

    # Hierarchy: a child type can only be selected as far as its parent is.
    seenHierarchy = set()
    for parent, child in hierarchicalContr:
        if (parent, child) in seenHierarchy:
            continue
        if parent in candidateTypes and child in candidateTypes:
            seenHierarchy.add((parent, child))
            prob += T[child] - T[parent] <= 0, "cp" + parent + "-c" + child
            numCont += 1

    # Pair constraints: Y[key] has to be 1 once both of its types are selected.
    for key, t1, t2, _ in activePairs:
        prob += T[t1] + T[t2] <= 1 + Y[key], "pearson" + t1 + "--" + t2
        numCont += 1

    # Solve with PuLP's default solver. Another solver object can be passed
    # instead, e.g. prob.solve(GLPK(msg=0)).
    prob.solve()

    # Debugging aids:
    #   print("Status:", LpStatus[prob.status])
    #   for v in prob.variables(): print(v.name, "=", v.varValue)
    #   print("objective=", value(prob.objective))
    return prob, numCont


def _readBlock(endMarker):
    '''
    Read stdin lines up to (not including) the line equal to endMarker.

    Every line is stripped before it is compared or stored, and blank lines
    are dropped. Reading also stops at end of input, so a missing marker
    cannot keep the loop running forever.
    '''
    collected = []
    while True:
        raw = sys.stdin.readline()
        if not raw:  # readline() returns '' only at EOF
            break
        text = raw.strip()
        if text == endMarker:
            break
        if text:
            collected.append(text)
    return collected


def _parseCandidates(entry):
    '''
    Split one "mention=====[type TAB score, ...]" input line.

    Returns (mention, list of tab-split candidate fields), or None when the
    part after "=====" is shorter than 5 characters (such entries are skipped).
    '''
    parts = entry.split("=====")
    mention = parts[0]
    body = parts[1]
    if len(body) < 5:
        return None
    # NOTE(review): the slice drops the first character and the last TWO
    # characters of the list, exactly as before. If the list ends with a
    # single "]" the final score loses its last character - confirm the
    # format produced by the sender.
    candidates = body[1:-2].split(", ")
    return mention, [candidate.split("\t") for candidate in candidates]


# ---- command-line options ----
optparser = optparse.OptionParser()
optparser.add_option(
    "-b", "--basedir", default="/var/tmp/wikia/entity-typing/data-store/",
    help="directory to model of top class prediction"
)

optparser.add_option(
    "-u", "--universe", default="asphalt|london|peel",
    help="reference universe"
)

optparser.add_option(
    "-n", "--numType", default="5",
    help="maximum number of predicted types"
)

opts = optparser.parse_args()[0]

basedir = opts.basedir
universesStr = opts.universe
numType = int(opts.numType)

start = time.time()

# Universes arrive as one "|"-separated string.
universes = universesStr.split("|")

# ---- input over stdin: disjoint pairs, then one line per mention ----
print("\t\tLoading disjointness....")

disjointContr = []
disjointSeen = set()  # mirrors disjointContr for constant-time duplicate checks

print('input-disjoint')
sys.stdout.flush()

for row in _readBlock('end-disjoint'):
    t = row.split("\t")
    pair = (t[0], t[1])
    if pair not in disjointSeen:
        disjointSeen.add(pair)
        disjointContr.append(pair)

input_ilp = _readBlock('end-ilp-input')

print('\t\tget all disjoint constraints')
sys.stdout.flush()
print('\t\tget all input for ilp')
sys.stdout.flush()


###load all candidate types
# Each input line is parsed once here and reused when the problems are built.
parsedInputs = []
allCandidateTypes = set()

for entry in input_ilp:
    parsed = _parseCandidates(entry)
    if parsed is None:
        continue
    parsedInputs.append(parsed)
    for fields in parsed[1]:
        allCandidateTypes.add(fields[0])


disjointGeneralContrFile = "general-class-disjoint"
with codecs.open(disjointGeneralContrFile, 'r', 'utf8') as handle:
    for row in handle:
        t = row.rstrip("\r\n").split("\t")
        if len(t) < 2:
            continue  # blank or single-column row
        pair = (t[0], t[1])
        if pair not in disjointSeen:
            disjointSeen.add(pair)
            disjointContr.append(pair)

print('\t\tLoading hierarchy...')
sys.stdout.flush()

hierarchyContr = [('geography', 'location')]
# Pairs are kept unique: a repeated pair would give optimization() two
# constraints with the same name.
hierarchySeen = set(hierarchyContr)

for univ in universes:
    print('\t\t' + univ)
    sys.stdout.flush()
    hierarchyContrFile = os.path.join(basedir, univ, "hierarchy")
    with codecs.open(hierarchyContrFile, 'r', 'utf8') as handle:
        for row in handle:
            # The line ending is removed first; otherwise the last column keeps
            # its newline and can never equal a candidate type.
            t = row.rstrip("\r\n").split("\t")
            if len(t) < 2:
                continue
            pair = (t[0], t[1])
            if pair in hierarchySeen:
                continue
            if t[0] in allCandidateTypes and t[1] in allCandidateTypes:
                hierarchySeen.add(pair)
                hierarchyContr.append(pair)

# print('\t\tLoading within-universe disjointness...')
# sys.stdout.flush
#
# for univ in universes:
#     print('\t\t' + univ)
#     sys.stdout.flush
#     disjointContrFile = basedir + univ + "/disjoint"
#     for line in codecs.open(disjointContrFile, 'r', 'utf8'):
#         t = line.split("\t")
#         if t[0] in allCandidateTypes and t[1] in allCandidateTypes and (t[0], t[1]) not in disjointContr:
#             disjointContr.append((t[0], t[1]))


print('\t\tloading occurrence.....')
sys.stdout.flush()
pearsonValues = {}
# Universes in this set are skipped when the pair values are computed.
big_universes = {'camphalfbloodroleplay', 'fallout', 'enmarveldatabase',
                'forgottenrealms', 'harrypotter', 'leagueoflegends', 'lego',
                'psychology', 'starwars', 'villains', 'wowwiki', 'xenoblade'}
for univ in universes:
    print('\t\t' + univ)
    sys.stdout.flush()
    if univ in big_universes:
        continue
    occurrenceFile = os.path.join(basedir, univ, "type-occurrence")
    occurrences = {}
    allTypes = []
    with codecs.open(occurrenceFile, 'r', 'utf8') as handle:
        for row in handle:
            t = row.split("\t")
            if t[0] in allCandidateTypes:
                allTypes.append(t[0])
                occurrences[t[0]] = literal_eval(t[1])

    # Every unordered pair of types, in file order.
    for i, val in enumerate(allTypes):
        for valj in allTypes[i + 1:]:
            key = val + " - " + valj
            try:
                with warnings.catch_warnings():
                    # Turn RuntimeWarning into an exception so the pair is skipped.
                    warnings.simplefilter("error", category=RuntimeWarning)
                    score, _ = p(occurrences[val], occurrences[valj])
            except (RuntimeWarning, ValueError, TypeError):
                # Only data problems are skipped; programming errors such as
                # a NameError are no longer hidden.
                continue
            pearsonValues[key] = score


end = time.time()
print("\t\tLoading constraints....")
sys.stdout.flush()
print('\t\t' + str(end - start))

# disjointContr.append(("characters", "location"))

# ---- one problem per mention ----
results = {}

numInst = 0
numContAvg = 0

for mention, candidateFields in parsedInputs:
    candidateTypes = set()
    type2score = {}
    for fields in candidateFields:
        candidateTypes.add(fields[0])
        type2score[fields[0]] = float(fields[1])
    prob, numContr = optimization(mention, candidateTypes, type2score, disjointContr, hierarchyContr, pearsonValues, True, numType, 0.5)
    results[mention] = prob
    numInst += 1
    numContAvg += numContr

print('\t\tfishing ilp computation...')
sys.stdout.flush()

# No mention solved means there is nothing to average.
avgConstraints = numContAvg / numInst if numInst else 0.0
print('\t\tAvg. of number of constraints:\t' + str(avgConstraints))

# ---- results over stdout ----
print('ilp-results')
sys.stdout.flush()
for mention, solved in results.items():
    chosen = []
    for v in solved.variables():
        # varValue is None when the solver assigned no value to the variable.
        # Names containing "___" are excluded; presumably these are the pair
        # variables, whose " - " PuLP rewrites to underscores - verify.
        # NOTE(review): v.name is PuLP's sanitised name, which can differ from
        # the input type name - confirm the consumer expects that.
        if v.varValue is not None and v.varValue >= 0.5 and "___" not in v.name:
            chosen.append(v.name + "\t" + str(v.varValue))
    print(mention + "=====[" + ", ".join(chosen) + "]")
    sys.stdout.flush()
print('end-ilp')
sys.stdout.flush()

end2 = time.time()
print("ILP solver...")
print(end2 - end)
	'''
	Created on Jul 19, 2019

	@author: cxchu
	'''

	'''
	ILP for each mention
	- mention: string
	- candidateTypes: set
	- type2score, type2id, id2type: dictionary - like map
	- disjointContr: list of pairs
	- hierarchicalContr: list of pairs, each item is parent - child
	- isLimit: boolean
	- numLimit: integer
	'''

	import codecs
	from ast import literal_eval
	from scipy.stats import pearsonr as p
	import time
	from pulp import *
	import optparse
	import sys

	def optimization(mention, candidateTypes, type2score, disjointContr, hierarchicalContr, pearsonValues, isLimit, numLimit, alpha):
	    '''
	    Build and solve the type-consolidation problem for a single mention.

	    One variable T[t] in [0, 1] (continuous) is created per key of type2score
	    and one binary variable Y[key] per entry of pearsonValues whose two types
	    are both candidates. The maximised objective is
	        alpha * sum(T[t] * score) + (1 - alpha) * sum(Y[key] * value)

	    - mention: string, appended to the problem name
	    - candidateTypes: set of type names considered for this mention; every
	      member referenced by a constraint must also be a key of type2score
	    - type2score: dictionary, type name -> numeric score
	    - disjointContr: pairs (t1, t2), each giving T[t1] + T[t2] <= 1
	    - hierarchicalContr: pairs (parent, child), each giving T[child] <= T[parent]
	    - pearsonValues: dictionary, "t1 - t2" -> numeric value; every active pair
	      gives T[t1] + T[t2] <= 1 + Y[key]
	    - isLimit: when true, the sum of all T variables is bounded by numLimit
	    - numLimit: integer bound used when isLimit is true
	    - alpha: weight of the type scores; pair values are weighted by 1 - alpha

	    Returns (prob, numCont): the solved LpProblem and the number of
	    constraints that were added to it.
	    '''
	    prob = LpProblem("consolidation" + mention, LpMaximize)

	    # One relaxed indicator per scored type.
	    T = {typeName: LpVariable(typeName, 0, 1, LpContinuous) for typeName in type2score}

	    # Parse the "t1 - t2" keys once and keep only the pairs whose two types
	    # are both candidates; variables, objective and constraints all reuse it.
	    activePairs = []
	    for key, value in pearsonValues.items():
	        types = key.split(" - ")
	        t1, t2 = types[0], types[1]
	        if t1 in candidateTypes and t2 in candidateTypes:
	            activePairs.append((key, t1, t2, value))

	    Y = {key: LpVariable(key, 0, 1, LpBinary) for key, _, _, _ in activePairs}

	    # Objective. lpSum gives an empty expression for empty input, so neither
	    # term is ever None (the previous None accumulators broke on "*= alpha"
	    # when there were no scores).
	    scoreTerm = lpSum([T[typeName] * score for typeName, score in type2score.items()]) * alpha
	    pairTerm = lpSum([(1 - alpha) * Y[key] * value for key, _, _, value in activePairs])
	    prob += scoreTerm + pairTerm, "obj"

	    # total number of constraints
	    numCont = 0

	    # Upper bound on the (relaxed) number of selected types.
	    if isLimit:
	        prob += lpSum(list(T.values())) <= numLimit, "c1"
	        numCont += 1

	    # Disjointness: two disjoint types cannot both be selected.
	    # A repeated pair would reuse the constraint name, which PuLP rejects,
	    # so each pair is added only once.
	    seenDisjoint = set()
	    for t1, t2 in disjointContr:
	        if (t1, t2) in seenDisjoint:
	            continue
	        if t1 in candidateTypes and t2 in candidateTypes:
	            seenDisjoint.add((t1, t2))
	            prob += T[t1] + T[t2] <= 1, "c" + t1 + "-" + t2
	            numCont += 1

	    # Hierarchy: a child type can only be selected as far as its parent is.
	    seenHierarchy = set()
	    for parent, child in hierarchicalContr:
	        if (parent, child) in seenHierarchy:
	            continue
	        if parent in candidateTypes and child in candidateTypes:
	            seenHierarchy.add((parent, child))
	            prob += T[child] - T[parent] <= 0, "cp" + parent + "-c" + child
	            numCont += 1

	    # Pair constraints: Y[key] has to be 1 once both of its types are selected.
	    for key, t1, t2, _ in activePairs:
	        prob += T[t1] + T[t2] <= 1 + Y[key], "pearson" + t1 + "--" + t2
	        numCont += 1

	    # Solve with PuLP's default solver. Another solver object can be passed
	    # instead, e.g. prob.solve(GLPK(msg=0)).
	    prob.solve()

	    # Debugging aids:
	    #   print("Status:", LpStatus[prob.status])
	    #   for v in prob.variables(): print(v.name, "=", v.varValue)
	    #   print("objective=", value(prob.objective))
	    return prob, numCont


	def _readBlock(endMarker):
	    '''
	    Read stdin lines up to (not including) the line equal to endMarker.

	    Every line is stripped before it is compared or stored, and blank lines
	    are dropped. Reading also stops at end of input, so a missing marker
	    cannot keep the loop running forever.
	    '''
	    collected = []
	    while True:
	        raw = sys.stdin.readline()
	        if not raw:  # readline() returns '' only at EOF
	            break
	        text = raw.strip()
	        if text == endMarker:
	            break
	        if text:
	            collected.append(text)
	    return collected


	def _parseCandidates(entry):
	    '''
	    Split one "mention=====[type TAB score, ...]" input line.

	    Returns (mention, list of tab-split candidate fields), or None when the
	    part after "=====" is shorter than 5 characters (such entries are skipped).
	    '''
	    parts = entry.split("=====")
	    mention = parts[0]
	    body = parts[1]
	    if len(body) < 5:
	        return None
	    # NOTE(review): the slice drops the first character and the last TWO
	    # characters of the list, exactly as before. If the list ends with a
	    # single "]" the final score loses its last character - confirm the
	    # format produced by the sender.
	    candidates = body[1:-2].split(", ")
	    return mention, [candidate.split("\t") for candidate in candidates]


	# ---- command-line options ----
	optparser = optparse.OptionParser()
	optparser.add_option(
	    "-b", "--basedir", default="/var/tmp/wikia/entity-typing/data-store/",
	    help="directory to model of top class prediction"
	)

	optparser.add_option(
	    "-u", "--universe", default="asphalt|london|peel",
	    help="reference universe"
	)

	optparser.add_option(
	    "-n", "--numType", default="5",
	    help="maximum number of predicted types"
	)

	opts = optparser.parse_args()[0]

	basedir = opts.basedir
	universesStr = opts.universe
	numType = int(opts.numType)

	start = time.time()

	# Universes arrive as one "|"-separated string (the separator is a plain
	# pipe, not the backslash-escaped form).
	universes = universesStr.split("|")

	# ---- input over stdin: disjoint pairs, then one line per mention ----
	print("\t\tLoading disjointness....")

	disjointContr = []
	disjointSeen = set()  # mirrors disjointContr for constant-time duplicate checks

	print('input-disjoint')
	sys.stdout.flush()

	for row in _readBlock('end-disjoint'):
	    t = row.split("\t")
	    pair = (t[0], t[1])
	    if pair not in disjointSeen:
	        disjointSeen.add(pair)
	        disjointContr.append(pair)

	input_ilp = _readBlock('end-ilp-input')

	print('\t\tget all disjoint constraints')
	sys.stdout.flush()
	print('\t\tget all input for ilp')
	sys.stdout.flush()


	###load all candidate types
	# Each input line is parsed once here and reused when the problems are built.
	parsedInputs = []
	allCandidateTypes = set()

	for entry in input_ilp:
	    parsed = _parseCandidates(entry)
	    if parsed is None:
	        continue
	    parsedInputs.append(parsed)
	    for fields in parsed[1]:
	        allCandidateTypes.add(fields[0])


	disjointGeneralContrFile = "general-class-disjoint"
	with codecs.open(disjointGeneralContrFile, 'r', 'utf8') as handle:
	    for row in handle:
	        t = row.rstrip("\r\n").split("\t")
	        if len(t) < 2:
	            continue  # blank or single-column row
	        pair = (t[0], t[1])
	        if pair not in disjointSeen:
	            disjointSeen.add(pair)
	            disjointContr.append(pair)

	print('\t\tLoading hierarchy...')
	sys.stdout.flush()

	hierarchyContr = [('geography', 'location')]
	# Pairs are kept unique: a repeated pair would give optimization() two
	# constraints with the same name.
	hierarchySeen = set(hierarchyContr)

	for univ in universes:
	    print('\t\t' + univ)
	    sys.stdout.flush()
	    hierarchyContrFile = os.path.join(basedir, univ, "hierarchy")
	    with codecs.open(hierarchyContrFile, 'r', 'utf8') as handle:
	        for row in handle:
	            # The line ending is removed first; otherwise the last column keeps
	            # its newline and can never equal a candidate type.
	            t = row.rstrip("\r\n").split("\t")
	            if len(t) < 2:
	                continue
	            pair = (t[0], t[1])
	            if pair in hierarchySeen:
	                continue
	            if t[0] in allCandidateTypes and t[1] in allCandidateTypes:
	                hierarchySeen.add(pair)
	                hierarchyContr.append(pair)

	# print('\t\tLoading within-universe disjointness...')
	# sys.stdout.flush
	#
	# for univ in universes:
	#     print('\t\t' + univ)
	#     sys.stdout.flush
	#     disjointContrFile = basedir + univ + "/disjoint"
	#     for line in codecs.open(disjointContrFile, 'r', 'utf8'):
	#         t = line.split("\t")
	#         if t[0] in allCandidateTypes and t[1] in allCandidateTypes and (t[0], t[1]) not in disjointContr:
	#             disjointContr.append((t[0], t[1]))


	print('\t\tloading occurrence.....')
	sys.stdout.flush()
	pearsonValues = {}
	# Universes in this set are skipped when the pair values are computed.
	big_universes = {'camphalfbloodroleplay', 'fallout', 'enmarveldatabase',
	                'forgottenrealms', 'harrypotter', 'leagueoflegends', 'lego',
	                'psychology', 'starwars', 'villains', 'wowwiki', 'xenoblade'}
	for univ in universes:
	    print('\t\t' + univ)
	    sys.stdout.flush()
	    if univ in big_universes:
	        continue
	    occurrenceFile = os.path.join(basedir, univ, "type-occurrence")
	    occurrences = {}
	    allTypes = []
	    with codecs.open(occurrenceFile, 'r', 'utf8') as handle:
	        for row in handle:
	            t = row.split("\t")
	            if t[0] in allCandidateTypes:
	                allTypes.append(t[0])
	                occurrences[t[0]] = literal_eval(t[1])

	    # Every unordered pair of types, in file order.
	    for i, val in enumerate(allTypes):
	        for valj in allTypes[i + 1:]:
	            key = val + " - " + valj
	            try:
	                with warnings.catch_warnings():
	                    # Turn RuntimeWarning into an exception so the pair is skipped.
	                    warnings.simplefilter("error", category=RuntimeWarning)
	                    score, _ = p(occurrences[val], occurrences[valj])
	            except (RuntimeWarning, ValueError, TypeError):
	                # Only data problems are skipped; programming errors such as
	                # a NameError are no longer hidden.
	                continue
	            pearsonValues[key] = score


	end = time.time()
	print("\t\tLoading constraints....")
	sys.stdout.flush()
	print('\t\t' + str(end - start))

	# disjointContr.append(("characters", "location"))

	# ---- one problem per mention ----
	results = {}

	numInst = 0
	numContAvg = 0

	for mention, candidateFields in parsedInputs:
	    candidateTypes = set()
	    type2score = {}
	    for fields in candidateFields:
	        candidateTypes.add(fields[0])
	        type2score[fields[0]] = float(fields[1])
	    prob, numContr = optimization(mention, candidateTypes, type2score, disjointContr, hierarchyContr, pearsonValues, True, numType, 0.5)
	    results[mention] = prob
	    numInst += 1
	    numContAvg += numContr

	print('\t\tfishing ilp computation...')
	sys.stdout.flush()

	# No mention solved means there is nothing to average.
	avgConstraints = numContAvg / numInst if numInst else 0.0
	print('\t\tAvg. of number of constraints:\t' + str(avgConstraints))

	# ---- results over stdout ----
	print('ilp-results')
	sys.stdout.flush()
	for mention, solved in results.items():
	    chosen = []
	    for v in solved.variables():
	        # varValue is None when the solver assigned no value to the variable.
	        # Names containing "___" are excluded; presumably these are the pair
	        # variables, whose " - " PuLP rewrites to underscores - verify.
	        # NOTE(review): v.name is PuLP's sanitised name, which can differ from
	        # the input type name - confirm the consumer expects that.
	        if v.varValue is not None and v.varValue >= 0.5 and "___" not in v.name:
	            chosen.append(v.name + "\t" + str(v.varValue))
	    print(mention + "=====[" + ", ".join(chosen) + "]")
	    sys.stdout.flush()
	print('end-ilp')
	sys.stdout.flush()

	end2 = time.time()
	print("ILP solver...")
	print(end2 - end)