Skip to content
Permalink
a077a7f39f
Switch branches/tags

Name already in use

A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
Go to file
 
 
Cannot retrieve contributors at this time
314 lines (258 sloc) 8.88 KB
'''
Created on Jul 19, 2019
@author: cxchu
'''
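'''
ILP for each mention (arguments of optimization() below)
- mention: string
- candidateTypes: set
- type2score: dictionary - like map, from type to score (type2id and id2type are not used by the code below)
- disjointContr: list of pairs
- hierarchicalContr: list of pairs, each item is parent - child
- pearsonValues: dictionary - like map, key is "type1 - type2", value is the Pearson score of the pair
- isLimit: boolean
- numLimit: integer
- alpha: weight of the type-score term in the objective; the pair term is weighted by 1 - alpha
'''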
import codecs
import optparse
import sys
import time
import warnings
from ast import literal_eval

from pulp import *
from scipy.stats import pearsonr as p
def optimization(mention, candidateTypes, type2score, disjointContr, hierarchicalContr, pearsonValues, isLimit, numLimit, alpha):
    '''
    Build and solve the consolidation problem for one mention.

    Parameters:
    - mention: string, appended to "consolidation" to name the problem
    - candidateTypes: set of type names; only constraints and pairs whose two
      types are both in this set are used. Every candidate is expected to be
      a key of type2score, because the selection variables are created from
      type2score.
    - type2score: dictionary from type name to score
    - disjointContr: list of (type, type) pairs that must not both be chosen
    - hierarchicalContr: list of (parent, child) pairs
    - pearsonValues: dictionary keyed by "type1 - type2" holding the score of
      that pair
    - isLimit: when true, the sum of the selection variables is capped
    - numLimit: the cap used when isLimit is true
    - alpha: weight of the type-score term; the pair term gets (1 - alpha)

    Returns (prob, numCont): the solved LpProblem and the number of
    constraints that were added to it.
    '''
    prob = LpProblem("consolidation" + mention, LpMaximize)

    # variables: one selection variable per scored type, continuous in [0, 1]
    T = {}
    for key in type2score:
        T[key] = LpVariable(key, 0, 1, LpContinuous)

    # Split every pair key once and keep only pairs whose two types are both
    # candidates; each kept pair gets a binary variable named after the key.
    activePairs = []
    for key, v in pearsonValues.items():
        types = key.split(" - ")
        t1 = types[0]
        t2 = types[1]
        if t1 in candidateTypes and t2 in candidateTypes:
            activePairs.append((key, t1, t2, v))
    Y = {}
    for key, _, _, _ in activePairs:
        Y[key] = LpVariable(key, 0, 1, LpBinary)

    # obj function: alpha * sum(score * T) + (1 - alpha) * sum(pearson * Y).
    # lpSum yields a valid (possibly empty) expression, so empty inputs no
    # longer end up multiplying None by alpha.
    scoreTerm = lpSum([T[key] * v for key, v in type2score.items()])
    pairTerm = lpSum([(1 - alpha) * Y[key] * v for key, _, _, v in activePairs])
    prob += scoreTerm * alpha + pairTerm, "obj"

    # total number of constraints
    numCont = 0

    # constraints
    # cap on the number of selected types (skipped when there is no variable)
    if isLimit and T:
        prob += lpSum(list(T.values())) <= numLimit, "c1"
        numCont += 1

    # disjointness: at most one type of each pair. A repeated pair would
    # produce a repeated constraint name, so it is added only once.
    seenDisjoint = set()
    for t1, t2 in disjointContr:
        if t1 in candidateTypes and t2 in candidateTypes and (t1, t2) not in seenDisjoint:
            seenDisjoint.add((t1, t2))
            contr = "c" + t1 + "-" + t2
            prob += T[t1] + T[t2] <= 1, contr
            numCont += 1

    # hierarchical: the child variable may not exceed the parent variable
    seenHierarchy = set()
    for parent, child in hierarchicalContr:
        if parent in candidateTypes and child in candidateTypes and (parent, child) not in seenHierarchy:
            seenHierarchy.add((parent, child))
            contr = "cp" + parent + "-c" + child
            prob += T[child] - T[parent] <= 0, contr
            numCont += 1

    # pearson constraints: Y >= T[t1] + T[t2] - 1, a linear stand-in for the
    # product T[t1] * T[t2]. Only this lower bound is imposed; nothing here
    # forces Y down to 0 when one of the two types is not selected.
    for key, t1, t2, _ in activePairs:
        contr = "pearson" + t1 + "--" + t2
        prob += T[t1] + T[t2] <= 1 + Y[key], contr
        numCont += 1

    # Solve the problem using the default solver.
    # Use prob.solve(GLPK()) instead to choose GLPK as the solver
    # (GLPK(msg = 0) suppresses its messages).
    prob.solve()
    return prob, numCont
# Command line: data directory, "|"-separated universes, and type limit.
optparser = optparse.OptionParser()
for shortFlag, longFlag, fallback, description in (
    ("-b", "--basedir", "/var/tmp/wikia/entity-typing/data-store/",
     "directory to model of top class prediction"),
    ("-u", "--universe", "asphalt|london|peel",
     "reference universe"),
    ("-n", "--numType", "5",
     "maximum number of predicted types"),
):
    optparser.add_option(shortFlag, longFlag, default=fallback, help=description)

# parse_args() returns (options, positional arguments); only options are used.
opts, _positional = optparser.parse_args()
basedir = opts.basedir
universesStr = opts.universe
# The default is given as a string, so the value is always converted here.
numType = int(opts.numType)

# Reference point for the loading time reported further down.
start = time.time()
universes = universesStr.split("|")
def _read_section(stream, terminator):
    '''Yield stripped lines from stream until terminator or end of input.'''
    while True:
        raw = stream.readline()
        if not raw:
            # readline() returns '' only at end of input; stop here instead
            # of looping forever on empty strings.
            return
        text = raw.strip()
        if text == terminator:
            return
        yield text


print("\t\tLoading disjointness....")
disjointContr = []
# NOTE(review): this marker looks like a handshake telling whatever feeds
# stdin to start sending the disjoint pairs - confirm against the caller.
print('input-disjoint')
sys.stdout.flush()

# First section: tab-separated disjoint pairs, terminated by 'end-disjoint'.
# A set mirrors the list so that duplicate detection stays O(1) per line.
seenDisjoint = set()
for text in _read_section(sys.stdin, 'end-disjoint'):
    t = text.split("\t")
    if len(t) < 2:
        # blank or malformed line: nothing to pair up
        continue
    pair = (t[0], t[1])
    if pair not in seenDisjoint:
        seenDisjoint.add(pair)
        disjointContr.append(pair)

# Second section: one "mention=====[...]" record per line, terminated by
# 'end-ilp-input'. Blank lines carry no record and are dropped.
input_ilp = [text for text in _read_section(sys.stdin, 'end-ilp-input') if text]

print('\t\tget all disjoint constraints')
sys.stdout.flush()
print('\t\tget all input for ilp')
sys.stdout.flush()
### collect every type name that appears in any ILP input record
allCandidateTypes = set()
for record in input_ilp:
    # record layout: <mention>=====<candidate list>
    fields = record.split("=====")
    payload = fields[1]
    if len(payload) < 5:
        # too short to hold a candidate
        continue
    # drop the first character and the last two characters of the payload,
    # then split the remainder into "type<TAB>score" entries
    inner = payload[1:len(payload) - 2]
    allCandidateTypes.update(entry.split("\t")[0] for entry in inner.split(", "))
# Extend disjointContr with the pairs from the general disjointness file
# (tab-separated, one pair per line). The path is relative, so the file is
# looked up in the current working directory.
disjointGeneralContrFile = "general-class-disjoint"
# Set copy of the pairs collected so far, for O(1) duplicate checks.
knownDisjoint = set(disjointContr)
with codecs.open(disjointGeneralContrFile, 'r', 'utf8') as generalDisjointIn:
    for line in generalDisjointIn:
        line = line.replace("\n", "")
        t = line.split("\t")
        if len(t) < 2:
            # blank or malformed line
            continue
        pair = (t[0], t[1])
        if pair not in knownDisjoint:
            knownDisjoint.add(pair)
            disjointContr.append(pair)
# Load (parent, child) pairs from <basedir><universe>/hierarchy for every
# universe, keeping only pairs whose two types are both candidate types.
print('\t\tLoading hierarchy...')
sys.stdout.flush()
hierarchyContr = []
hierarchyContr.append(('geography', 'location'))
# The same pair may be listed by several universes; each pair is kept once
# because a repeated pair would yield a repeated constraint name later on.
seenHierarchy = set(hierarchyContr)
for univ in universes:
    print('\t\t' + univ)
    sys.stdout.flush()
    hierarchyContrFile = basedir + univ + "/hierarchy"
    with codecs.open(hierarchyContrFile, 'r', 'utf8') as hierarchyIn:
        for line in hierarchyIn:
            # Strip the line ending first: otherwise the last field keeps its
            # newline and can never match a candidate type.
            t = line.rstrip("\r\n").split("\t")
            if len(t) < 2:
                continue
            pair = (t[0], t[1])
            if t[0] in allCandidateTypes and t[1] in allCandidateTypes and pair not in seenHierarchy:
                seenHierarchy.add(pair)
                hierarchyContr.append(pair)
# print('\t\tLoading within-universe disjointness...')
# sys.stdout.flush
#
# for univ in universes:
#     print('\t\t' + univ)
#     sys.stdout.flush
#     disjointContrFile = basedir + univ + "/disjoint"
#     for line in codecs.open(disjointContrFile, 'r', 'utf8'):
#         t = line.split("\t")
#         if t[0] in allCandidateTypes and t[1] in allCandidateTypes and (t[0], t[1]) not in disjointContr:
#             disjointContr.append((t[0], t[1]))
# Compute the Pearson score of every pair of candidate types, per universe,
# from <basedir><universe>/type-occurrence ("type<TAB>literal" per line).
print('\t\tloading occurrence.....')
sys.stdout.flush()
pearsonValues = {}
# Universes listed here are announced but not processed.
# NOTE(review): presumably skipped because of their size - confirm.
big_universes = {'camphalfbloodroleplay', 'fallout', 'enmarveldatabase',
                 'forgottenrealms', 'harrypotter', 'leagueoflegends', 'lego',
                 'psychology', 'starwars', 'villains', 'wowwiki', 'xenoblade'}
for univ in universes:
    print('\t\t' + univ)
    sys.stdout.flush()
    if univ in big_universes:
        continue
    occurrenceFile = basedir + univ + "/type-occurrence"
    occurrences = {}
    allTypes = []
    with codecs.open(occurrenceFile, 'r', 'utf8') as occurrenceIn:
        for line in occurrenceIn:
            t = line.split("\t")
            if len(t) < 2 or t[0] not in allCandidateTypes:
                continue
            allTypes.append(t[0])
            occurrences[t[0]] = literal_eval(t[1])
    # Every unordered pair once: each type against the types listed after it.
    for i, val in enumerate(allTypes):
        for valj in allTypes[i + 1:]:
            key = val + " - " + valj
            try:
                # Turn RuntimeWarning into an exception so that a pair whose
                # score triggers such a warning is skipped, not stored.
                with warnings.catch_warnings():
                    warnings.simplefilter("error", category=RuntimeWarning)
                    score, _ = p(occurrences[val], occurrences[valj])
            except (RuntimeWarning, ValueError):
                # Only the expected failures are skipped; anything else
                # (for example a missing name) must surface instead of
                # silently emptying pearsonValues.
                continue
            pearsonValues[key] = score
            # print(key + "\t" + str(score))
# Report how long the loading phase took (seconds since `start`).
end = time.time()
print("\t\tLoading constraints....")
sys.stdout.flush()
print('\t\t' + str(end - start))
# disjointContr.append(("characters", "location"))
# Solve one problem per input record and keep the solved problem per mention.
results = {}
numInst = 0
numContAvg = 0
for line in input_ilp:
    # record layout: <mention>=====<candidate list>
    lines = line.split("=====")
    if len(lines) < 2 or len(lines[1]) < 5:
        # no separator, or a candidate list too short to hold an entry
        continue
    mention = lines[0]
    payload = lines[1]
    if payload.startswith("[") and payload.endswith("]"):
        # Records are stored stripped, so the list ends right at "]".
        # Removing only the brackets keeps the last score intact.
        inner = payload[1:-1]
    else:
        # Any other shape keeps the previous slicing.
        inner = payload[1:(len(payload) - 2)]
    candidateTypes = set()
    type2score = {}
    for entry in inner.split(", "):
        # entry layout: <type><TAB><score>
        t2s = entry.split("\t")
        candidateTypes.add(t2s[0])
        type2score[t2s[0]] = float(t2s[1])
    prob, numContr = optimization(mention, candidateTypes, type2score, disjointContr, hierarchyContr, pearsonValues, True, numType, 0.5)
    results[mention] = prob
    numInst += 1
    numContAvg += numContr
print('\t\tfishing ilp computation...')
sys.stdout.flush()
# Guard the average: there may be no solved instance at all.
avgNumCont = numContAvg / numInst if numInst else 0
print('\t\tAvg. of number of constraints:\t' + str(avgNumCont))
# Emit one "mention=====[name<TAB>value, ...]" line per solved mention,
# framed by the 'ilp-results' and 'end-ilp' markers.
print('ilp-results')
sys.stdout.flush()
for mention, solved in results.items():
    res = []
    for v in solved.variables():
        if v.varValue is None:
            # no value assigned to this variable; nothing to compare
            continue
        # Names containing "___" are left out.
        # NOTE(review): presumably these are the "t1 - t2" pair variables
        # after the solver library rewrote the key - confirm.
        if v.varValue >= 0.5 and "___" not in v.name:
            res.append(v.name + "\t" + str(v.varValue))
    print(mention + "=====[" + ", ".join(res) + "]")
    sys.stdout.flush()
print('end-ilp')
sys.stdout.flush()
# Time spent since the end of the loading phase.
end2 = time.time()
print("ILP solver...")
print(end2 - end)