Permalink
Cannot retrieve contributors at this time
Name already in use
A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
ENTYFI/typeConsolidation/TypeConsolidation.py
Go to fileThis commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
314 lines (258 sloc)
8.88 KB
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
''' | |
Created on Jul 19, 2019 | |
@author: cxchu | |
''' | |
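'''
ILP for each mention (arguments of optimization())
- mention: string
- candidateTypes: set
- type2score: dictionary - like map from type to score
  (type2id / id2type are not used by the current implementation)
- disjointContr: list of pairs
- hierarchicalContr: list of pairs, each item is parent - child
- pearsonValues: dictionary - like map, keyed by "type1 - type2"
- isLimit: boolean
- numLimit: integer
- alpha: weight of the type-score term; (1 - alpha) weights the pearson term
'''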
import codecs
import optparse
import sys
import time
import warnings
from ast import literal_eval

from pulp import *
from scipy.stats import pearsonr as p
def optimization(mention, candidateTypes, type2score, disjointContr, hierarchicalContr, pearsonValues, isLimit, numLimit, alpha):
    """Build and solve the type-consolidation program for one mention.

    Args:
        mention: string appended to the problem name.
        candidateTypes: set of type names considered for this mention.
        type2score: mapping type name -> score; one selection variable is
            created per key.
        disjointContr: iterable of (t1, t2) pairs that may not both be
            selected.
        hierarchicalContr: iterable of (parent, child) pairs; the child's
            variable may not exceed the parent's.
        pearsonValues: mapping "t1 - t2" -> correlation value. Only pairs
            whose two types are both in candidateTypes are used.
        isLimit: when truthy, cap the sum of selection variables at numLimit.
        numLimit: the cap used when isLimit is truthy.
        alpha: weight of the score term; the pairwise term is weighted by
            (1 - alpha).

    Returns:
        (prob, numCont): the solved LpProblem and the number of constraints
        that were added to it.
    """
    prob = LpProblem("consolidation" + mention, LpMaximize)

    # One continuous selection variable in [0, 1] per scored type.
    T = {key: LpVariable(key, 0, 1, LpContinuous) for key in type2score}

    # Resolve the relevant correlation pairs once instead of re-splitting and
    # re-filtering pearsonValues for variables, objective and constraints.
    pairs = []
    for key, v in pearsonValues.items():
        types = key.split(" - ")
        t1 = types[0]
        t2 = types[1]
        if t1 in candidateTypes and t2 in candidateTypes:
            pairs.append((key, t1, t2, v))

    # One binary variable per relevant pair.
    Y = {key: LpVariable(key, 0, 1, LpBinary) for key, _, _, _ in pairs}

    # Objective: alpha * sum(score * T) + sum((1 - alpha) * corr * Y).
    # lpSum yields an empty expression for empty input, so an empty
    # type2score no longer fails on `None *= alpha`.
    obj = lpSum([T[key] * v for key, v in type2score.items()]) * alpha
    obj += lpSum([(1 - alpha) * Y[key] * v for key, _, _, v in pairs])
    prob += obj, "obj"

    # Total number of constraints added below.
    numCont = 0

    # Optional cap on the (relaxed) number of selected types.
    if isLimit:
        prob += lpSum(list(T.values())) <= numLimit, "c1"
        numCont += 1

    # Disjointness: two disjoint types cannot both be fully selected.
    for t1, t2 in disjointContr:
        if t1 in candidateTypes and t2 in candidateTypes:
            prob += T[t1] + T[t2] <= 1, "c" + t1 + "-" + t2
            numCont += 1

    # Hierarchy: a child's value may not exceed its parent's.
    for parent, child in hierarchicalContr:
        if parent in candidateTypes and child in candidateTypes:
            prob += T[child] - T[parent] <= 0, "cp" + parent + "-c" + child
            numCont += 1

    # Pair linking: T[t1] + T[t2] may exceed 1 only when Y[pair] is 1.
    for key, t1, t2, _ in pairs:
        prob += T[t1] + T[t2] <= 1 + Y[key], "pearson" + t1 + "--" + t2
        numCont += 1

    # Solve with PuLP's default solver. Pass a solver object to solve()
    # (e.g. GLPK(msg=0)) to select a different backend.
    prob.solve()

    return prob, numCont
# Command-line interface: every option is a string with a default value.
optparser = optparse.OptionParser()
for _short, _long, _default, _help in (
    ("-b", "--basedir", "/var/tmp/wikia/entity-typing/data-store/",
     "directory to model of top class prediction"),
    ("-u", "--universe", "asphalt|london|peel",
     "reference universe"),
    ("-n", "--numType", "5",
     "maximum number of predicted types"),
):
    optparser.add_option(_short, _long, default=_default, help=_help)

# Positional arguments are ignored; only the option values are used.
opts, _positional = optparser.parse_args()
basedir, universesStr = opts.basedir, opts.universe
numType = int(opts.numType)

# Reference point for the loading-time report printed later.
start = time.time()

# Universes are passed as a single "|"-separated string.
universes = universesStr.split("|")
def _read_until(stream, sentinel):
    """Return the stripped lines of *stream* that precede *sentinel*.

    Every line is stripped before it is compared with the sentinel, so the
    sentinel is recognised even when it is the very first line. Reading also
    stops at end-of-file, so a closed pipe cannot cause an endless loop.
    """
    collected = []
    while True:
        raw = stream.readline()
        if not raw:  # readline() returns '' only at EOF
            break
        text = raw.strip()
        if text == sentinel:
            break
        collected.append(text)
    return collected


print("\t\tLoading disjointness....")
disjointContr = []
# Announce on stdout that the disjointness pairs can now be sent on stdin.
print('input-disjoint')
sys.stdout.flush()

# Tab-separated pairs, one per line, terminated by 'end-disjoint'.
# A side set keeps the de-duplication O(1) while the list keeps input order.
_seen_pairs = set()
for _text in _read_until(sys.stdin, 'end-disjoint'):
    _fields = _text.split("\t")
    if len(_fields) < 2:
        # Blank or malformed line: nothing to record.
        continue
    _pair = (_fields[0], _fields[1])
    if _pair not in _seen_pairs:
        _seen_pairs.add(_pair)
        disjointContr.append(_pair)

# Raw ILP input lines, terminated by 'end-ilp-input'.
input_ilp = _read_until(sys.stdin, 'end-ilp-input')

print('\t\tget all disjoint constraints')
sys.stdout.flush()
print('\t\tget all input for ilp')
sys.stdout.flush()
### Collect every candidate type name that occurs in any ILP input line.
allCandidateTypes = set()
for _record in input_ilp:
    # Each record is "<mention>=====<candidate list>".
    _parts = _record.split("=====")
    _payload = _parts[1]
    if len(_payload) < 5:
        # Too short to hold a candidate entry.
        continue
    # Drop the first character and the last two characters of the payload
    # (presumably list delimiters -- confirm against the producer), then
    # split into "<type>\t<score>" entries.
    _entries = _payload[1:len(_payload) - 2].split(", ")
    allCandidateTypes.update(_entry.split("\t")[0] for _entry in _entries)
# Universe-independent disjointness pairs: one tab-separated pair per line,
# read from a file in the current working directory.
disjointGeneralContrFile = "general-class-disjoint"
# Mirror of the pairs already known, so the duplicate check is O(1).
_known_disjoint = set(disjointContr)
# The with-block guarantees the file handle is closed.
with codecs.open(disjointGeneralContrFile, 'r', 'utf8') as _general_file:
    for line in _general_file:
        # codecs.open does no newline translation, so strip "\r" as well.
        t = line.rstrip("\r\n").split("\t")
        if len(t) < 2:
            # Blank or malformed line: nothing to record.
            continue
        _pair = (t[0], t[1])
        if _pair not in _known_disjoint:
            _known_disjoint.add(_pair)
            disjointContr.append(_pair)
print('\t\tLoading hierarchy...')
sys.stdout.flush()  # was `sys.stdout.flush` (no call), which never flushed

# (parent, child) pairs; the first one is hard-coded.
hierarchyContr = []
hierarchyContr.append(('geography', 'location'))
# Constraint names are derived from the pair, so the same pair must not be
# recorded twice (e.g. when it occurs in several universes).
_known_hierarchy = set(hierarchyContr)

for univ in universes:
    print('\t\t' + univ)
    sys.stdout.flush()
    hierarchyContrFile = basedir + univ + "/hierarchy"
    # The with-block guarantees the file handle is closed.
    with codecs.open(hierarchyContrFile, 'r', 'utf8') as _hierarchy_file:
        for line in _hierarchy_file:
            # Strip the line ending first: otherwise the last column keeps
            # its newline and can never match a candidate type name.
            t = line.rstrip("\r\n").split("\t")
            if len(t) < 2:
                # Blank or malformed line: nothing to record.
                continue
            _pair = (t[0], t[1])
            if (t[0] in allCandidateTypes and t[1] in allCandidateTypes
                    and _pair not in _known_hierarchy):
                _known_hierarchy.add(_pair)
                hierarchyContr.append(_pair)
# print('\t\tLoading within-universe disjointness...') | |
# sys.stdout.flush | |
# | |
# for univ in universes: | |
# print('\t\t' + univ) | |
# sys.stdout.flush | |
# disjointContrFile = basedir + univ + "/disjoint" | |
# for line in codecs.open(disjointContrFile, 'r', 'utf8'): | |
# t = line.split("\t") | |
# if t[0] in allCandidateTypes and t[1] in allCandidateTypes and (t[0], t[1]) not in disjointContr: | |
# disjointContr.append((t[0], t[1])) | |
print('\t\tloading occurrence.....')
sys.stdout.flush()

# "typeA - typeB" -> Pearson correlation of the two types' occurrence data.
pearsonValues = {}
# Universes that are skipped in the correlation computation below.
big_universes = {'camphalfbloodroleplay', 'fallout', 'enmarveldatabase',
                 'forgottenrealms', 'harrypotter', 'leagueoflegends', 'lego',
                 'psychology', 'starwars', 'villains', 'wowwiki', 'xenoblade'}

for univ in universes:
    print('\t\t' + univ)
    sys.stdout.flush()
    if univ in big_universes:
        continue

    occurrenceFile = basedir + univ + "/type-occurrence"
    occurrences = {}
    allTypes = []
    # Each line is "<type>\t<python literal>"; only candidate types are kept.
    # The with-block guarantees the file handle is closed.
    with codecs.open(occurrenceFile, 'r', 'utf8') as _occurrence_file:
        for line in _occurrence_file:
            t = line.split("\t")
            if t[0] in allCandidateTypes:
                allTypes.append(t[0])
                occurrences[t[0]] = literal_eval(t[1])

    # Turn RuntimeWarnings into exceptions while correlating, so a pair whose
    # computation warns is skipped instead of being stored.
    with warnings.catch_warnings():
        warnings.simplefilter("error", category=RuntimeWarning)
        for i, val in enumerate(allTypes):
            # Every unordered pair once, in file order.
            for valj in allTypes[i + 1:]:
                key = val + " - " + valj
                try:
                    score, _ = p(occurrences[val], occurrences[valj])
                except Exception:
                    # Best effort: skip pairs that cannot be correlated, but
                    # let KeyboardInterrupt / SystemExit propagate.
                    continue
                pearsonValues[key] = score
# Report how long loading took; `end` also marks the start of the ILP phase
# for the final timing report.
end = time.time()
print("\t\tLoading constraints....")
sys.stdout.flush()  # was `sys.stdout.flush` (no call), which never flushed
print('\t\t' + str(end - start))
# disjointContr.append(("characters", "location")) | |
# mention -> solved LpProblem
results = {}
numInst = 0      # number of mentions that were actually solved
numContAvg = 0   # running total of constraints over all solved mentions

for line in input_ilp:
    # Each record is "<mention>=====<candidate list>".
    lines = line.split("=====")
    mention = lines[0]
    if len(lines[1]) < 5:
        # Too short to hold a candidate entry.
        continue
    # Drop the first character and the last two characters of the payload
    # (presumably list delimiters -- confirm against the producer), then
    # split into "<type>\t<score>" entries.
    candidates = lines[1][1:(len(lines[1]) - 2)].split(", ")
    candidateTypes = set()
    type2score = {}
    for entry in candidates:
        t2s = entry.split("\t")
        candidateTypes.add(t2s[0])
        type2score[t2s[0]] = float(t2s[1])

    # Limit is always on, capped at numType; alpha is fixed at 0.5.
    prob, numContr = optimization(mention, candidateTypes, type2score, disjointContr, hierarchyContr, pearsonValues, True, numType, 0.5)
    results[mention] = prob
    numInst += 1
    numContAvg += numContr

print('\t\tfishing ilp computation...')
sys.stdout.flush()
# Guard the average: numInst is 0 when no mention had usable candidates.
avgNumCont = numContAvg / numInst if numInst else 0.0
print('\t\tAvg. of number of constraints:\t' + str(avgNumCont))
print('ilp-results')
sys.stdout.flush()
# Emit one "<mention>=====[<name>\t<value>, ...]" line per solved mention.
for mention, solved in results.items():
    res = []
    for v in solved.variables():
        # A variable without a value (varValue is None) cannot be compared
        # with a number, so skip it. Names containing "___" are skipped too.
        if v.varValue is None:
            continue
        if v.varValue >= 0.5 and "___" not in v.name:
            res.append(v.name + "\t" + str(v.varValue))
    print(mention + "=====[" + ", ".join(res) + "]")
    # Flush after every line so each result is written out immediately.
    sys.stdout.flush()

print('end-ilp')
sys.stdout.flush()

# Time spent since the end of the loading phase.
end2 = time.time()
print("ILP solver...")
print(end2 - end)