Skip to content
Permalink
0d82ff1dc4
Switch branches/tags

Name already in use

A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
Go to file
 
 
Cannot retrieve contributors at this time
175 lines (145 sloc) 4.84 KB
package kb.howtokb.clustering.sim;
import java.io.IOException;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import edu.stanford.nlp.util.Pair;
import kb.howtokb.utils.AutoMap;
import kb.howtokb.utils.SQLiteJDBCConnector;
import kb.howtokb.wkhobject.Category_Json;
public class CategorySimilarity {
/**
* Compute similarity between two categories using
* sim (c1, c2) = (2*height(lca(c1,c2)) + 1)/(height(c1) + height(c2) + 1)
* with additive smoothing
*
*/
private static Map<Integer, List<Integer>> parentChains;
private static Map<Pair<Integer, Integer>, Double> preCate;
private static List<Integer> allCate;
private static double threshold = Coefficient.CATE_THRES;
public CategorySimilarity() throws SQLException, ClassNotFoundException, IOException{
loadParentChains();
loadAllCate();
preComputedCate();
}
//Compute similarity between two categories
public double simWUP(int c1, int c2) {
int c0 = firstCommAncestor(c1, c2);
double pc0 = path2root(c0).size() -1;
double pc1 = path2root(c1).size() -1;
double pc2 = path2root(c2).size() -1;
//System.out.println(pc0 + ", " + pc1 + ", " + pc2);
return (2.0 * pc0 + 1) / (pc1 + pc2 + 1);
}
//Compute similarity between two categories
public double sim(int c1, int c2) {
try{
Pair<Integer, Integer> pair = new Pair<Integer, Integer>(c1, c2);
Pair<Integer, Integer> ipair = new Pair<Integer, Integer>(c2, c1);
if (preCate.containsKey(pair))
return preCate.get(pair);
return preCate.get(ipair);
}catch(Exception e){
return simWUP(c1,c2);
}
}
/**
* Check if similarity between two categories is greater than a threshold
* @return
*/
public boolean isSim(int c1, int c2){
Pair<Integer, Integer> pair = new Pair<Integer, Integer>(c1, c2);
if (preCate.containsKey(pair))
return true;
return preCate.containsKey(new Pair<Integer, Integer>(c2,c1));
}
public Map<Pair<Integer, Integer>, Double> getPreCate() {
return preCate;
}
private void loadParentChains() throws SQLException, ClassNotFoundException, IOException {
parentChains = new AutoMap<>();
// "rootpath":[57,54,52,150,1]
ResultSet rs = SQLiteJDBCConnector.q("select id, json from categoryjson");
while (rs.next()) {
try {
parentChains.put(rs.getInt(1), Category_Json.fromJson(rs.getString(2)).getRootpath());
} catch (Exception e) {
System.out.print("\n---- JSONException in category: " + rs.getInt(1));
}
}
}
private void loadAllCate(){
allCate = new ArrayList<>();
for (Entry<Integer, List<Integer>> e: parentChains.entrySet()){
allCate.add(e.getKey());
}
System.out.println("Number of cates: " + allCate.size());
}
private void preComputedCate(){
System.out.println("Pre-Computing similarity between two categories.....");
preCate = new HashMap<>();
for (int i=0; i<allCate.size(); i++){
for (int j=i; j<allCate.size(); j++){
int c1 = allCate.get(i);
int c2 = allCate.get(j);
double sim = simWUP(c1, c2);
if (sim >= threshold){
preCate.put(new Pair<Integer, Integer>(c1, c2), sim);
}
}
}
System.out.println("Done! Number of pair: " + preCate.size());
}
//Get lowest common ancestor
private int firstCommAncestor(int c1, int c2) {
List<Integer> r1 = path2root(c1);
List<Integer> r2 = path2root(c2);
// first common ancestor
for (int i : r1)
if (r2.contains(i))
return i;
return 1; // the root.
}
@SuppressWarnings("unchecked")
private List<Integer> path2root(int c1) {
return parentChains.containsKey(c1) ? parentChains.get(c1) : Collections.EMPTY_LIST;
}
/*private static void testSimCat(CategorySimilarity s) {
String input = "";
try {
while (!input.equals("q")) {
input = Util.readStringFromUser("\nc1,c2:");
String[] pair = input.replace(" ", "").split(",");
if (pair == null || pair.length < 2) {
if (!input.equals("q"))
System.out.println("-- wrong input -- e.g. 12,28");
continue;
}
System.out.println(Util.format(s.sim(Integer.parseInt(pair[0]), Integer.parseInt(pair[1]))));
}
} catch (Exception e) {
e.printStackTrace();
} finally {
DBConnector.closeConnections();
}
}*/
/*public static void main(String[] args) throws Exception {
CategorySimilarity cs = new CategorySimilarity();
//testSimCat(cs);
Writer cateout = new BufferedWriter(new OutputStreamWriter(
new FileOutputStream("/var/tmp/cxchu/clustering-pre-computation/preCate.txt"), "utf-8"));
//Write preCate to file
Map<Pair<Integer, Integer>, Double> preCate = cs.getPreCate();
for (Entry<Pair<Integer,Integer>, Double> e: preCate.entrySet()){
cateout.write(e.getKey().first + ";" + e.getKey().second
+ "\t" + Util.format(e.getValue()) + "\n");
}
cateout.close();
}*/
}