Permalink
Cannot retrieve contributors at this time
Name already in use
A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
HowToKB/src/kb/howtokb/clustering/sim/CategorySimilarity.java
Go to fileThis commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
175 lines (145 sloc)
4.84 KB
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package kb.howtokb.clustering.sim; | |
import java.io.IOException; | |
import java.sql.ResultSet; | |
import java.sql.SQLException; | |
import java.util.ArrayList; | |
import java.util.Collections; | |
import java.util.HashMap; | |
import java.util.List; | |
import java.util.Map; | |
import java.util.Map.Entry; | |
import edu.stanford.nlp.util.Pair; | |
import kb.howtokb.utils.AutoMap; | |
import kb.howtokb.utils.SQLiteJDBCConnector; | |
import kb.howtokb.wkhobject.Category_Json; | |
public class CategorySimilarity { | |
/** | |
* Compute similarity between two categories using | |
* sim (c1, c2) = (2*height(lca(c1,c2)) + 1)/(height(c1) + height(c2) + 1) | |
* with additive smoothing | |
* | |
*/ | |
private static Map<Integer, List<Integer>> parentChains; | |
private static Map<Pair<Integer, Integer>, Double> preCate; | |
private static List<Integer> allCate; | |
private static double threshold = Coefficient.CATE_THRES; | |
public CategorySimilarity() throws SQLException, ClassNotFoundException, IOException{ | |
loadParentChains(); | |
loadAllCate(); | |
preComputedCate(); | |
} | |
//Compute similarity between two categories | |
public double simWUP(int c1, int c2) { | |
int c0 = firstCommAncestor(c1, c2); | |
double pc0 = path2root(c0).size() -1; | |
double pc1 = path2root(c1).size() -1; | |
double pc2 = path2root(c2).size() -1; | |
//System.out.println(pc0 + ", " + pc1 + ", " + pc2); | |
return (2.0 * pc0 + 1) / (pc1 + pc2 + 1); | |
} | |
//Compute similarity between two categories | |
public double sim(int c1, int c2) { | |
try{ | |
Pair<Integer, Integer> pair = new Pair<Integer, Integer>(c1, c2); | |
Pair<Integer, Integer> ipair = new Pair<Integer, Integer>(c2, c1); | |
if (preCate.containsKey(pair)) | |
return preCate.get(pair); | |
return preCate.get(ipair); | |
}catch(Exception e){ | |
return simWUP(c1,c2); | |
} | |
} | |
/** | |
* Check if similarity between two categories is greater than a threshold | |
* @return | |
*/ | |
public boolean isSim(int c1, int c2){ | |
Pair<Integer, Integer> pair = new Pair<Integer, Integer>(c1, c2); | |
if (preCate.containsKey(pair)) | |
return true; | |
return preCate.containsKey(new Pair<Integer, Integer>(c2,c1)); | |
} | |
public Map<Pair<Integer, Integer>, Double> getPreCate() { | |
return preCate; | |
} | |
private void loadParentChains() throws SQLException, ClassNotFoundException, IOException { | |
parentChains = new AutoMap<>(); | |
// "rootpath":[57,54,52,150,1] | |
ResultSet rs = SQLiteJDBCConnector.q("select id, json from categoryjson"); | |
while (rs.next()) { | |
try { | |
parentChains.put(rs.getInt(1), Category_Json.fromJson(rs.getString(2)).getRootpath()); | |
} catch (Exception e) { | |
System.out.print("\n---- JSONException in category: " + rs.getInt(1)); | |
} | |
} | |
} | |
private void loadAllCate(){ | |
allCate = new ArrayList<>(); | |
for (Entry<Integer, List<Integer>> e: parentChains.entrySet()){ | |
allCate.add(e.getKey()); | |
} | |
System.out.println("Number of cates: " + allCate.size()); | |
} | |
private void preComputedCate(){ | |
System.out.println("Pre-Computing similarity between two categories....."); | |
preCate = new HashMap<>(); | |
for (int i=0; i<allCate.size(); i++){ | |
for (int j=i; j<allCate.size(); j++){ | |
int c1 = allCate.get(i); | |
int c2 = allCate.get(j); | |
double sim = simWUP(c1, c2); | |
if (sim >= threshold){ | |
preCate.put(new Pair<Integer, Integer>(c1, c2), sim); | |
} | |
} | |
} | |
System.out.println("Done! Number of pair: " + preCate.size()); | |
} | |
//Get lowest common ancestor | |
private int firstCommAncestor(int c1, int c2) { | |
List<Integer> r1 = path2root(c1); | |
List<Integer> r2 = path2root(c2); | |
// first common ancestor | |
for (int i : r1) | |
if (r2.contains(i)) | |
return i; | |
return 1; // the root. | |
} | |
@SuppressWarnings("unchecked") | |
private List<Integer> path2root(int c1) { | |
return parentChains.containsKey(c1) ? parentChains.get(c1) : Collections.EMPTY_LIST; | |
} | |
/*private static void testSimCat(CategorySimilarity s) { | |
String input = ""; | |
try { | |
while (!input.equals("q")) { | |
input = Util.readStringFromUser("\nc1,c2:"); | |
String[] pair = input.replace(" ", "").split(","); | |
if (pair == null || pair.length < 2) { | |
if (!input.equals("q")) | |
System.out.println("-- wrong input -- e.g. 12,28"); | |
continue; | |
} | |
System.out.println(Util.format(s.sim(Integer.parseInt(pair[0]), Integer.parseInt(pair[1])))); | |
} | |
} catch (Exception e) { | |
e.printStackTrace(); | |
} finally { | |
DBConnector.closeConnections(); | |
} | |
}*/ | |
/*public static void main(String[] args) throws Exception { | |
CategorySimilarity cs = new CategorySimilarity(); | |
//testSimCat(cs); | |
Writer cateout = new BufferedWriter(new OutputStreamWriter( | |
new FileOutputStream("/var/tmp/cxchu/clustering-pre-computation/preCate.txt"), "utf-8")); | |
//Write preCate to file | |
Map<Pair<Integer, Integer>, Double> preCate = cs.getPreCate(); | |
for (Entry<Pair<Integer,Integer>, Double> e: preCate.entrySet()){ | |
cateout.write(e.getKey().first + ";" + e.getKey().second | |
+ "\t" + Util.format(e.getValue()) + "\n"); | |
} | |
cateout.close(); | |
}*/ | |
} |