Skip to content
Permalink
0d82ff1dc4
Switch branches/tags

Name already in use

A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
Go to file
 
 
Cannot retrieve contributors at this time
141 lines (127 sloc) 5.37 KB
package kb.howtokb.clustering;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.sql.SQLException;
import java.util.Map;
import java.util.Set;
import edu.stanford.nlp.util.Pair;
import kb.howtokb.clustering.basicobj.ActivityWordsCategory;
import kb.howtokb.clustering.sim.CategorySimilarity;
import kb.howtokb.clustering.sim.Coefficient;
import kb.howtokb.clustering.sim.w2v.Word2VecSimilarity;
import kb.howtokb.utils.BijectiveMap;
import kb.howtokb.utils.FileLines;
import kb.howtokb.utils.IDMap;
/**
 * Coarse "pruning" similarity between two {@link ActivityWordsCategory} objects.
 *
 * <p>Two activities are reported as dissimilar only when both their categories and
 * their strong activities are judged dissimilar; in every other case the pair is
 * kept as "possibly similar" (see
 * {@link #simThreshold(ActivityWordsCategory, ActivityWordsCategory, double)}).
 *
 * <p>Construction loads a word2vec model and registers every known activity, so the
 * list of activities must be supplied up front, either as a file path or as a set.
 */
public class SimplePruningSimilarity implements ISimilarity<ActivityWordsCategory>{

    /**
     * Category-level similarity. Kept per instance: it used to be a static field that
     * every constructor call overwrote, which silently replaced the object used by all
     * previously created instances.
     */
    private final CategorySimilarity cate;

    /** Threshold supplied at construction time; stored but not read anywhere in this class. */
    private final double threshold;

    /** Word2vec model location exactly as passed to the constructor. */
    private final String model;

    /** Activity-list file path; {@code null} when the instance was built from a set. */
    private final String allAct;

    /** In-memory activity list; {@code null} when the instance was built from a file. */
    private final Set<String> allActList;

    /** Activity-level similarity built from the verb and noun component similarities. */
    private final ActivityCachedSim aSim;

    /**
     * Builds the similarity from a file of activities.
     *
     * @param threshold stored on the instance; currently unused by this class
     * @param model     word2vec model location handed to {@link Word2VecSimilarity}
     * @param allAct    path of a file with one {@code verb phrase;noun phrase} entry per line
     * @throws SQLException           propagated from the underlying similarity components
     * @throws IOException            propagated from model or activity-file loading
     * @throws ClassNotFoundException propagated from the underlying similarity components
     * @throws IllegalArgumentException if an activity entry is malformed
     */
    public SimplePruningSimilarity(double threshold, String model, String allAct)
            throws SQLException, IOException, ClassNotFoundException {
        this.cate = new CategorySimilarity();
        this.threshold = threshold;
        this.model = model;
        this.allAct = allAct;
        this.allActList = null;

        ISimilarity<String> word2vecSim = new Word2VecSimilarity(model, 0, false);
        IDMap<String, Integer> vIDs = new IDMap<>(0);
        IDMap<String, Integer> nIDs = new IDMap<>(0);
        BijectiveMap<String, Pair<Integer, Integer>> aIDs = new BijectiveMap<>();
        loadIDs(allAct, vIDs, nIDs, aIDs);
        this.aSim = buildActivitySim(word2vecSim, vIDs, nIDs, aIDs);
    }

    /**
     * Builds the similarity from an in-memory set of activities.
     *
     * @param threshold stored on the instance; currently unused by this class
     * @param model     word2vec model location handed to {@link Word2VecSimilarity}
     * @param allAct    activities, each formatted as {@code verb phrase;noun phrase}
     * @throws SQLException           propagated from the underlying similarity components
     * @throws IOException            propagated from model loading
     * @throws ClassNotFoundException propagated from the underlying similarity components
     * @throws IllegalArgumentException if an activity entry is malformed
     */
    public SimplePruningSimilarity(double threshold, String model, Set<String> allAct)
            throws SQLException, IOException, ClassNotFoundException {
        this.cate = new CategorySimilarity();
        this.threshold = threshold;
        this.model = model;
        this.allAct = null;
        this.allActList = allAct;

        ISimilarity<String> word2vecSim = new Word2VecSimilarity(model, 0, false);
        IDMap<String, Integer> vIDs = new IDMap<>(0);
        IDMap<String, Integer> nIDs = new IDMap<>(0);
        BijectiveMap<String, Pair<Integer, Integer>> aIDs = new BijectiveMap<>();
        loadIDs(allAct, vIDs, nIDs, aIDs);
        this.aSim = buildActivitySim(word2vecSim, vIDs, nIDs, aIDs);
    }

    /**
     * Not implemented: no numeric similarity is defined for this pruning strategy.
     *
     * @return always {@code 0}
     */
    @Override
    public double sim(ActivityWordsCategory e1, ActivityWordsCategory e2) {
        // TODO we don't have function to compute in this case
        return 0;
    }

    /**
     * Not implemented: neighbour lookup is not needed for now.
     *
     * @return always {@code null}; callers must null-check the result
     */
    @Override
    public Map<ActivityWordsCategory, Double> getNeighbors(ActivityWordsCategory e) {
        // TODO We don't need this function right now
        return null;
    }

    /**
     * Cheap pruning test for two activities.
     *
     * <p>The pair is declared dissimilar only if the categories are not similar AND the
     * strong activities are not similar. Otherwise the answer is "not sure", which is
     * reported as {@code true} so that the pair is not pruned.
     *
     * @param e1           first activity
     * @param e2           second activity
     * @param minthreshold not consulted by this implementation (the component
     *                     similarities apply their own thresholds)
     * @return {@code false} when the pair is certainly dissimilar, {@code true} otherwise
     * @throws Exception propagated from the underlying similarity lookups
     */
    @Override
    public boolean simThreshold(ActivityWordsCategory e1, ActivityWordsCategory e2, double minthreshold) throws Exception {
        // Short-circuit keeps the original evaluation order: the activity similarity
        // is only consulted when the categories are not similar.
        return cate.isSim(e1.getCatID(), e2.getCatID())
                || aSim.sim(e1.getActivityStrong(), e2.getActivityStrong());
    }

    /**
     * Wires the verb and noun component similarities into the activity similarity.
     *
     * <p>Verbs and nouns use separate thresholds ({@code Coefficient.V_THRES} and
     * {@code Coefficient.O_THRES}; the original note quotes v: 0.747, n: 0.67). A single
     * shared threshold is the noted alternative (bottleneck vvnn: 0.5).
     */
    private static ActivityCachedSim buildActivitySim(ISimilarity<String> wordSim,
            IDMap<String, Integer> vIDs, IDMap<String, Integer> nIDs,
            BijectiveMap<String, Pair<Integer, Integer>> aIDs)
            throws SQLException, IOException, ClassNotFoundException {
        ActivityComponentSim vSim = new ActivityComponentSim(Coefficient.V_THRES, vIDs, wordSim);
        ActivityComponentSim nSim = new ActivityComponentSim(Coefficient.O_THRES, nIDs, wordSim);
        return new ActivityCachedSim(aIDs, vSim, nSim);
    }

    /**
     * Loads verb, noun and activity ids from a file that lists one activity per line.
     *
     * @param activityList path of the activity file
     * @param vIDs         receives the verbs (prefixed with {@code v_})
     * @param nIDs         receives the nouns (prefixed with {@code n_})
     * @param aIDs         receives each activity mapped to its (verb id, noun id) pair
     * @throws FileNotFoundException if the activity file cannot be opened
     * @throws IllegalArgumentException if a line is not of the form {@code verb;noun}
     */
    private static void loadIDs(String activityList, IDMap<String, Integer> vIDs, IDMap<String, Integer> nIDs,
            BijectiveMap<String, Pair<Integer, Integer>> aIDs) throws FileNotFoundException {
        for (String a : new FileLines(activityList)) {
            registerActivity(a, vIDs, nIDs, aIDs);
        }
        reportSizes(vIDs, nIDs);
    }

    /**
     * Loads verb, noun and activity ids from an in-memory set of activities.
     *
     * @param activityList activities to register
     * @param vIDs         receives the verbs (prefixed with {@code v_})
     * @param nIDs         receives the nouns (prefixed with {@code n_})
     * @param aIDs         receives each activity mapped to its (verb id, noun id) pair
     * @throws IllegalArgumentException if an entry is not of the form {@code verb;noun}
     */
    private static void loadIDs(Set<String> activityList, IDMap<String, Integer> vIDs, IDMap<String, Integer> nIDs,
            BijectiveMap<String, Pair<Integer, Integer>> aIDs) {
        for (String a : activityList) {
            registerActivity(a, vIDs, nIDs, aIDs);
        }
        reportSizes(vIDs, nIDs);
    }

    /**
     * Parses one {@code verb phrase;noun phrase} entry and records it in the id maps.
     *
     * <p>Only the first word of the verb phrase and the last word of the noun phrase
     * are used as the verb and the noun.
     */
    private static void registerActivity(String activity, IDMap<String, Integer> vIDs,
            IDMap<String, Integer> nIDs, BijectiveMap<String, Pair<Integer, Integer>> aIDs) {
        String[] vn = activity.split(";");
        if (vn.length < 2) {
            // Previously surfaced as a bare ArrayIndexOutOfBoundsException with no context.
            throw new IllegalArgumentException(
                    "Malformed activity, expected \"verb;noun\" but got: \"" + activity + "\"");
        }
        String[] verbWords = splitWords(vn[0], activity);
        String[] nounWords = splitWords(vn[1], activity);
        String verb = "v_" + verbWords[0];
        String noun = "n_" + nounWords[nounWords.length - 1];

        // NOTE(review): the id is read BEFORE add(); if IDMap.add() does not allocate a
        // fresh id for a key it already holds, the id recorded here is wrong for repeated
        // verbs/nouns - confirm against IDMap.
        int vid = vIDs.getAvailableGlobalID();
        vIDs.add(verb);
        int nid = nIDs.getAvailableGlobalID();
        nIDs.add(noun);
        aIDs.put(activity, new Pair<Integer, Integer>(vid, nid));
    }

    /**
     * Splits a phrase on single spaces, rejecting phrases that contain no word at all
     * (String.split drops trailing empty strings, so an all-space phrase yields an
     * empty array).
     */
    private static String[] splitWords(String phrase, String activity) {
        String[] words = phrase.split(" ");
        if (words.length == 0) {
            throw new IllegalArgumentException(
                    "Malformed activity, blank verb or noun in: \"" + activity + "\"");
        }
        return words;
    }

    /** Prints how many verbs and nouns have been registered. */
    private static void reportSizes(IDMap<String, Integer> vIDs, IDMap<String, Integer> nIDs) {
        System.out.println("Load map verb to id done! Size: " + vIDs.size());
        System.out.println("Load map noun to id done! Size: " + nIDs.size());
    }
}