// NOTE: this file was captured from a GitHub permalink view (ref 0d82ff1dc4); the web page chrome that preceded the code has been removed.
package kb.howtokb.clustering.sim;
import java.io.IOException;
import java.sql.SQLException;
import java.util.List;
import kb.howtokb.taskframe.WikiHowTaskFrame;
import kb.howtokb.tools.InformationExtraction;
import kb.howtokb.tools.NormalizationText;
import kb.howtokb.tools.StructureConverter;
/**
 * Similarity features between WikiHow task frames.
 *
 * <p>The {@code sim*} methods delegate to {@code StringSimilarity} or {@code CategorySimilarity}
 * and pass the raw score through {@code NormalizationText.format} before returning it.
 * {@link #getSimilarVector} and {@link #getFullSimilarVector} assemble those scores into feature
 * vectors; {@link #getSimilarity} and {@link #getFullSimilarity} combine a vector with the weights
 * in {@code Coefficient} and apply the logistic function to obtain a single score.
 */
public class SimilarityComputation {

    /**
     * Category id used in place of any id below 1. A few activity frames carry a new category
     * with id -1; those are mapped to the general category.
     */
    private static final int GENERAL_CATEGORY_ID = 1;

    /** Length of the vector returned by {@link #getSimilarVector}. */
    private static final int BASE_FEATURE_COUNT = 10;

    /** Length of the vector returned by {@link #getFullSimilarVector} (base + 4 context features). */
    private static final int FULL_FEATURE_COUNT = 14;

    /**
     * Passes a raw score through {@code NormalizationText.format} and parses the text back.
     * NOTE(review): presumably this rounds to a fixed number of decimals - confirm in
     * NormalizationText.
     *
     * @param rawScore the score as produced by the underlying similarity measure
     * @return the score after the format/parse round trip
     * @throws NumberFormatException if the formatted text is not a parsable double
     */
    private static double formatScore(double rawScore) {
        return Double.parseDouble(NormalizationText.format(rawScore));
    }

    /**
     * Logistic function {@code 1 / (1 + e^-z)}.
     *
     * @param z the linear score (weighted feature sum plus intercept)
     * @return the logistic transform of {@code z}
     */
    private static double sigmoid(double z) {
        return 1 / (1 + Math.exp(-z));
    }

    /**
     * Similarity between two categories.
     *
     * @param cs the category similarity lookup
     * @param c1 first category id; values below 1 are replaced by the general category
     * @param c2 second category id; values below 1 are replaced by the general category
     * @return the formatted result of {@code cs.sim(c1, c2)}
     */
    public static double simCategory(CategorySimilarity cs, int c1, int c2) {
        int first = Math.max(c1, GENERAL_CATEGORY_ID);
        int second = Math.max(c2, GENERAL_CATEGORY_ID);
        return formatScore(cs.sim(first, second));
    }

    /**
     * Similarity between two verbs.
     *
     * @param word1 first verb
     * @param word2 second verb
     * @return the formatted result of {@code StringSimilarity.simOfVerb}
     */
    public static double simVerb(String word1, String word2) throws NumberFormatException, IOException, SQLException {
        return formatScore(StringSimilarity.simOfVerb(word1, word2));
    }

    /**
     * Similarity between two nouns (activity objects).
     *
     * @param word1 first noun
     * @param word2 second noun
     * @return the formatted result of {@code StringSimilarity.simOfNoun}
     */
    public static double simNoun(String word1, String word2) throws NumberFormatException, IOException, SQLException {
        return formatScore(StringSimilarity.simOfNoun(word1, word2));
    }

    /**
     * Word2vec-based similarity between two activities.
     *
     * @param word1 first activity, encoded by the callers in this class as {@code "verb;object"}
     * @param word2 second activity, same encoding
     * @return the formatted result of {@code StringSimilarity.simOfPairW2V}
     */
    public static double simActW2V(String word1, String word2) throws Exception {
        return formatScore(StringSimilarity.simOfPairW2V(word1, word2));
    }

    /**
     * Similarity between two word lists that need no pre-processing (used here for agents).
     * The original author's notes describe the underlying measure as Jaccard.
     *
     * @param l1 first list of words
     * @param l2 second list of words
     * @return the formatted result of {@code StringSimilarity.simOfListWord}
     */
    public static double simList(List<String> l1, List<String> l2) throws NumberFormatException, IOException, SQLException {
        return formatScore(StringSimilarity.simOfListWord(l1, l2));
    }

    /**
     * Similarity between two lists of locations, times or objects. Both lists are first passed
     * through {@code NormalizationText.normList} (head-word selection, per the original notes)
     * and then compared with {@link #simList}.
     *
     * @param l1 first list of phrases
     * @param l2 second list of phrases
     * @return the formatted similarity of the normalized lists
     */
    public static double simLocationTimeAndObject(List<String> l1, List<String> l2) throws SQLException, IOException {
        List<String> normalized1 = NormalizationText.normList(l1);
        List<String> normalized2 = NormalizationText.normList(l2);
        // simList already formats its result; the extra pass is kept from the original.
        // NOTE(review): presumably a no-op - confirm before removing.
        return formatScore(simList(normalized1, normalized2));
    }

    /**
     * Similarity between two lists of activity surface forms.
     * The original author's notes describe the underlying measure as Jaccard.
     *
     * @param l1 first list of activity surfaces
     * @param l2 second list of activity surfaces
     * @return the formatted result of {@code StringSimilarity.simOfListActivity}
     */
    public static double simActSurfaceList(List<String> l1, List<String> l2) throws Exception {
        return formatScore(StringSimilarity.simOfListActivity(l1, l2));
    }

    /**
     * Similarity between two phrases (used here for original verb and original object text).
     * The original author's notes describe the underlying measure as Jaccard.
     *
     * @param s1 first phrase
     * @param s2 second phrase
     * @return the formatted result of {@code StringSimilarity.simOfPhrase}
     */
    public static double simPhrase(String s1, String s2) throws NumberFormatException, SQLException, IOException {
        // Formatting is applied twice, exactly as in the original implementation.
        // NOTE(review): the second pass is presumably a no-op - confirm before removing.
        return formatScore(formatScore(StringSimilarity.simOfPhrase(s1, s2)));
    }

    /**
     * Writes the ten base features into {@code features[0..9]}:
     * verb, object, category, locations, temporal, participating objects, participating agents,
     * original verb phrase, original object phrase, and word2vec activity similarity.
     *
     * @param features target array with at least {@link #BASE_FEATURE_COUNT} slots
     * @param cs the category similarity lookup
     * @param f1 first task frame
     * @param f2 second task frame
     * @throws Exception if any underlying similarity computation fails, or if a category id is
     *         not a parsable integer
     */
    private static void fillBaseFeatures(double[] features, CategorySimilarity cs,
            WikiHowTaskFrame f1, WikiHowTaskFrame f2) throws Exception {
        String verb1 = f1.getActivity().getVerb();
        String verb2 = f2.getActivity().getVerb();
        features[0] = simVerb(verb1, verb2);

        String object1 = f1.getActivity().getObject();
        String object2 = f2.getActivity().getObject();
        features[1] = simNoun(object1, object2);

        features[2] = simCategory(cs,
                Integer.parseInt(f1.getActivity().getCategoryID()),
                Integer.parseInt(f2.getActivity().getCategoryID()));

        features[3] = simLocationTimeAndObject(f1.getLocations(), f2.getLocations());
        features[4] = simLocationTimeAndObject(f1.getTemporal(), f2.getTemporal());
        features[5] = simLocationTimeAndObject(f1.getParticipatingObject(), f2.getParticipatingObject());
        features[6] = simList(f1.getParticipatingAgent(), f2.getParticipatingAgent());

        // Original verb vs. original verb. This previously compared f1's original verb with
        // f2's original OBJECT, which looks like a copy-paste slip.
        // NOTE(review): the weights in Coefficient may have been fitted with the old feature -
        // confirm, and re-fit if necessary.
        features[7] = simPhrase(f1.getActivity().getOriVerb(), f2.getActivity().getOriVerb());
        features[8] = simPhrase(f1.getActivity().getOriObject(), f2.getActivity().getOriObject());

        // Each activity is encoded as "verb;object" for the word2vec comparison.
        features[9] = simActW2V(verb1 + ";" + object1, verb2 + ";" + object2);
    }

    /**
     * Builds the 10-element similarity vector between two task frames.
     *
     * @param cs the category similarity lookup
     * @param f1 first task frame
     * @param f2 second task frame
     * @return a new array of length 10 holding the base features
     * @throws Exception if any underlying similarity computation fails
     */
    public static double[] getSimilarVector(CategorySimilarity cs, WikiHowTaskFrame f1, WikiHowTaskFrame f2) throws Exception {
        double[] features = new double[BASE_FEATURE_COUNT];
        fillBaseFeatures(features, cs, f1, f2);
        return features;
    }

    /**
     * Builds the 14-element similarity vector between two task frames: the ten base features
     * followed by the similarity of their parent, previous, next and sub-activities.
     *
     * @param cs the category similarity lookup
     * @param f1 first task frame
     * @param f2 second task frame
     * @return a new array of length 14; indices 0-9 are the base features, 10 = parent,
     *         11 = previous, 12 = next, 13 = sub-activities
     * @throws Exception if any underlying similarity computation or lookup fails
     */
    public static double[] getFullSimilarVector(CategorySimilarity cs, WikiHowTaskFrame f1, WikiHowTaskFrame f2) throws Exception {
        double[] features = new double[FULL_FEATURE_COUNT];
        fillBaseFeatures(features, cs, f1, f2);

        // Context features: the referenced activities are resolved to their surface forms
        // through InformationExtraction before being compared.
        List<String> parent1 = InformationExtraction.getListofActivitySurfaceFromDb(
                StructureConverter.stringToList(f1.getActivity().getParent()));
        List<String> parent2 = InformationExtraction.getListofActivitySurfaceFromDb(
                StructureConverter.stringToList(f2.getActivity().getParent()));
        features[10] = simActSurfaceList(parent1, parent2);

        List<String> prev1 = InformationExtraction.getListofActivitySurfaceFromDb(
                StructureConverter.stringToList(f1.getActivity().getPrev()));
        List<String> prev2 = InformationExtraction.getListofActivitySurfaceFromDb(
                StructureConverter.stringToList(f2.getActivity().getPrev()));
        features[11] = simActSurfaceList(prev1, prev2);

        List<String> next1 = InformationExtraction.getListofActivitySurfaceFromDb(
                StructureConverter.stringToList(f1.getActivity().getNext()));
        List<String> next2 = InformationExtraction.getListofActivitySurfaceFromDb(
                StructureConverter.stringToList(f2.getActivity().getNext()));
        features[12] = simActSurfaceList(next1, next2);

        // Sub-activities are passed to the lookup directly, without stringToList.
        List<String> sub1 = InformationExtraction.getListofActivitySurfaceFromDb(
                f1.getActivity().getSubActivities());
        List<String> sub2 = InformationExtraction.getListofActivitySurfaceFromDb(
                f2.getActivity().getSubActivities());
        features[13] = simActSurfaceList(sub1, sub2);

        return features;
    }

    /**
     * Final similarity between two task frames from the base feature vector: the features are
     * weighted with {@code Coefficient.ALL_COEF}, {@code Coefficient.INTERCEPT} is added, and
     * the logistic function is applied.
     *
     * @param cs the category similarity lookup
     * @param f1 first task frame
     * @param f2 second task frame
     * @return the logistic score of the weighted base features
     * @throws Exception if the feature vector cannot be computed
     */
    public static double getSimilarity(CategorySimilarity cs, WikiHowTaskFrame f1, WikiHowTaskFrame f2) throws Exception {
        double[] features = getSimilarVector(cs, f1, f2);
        double linear = 0;
        for (int i = 0; i < features.length; i++) {
            linear += features[i] * Coefficient.ALL_COEF[i];
        }
        linear += Coefficient.INTERCEPT;
        return sigmoid(linear);
    }

    /**
     * Final similarity between two task frames from the full (context-aware) feature vector:
     * the features are weighted with {@code Coefficient.FULL_COEF},
     * {@code Coefficient.INTERCEPT_FULL} is added, and the logistic function is applied.
     *
     * @param cs the category similarity lookup
     * @param f1 first task frame
     * @param f2 second task frame
     * @return the logistic score of the weighted full features
     * @throws Exception if the feature vector cannot be computed
     */
    public static double getFullSimilarity(CategorySimilarity cs, WikiHowTaskFrame f1, WikiHowTaskFrame f2) throws Exception {
        double[] features = getFullSimilarVector(cs, f1, f2);
        double linear = 0;
        for (int i = 0; i < features.length; i++) {
            linear += features[i] * Coefficient.FULL_COEF[i];
        }
        linear += Coefficient.INTERCEPT_FULL;
        return sigmoid(linear);
    }
}