Permalink
Cannot retrieve contributors at this time
Name already in use
A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
HowToKB/src/kb/howtokb/clustering/sim/SimilarityComputation.java
Go to file. This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
312 lines (239 sloc)
12.2 KB
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package kb.howtokb.clustering.sim; | |
import java.io.IOException; | |
import java.sql.SQLException; | |
import java.util.List; | |
import kb.howtokb.taskframe.WikiHowTaskFrame; | |
import kb.howtokb.tools.InformationExtraction; | |
import kb.howtokb.tools.NormalizationText; | |
import kb.howtokb.tools.StructureConverter; | |
public class SimilarityComputation { | |
//Similarity between two category | |
public static double simCategory(CategorySimilarity cs, int c1, int c2){ | |
//There are a few activity frame having new category, the cateID = -1 | |
//We transfer to GENERAL_CATEGORY | |
c1 = c1<1?1:c1; c2 = c2<1?1:c2; | |
return Double.parseDouble(NormalizationText.format(cs.sim(c1, c2))); | |
} | |
//similarity between two words: verb | |
public static double simVerb(String word1, String word2) throws NumberFormatException, IOException, SQLException{ | |
return Double.parseDouble(NormalizationText.format(StringSimilarity.simOfVerb(word1, word2))); | |
} | |
//similarity between two words: object | |
public static double simNoun(String word1, String word2) throws NumberFormatException, IOException, SQLException{ | |
return Double.parseDouble(NormalizationText.format(StringSimilarity.simOfNoun(word1, word2))); | |
} | |
//similarity between two activities using word2vec | |
public static double simActW2V(String word1, String word2) throws Exception{ | |
return Double.parseDouble(NormalizationText.format(StringSimilarity.simOfPairW2V(word1, word2))); | |
} | |
//similarity between two list of word (don't have to pre-process data): agents | |
//jaccard | |
public static double simList(List<String> l1, List<String> l2) throws NumberFormatException, IOException, SQLException{ | |
return Double.parseDouble(NormalizationText.format(StringSimilarity.simOfListWord(l1, l2))); | |
} | |
//similarity between two list of words: location, time, objects (have to pre-process data: pick head word) | |
//jaccard | |
public static double simLocationTimeAndObject(List<String> l1, List<String> l2) throws SQLException, IOException{ | |
l1 = NormalizationText.normList(l1); | |
l2 = NormalizationText.normList(l2); | |
return Double.parseDouble(NormalizationText.format(simList(l1, l2))); | |
} | |
//similarity between two list of activity surface | |
//jaccard | |
public static double simActSurfaceList(List<String> l1, List<String> l2) throws Exception{ | |
return Double.parseDouble(NormalizationText.format(StringSimilarity.simOfListActivity(l1, l2))); | |
} | |
//similarity between two phrases: ori_object, ori-verb | |
//Until now, using jaccard | |
public static double simPhrase(String s1, String s2) throws NumberFormatException, SQLException, IOException{ | |
return Double.parseDouble(NormalizationText.format(Double.parseDouble(NormalizationText.format(StringSimilarity.simOfPhrase(s1, s2))))); | |
} | |
//========================Old part, with jaccard have threshold and lookup db | |
/*//similarity between two list of word (don't have to pre-process data): agents | |
//jaccard | |
public static double simList(List<String> l1, List<String> l2, double threshold) throws NumberFormatException, IOException, SQLException{ | |
return Double.parseDouble(Util.format(StringSimilarity.simOfListNoun(l1, l2, threshold))); | |
} | |
//similarity between two list of words: location, time, objects (have to pre-process data: pick head word) | |
//jaccard | |
public static double simLocationTimeAndObject(List<String> l1, List<String> l2, double threshold) throws SQLException, IOException{ | |
l1 = NormalizationText.normList(l1); | |
l2 = NormalizationText.normList(l2); | |
return Double.parseDouble(Util.format(simList(l1, l2, threshold))); | |
} | |
//similarity between two phrases: ori_object | |
//Until now, using jaccard | |
public static double simNounPhrase(String s1, String s2, double threshold) throws NumberFormatException, SQLException, IOException{ | |
return Double.parseDouble(Util.format(Double.parseDouble(Util.format(StringSimilarity.simOfNounPhrase(s1, s2, threshold))))); | |
} | |
//similarity between two phrases: ori_verb | |
//Until now, using jaccard | |
public static double simVerbPhrase(String s1, String s2, double threshold) throws NumberFormatException, SQLException, IOException{ | |
return Double.parseDouble(Util.format(Double.parseDouble(Util.format(StringSimilarity.simOfVerbPhrase(s1, s2, threshold))))); | |
}*/ | |
//=========================================================== | |
//similarity vector between two activities | |
public static double[] getSimilarVector(CategorySimilarity cs, WikiHowTaskFrame f1, WikiHowTaskFrame f2) throws Exception{ | |
double[] res = new double[10]; | |
String v1 = f1.getActivity().getVerb(); | |
String v2 = f2.getActivity().getVerb(); | |
double v2v = simVerb(v1, v2); | |
res[0] = v2v; | |
String n1 = f1.getActivity().getObject(); | |
String n2 = f2.getActivity().getObject(); | |
double o2o = simNoun(n1, n2); | |
res[1] = o2o; | |
double c2c = simCategory(cs,Integer.parseInt(f1.getActivity().getCategoryID()), | |
Integer.parseInt(f2.getActivity().getCategoryID())); | |
res[2] = c2c; | |
double l2l = simLocationTimeAndObject(f1.getLocations(), f2.getLocations()); | |
res[3] = l2l; | |
double t2t = simLocationTimeAndObject(f1.getTemporal(), f2.getTemporal()); | |
res[4] = t2t; | |
double parto2parto = simLocationTimeAndObject(f1.getParticipatingObject(), f2.getParticipatingObject()); | |
res[5] = parto2parto; | |
double parta2parta = simList(f1.getParticipatingAgent(), f2.getParticipatingAgent()); | |
res[6] = parta2parta; | |
double ov2ov = simPhrase(f1.getActivity().getOriVerb(), f2.getActivity().getOriObject()); | |
res[7] = ov2ov; | |
double oo2oo = simPhrase(f1.getActivity().getOriObject(), f2.getActivity().getOriObject()); | |
res[8] = oo2oo; | |
//v1*v2*n1*n2 | |
String a1 = f1.getActivity().getVerb() + ";"+ f1.getActivity().getObject(); | |
String a2 = f2.getActivity().getVerb() + ";" + f2.getActivity().getObject(); | |
double a1a2 = simActW2V(a1, a2); | |
res[9] = a1a2; | |
return res; | |
} | |
//similarity vector between two activities | |
//Include context: parent, sub, prev/next | |
public static double[] getFullSimilarVector(CategorySimilarity cs, WikiHowTaskFrame f1, WikiHowTaskFrame f2) throws Exception{ | |
double[] res = new double[14]; | |
String v1 = f1.getActivity().getVerb(); | |
String v2 = f2.getActivity().getVerb(); | |
double v2v = simVerb(v1, v2); | |
res[0] = v2v; | |
String n1 = f1.getActivity().getObject(); | |
String n2 = f2.getActivity().getObject(); | |
double o2o = simNoun(n1, n2); | |
res[1] = o2o; | |
double c2c = simCategory(cs,Integer.parseInt(f1.getActivity().getCategoryID()), | |
Integer.parseInt(f2.getActivity().getCategoryID())); | |
res[2] = c2c; | |
double l2l = simLocationTimeAndObject(f1.getLocations(), f2.getLocations()); | |
res[3] = l2l; | |
double t2t = simLocationTimeAndObject(f1.getTemporal(), f2.getTemporal()); | |
res[4] = t2t; | |
double parto2parto = simLocationTimeAndObject(f1.getParticipatingObject(), f2.getParticipatingObject()); | |
res[5] = parto2parto; | |
double parta2parta = simList(f1.getParticipatingAgent(), f2.getParticipatingAgent()); | |
res[6] = parta2parta; | |
double ov2ov = simPhrase(f1.getActivity().getOriVerb(), f2.getActivity().getOriObject()); | |
res[7] = ov2ov; | |
double oo2oo = simPhrase(f1.getActivity().getOriObject(), f2.getActivity().getOriObject()); | |
res[8] = oo2oo; | |
//v1*v2*n1*n2 | |
String a1 = f1.getActivity().getVerb() + ";"+ f1.getActivity().getObject(); | |
String a2 = f2.getActivity().getVerb() + ";" + f2.getActivity().getObject(); | |
double a1a2 = simActW2V(a1, a2); | |
res[9] = a1a2; | |
//How about parent, prev/next, sub-act | |
List<String> parent1 = InformationExtraction.getListofActivitySurfaceFromDb(StructureConverter.stringToList(f1.getActivity().getParent())); | |
List<String> parent2 = InformationExtraction.getListofActivitySurfaceFromDb(StructureConverter.stringToList(f2.getActivity().getParent())); | |
double parent = simActSurfaceList(parent1, parent2); | |
res[10] = parent; | |
List<String> prev1 = InformationExtraction.getListofActivitySurfaceFromDb(StructureConverter.stringToList(f1.getActivity().getPrev())); | |
List<String> prev2 = InformationExtraction.getListofActivitySurfaceFromDb(StructureConverter.stringToList(f2.getActivity().getPrev())); | |
double prev = simActSurfaceList(prev1, prev2); | |
res[11] = prev; | |
List<String> next1 = InformationExtraction.getListofActivitySurfaceFromDb(StructureConverter.stringToList(f1.getActivity().getNext())); | |
List<String> next2 = InformationExtraction.getListofActivitySurfaceFromDb(StructureConverter.stringToList(f2.getActivity().getNext())); | |
double next = simActSurfaceList(next1, next2); | |
res[12] = next; | |
List<String> sub1 = InformationExtraction.getListofActivitySurfaceFromDb(f1.getActivity().getSubActivities()); | |
List<String> sub2 = InformationExtraction.getListofActivitySurfaceFromDb(f2.getActivity().getSubActivities()); | |
double sub = simActSurfaceList(sub1, sub2); | |
res[13] = sub; | |
return res; | |
} | |
//Get the final similarity value between two activity frames | |
public static double getSimilarity(CategorySimilarity cs, WikiHowTaskFrame f1, WikiHowTaskFrame f2) throws Exception{ | |
double [] vector = getSimilarVector(cs, f1, f2); | |
double res = 0; | |
for (int i=0; i<vector.length; i++){ | |
res += vector[i] * Coefficient.ALL_COEF[i]; | |
} | |
res += Coefficient.INTERCEPT; | |
res = 1/(1 + Math.exp(-res)); | |
return res; | |
} | |
//Get the final similarity value between two activity frames | |
public static double getFullSimilarity(CategorySimilarity cs, WikiHowTaskFrame f1, WikiHowTaskFrame f2) throws Exception{ | |
double [] vector = getFullSimilarVector(cs, f1, f2); | |
double res = 0; | |
for (int i=0; i<vector.length; i++){ | |
res += vector[i] * Coefficient.FULL_COEF[i]; | |
} | |
res += Coefficient.INTERCEPT_FULL; | |
res = 1/(1 + Math.exp(-res)); | |
return res; | |
} | |
//========================Old part, with jaccard have threshold and lookup db | |
/*//similarity vector between two activities | |
public static double[] getSimilarVector(CategorySimilarity cs, ActivityFrame f1, ActivityFrame f2, double threshold) throws Exception{ | |
double[] res = new double[10]; | |
String v1 = f1.getActivity().getVerb(); | |
v1 = v1.contains(" ")?v1.split(" ")[0]:v1; | |
String v2 = f2.getActivity().getVerb(); | |
v2 = v2.contains(" ")?v2.split(" ")[0]:v2; | |
double v2v = simVerb(v1, v2); | |
res[0] = v2v; | |
String n1 = f1.getActivity().getObject(); | |
n1 = n1.contains(" ")?n1.split(" ")[n1.split(" ").length - 1]:n1; | |
String n2 = f2.getActivity().getObject(); | |
n2 = n2.contains(" ")?n2.split(" ")[n2.split(" ").length - 1]:n2; | |
double o2o = simNoun(n1, n2); | |
res[1] = o2o; | |
double c2c = simCategory(cs,Integer.parseInt(f1.getActivity().getCategoryID()), | |
Integer.parseInt(f2.getActivity().getCategoryID())); | |
res[2] = c2c; | |
double l2l = simLocationTimeAndObject(f1.getLocations(), f2.getLocations(), threshold); | |
res[3] = l2l; | |
double t2t = simLocationTimeAndObject(f1.getTemporal(), f2.getTemporal(), threshold); | |
res[4] = t2t; | |
double parto2parto = simLocationTimeAndObject(f1.getParticipatingObject(), f2.getParticipatingObject(), threshold); | |
res[5] = parto2parto; | |
double parta2parta = simList(f1.getParticipatingAgent(), f2.getParticipatingAgent(), threshold); | |
res[6] = parta2parta; | |
double ov2ov = simVerbPhrase(f1.getActivity().getOriVerb(), f2.getActivity().getOriObject(), threshold); | |
res[7] = ov2ov; | |
double oo2oo = simNounPhrase(f1.getActivity().getOriObject(), f2.getActivity().getOriObject(), threshold); | |
res[8] = oo2oo; | |
//v1*v2*n1*n2 | |
String a1 = f1.getActivity().getVerb() + ";"+ f1.getActivity().getObject(); | |
String a2 = f2.getActivity().getVerb() + ";" + f2.getActivity().getObject(); | |
double a1a2 = simActW2V(a1, a2); | |
res[9] = a1a2; | |
return res; | |
}*/ | |
/*public static void main(String[] args) throws Exception { | |
CategorySimilarity cs = new CategorySimilarity(); | |
//test category | |
System.out.println("Similarity of categories: " + simCategory(cs, 150, 150)); | |
//test word | |
String word1 = "set"; | |
String word2 = "dinner"; | |
System.out.println("Similarity of '" + word1 + "' and '" | |
+ word2 + "': " + simWord(word1, word2)); | |
//test list | |
List<String> l1 = new ArrayList<>(); | |
l1.add("book"); l1.add("dictionary"); l1.add("laptop"); | |
List<String> l2 = new ArrayList<>(); | |
l2.add("book"); l2.add("bottle"); l2.add("computer"); l2.add("diary"); | |
System.out.println("Similarity of two lists of string: " + simList(l1, l2, 0.7)); | |
//Test sim of two string | |
String s1 = "play football with friends"; | |
String s2 = "paint room with friends"; | |
System.out.println(simPhrase(s1, s2, 0.7)); | |
}*/ | |
} |