Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Browse files
Browse the repository at this point in the history
Update KB Organization: Clustering
- Loading branch information
Showing
41 changed files
with
3,822 additions
and
127 deletions.
There are no files selected for viewing
Binary file not shown.
Binary file not shown.
37 changes: 37 additions & 0 deletions
37
src/main/java/kb/howtokb/clustering/ActivityCachedSim.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,37 @@ | ||
package kb.howtokb.clustering; | ||
|
||
import edu.stanford.nlp.util.Pair; | ||
import kb.howtokb.utils.BijectiveMap; | ||
|
||
public class ActivityCachedSim { | ||
|
||
private BijectiveMap<String, Pair<Integer, Integer>> ids; | ||
private ActivityComponentSim vSim, nSim; | ||
|
||
public ActivityCachedSim(BijectiveMap<String, Pair<Integer, Integer>> ids, ActivityComponentSim vSim, | ||
ActivityComponentSim nSim) { | ||
this.ids = ids; | ||
this.vSim = vSim; | ||
this.nSim = nSim; | ||
} | ||
|
||
private boolean sim(Pair<Integer, Integer> e1, Pair<Integer, Integer> e2) { | ||
// return | ||
// // v1 v2 are similar | ||
// vSim.simFromCache(e1.first, e2.first) && | ||
// // n1 v2 are similar | ||
// nSim.simFromCache(e1.second, e2.second); | ||
|
||
if (!vSim.simFromCache(e1.first, e2.first) && | ||
// n1 v2 are similar | ||
!nSim.simFromCache(e1.second, e2.second)) | ||
return false; | ||
return true; | ||
} | ||
|
||
public boolean sim(String a1, String a2) { | ||
Pair<Integer, Integer> e1 = this.ids.getValueFromKey(a1); | ||
Pair<Integer, Integer> e2 = this.ids.getValueFromKey(a2); | ||
return sim(e1, e2); | ||
} | ||
} |
86 changes: 86 additions & 0 deletions
86
src/main/java/kb/howtokb/clustering/ActivityComponentSim.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,86 @@ | ||
package kb.howtokb.clustering; | ||
|
||
import java.util.Map; | ||
|
||
import kb.howtokb.clustering.sim.ActivityWordCategorySim.SparseSims; | ||
import kb.howtokb.utils.IDMap; | ||
|
||
public class ActivityComponentSim implements ISimilarity<Integer> { | ||
|
||
private double threshold = 0.0; | ||
private SparseSims sims; // cache. | ||
private IDMap<String, Integer> ids; | ||
private ISimilarity<String> word2vecSim; | ||
|
||
public ActivityComponentSim(double threshold, IDMap<String, Integer> ids, ISimilarity<String> word2vecSim) { | ||
this.threshold = threshold; | ||
this.sims = new SparseSims((float) this.threshold); | ||
this.ids = ids; | ||
this.word2vecSim = word2vecSim; | ||
computeAllPairsSim(); | ||
} | ||
|
||
private void computeAllPairsSim() { | ||
//Progress p = new Progress(1); | ||
Integer[] ids = this.ids.values().toArray(new Integer[0]); | ||
System.out.println("\n" + ids.length + " activity components. (one dot per activity neighborhood)"); | ||
for (int i = 0; i < ids.length; i++) { | ||
//p.next(); | ||
int e1 = ids[i]; | ||
for (int j = i; j < ids.length; j++) { | ||
// cache before returning the result. | ||
int e2 = ids[j]; | ||
sims.set(e1, e2, (float) sim(e1, e2)); | ||
} | ||
} | ||
} | ||
|
||
@Override | ||
public double sim(Integer e1, Integer e2) { | ||
String word1 = ids.getKeyFromValue(e1); | ||
String word2 = ids.getKeyFromValue(e2); | ||
return word2vecSim.sim(word1, word2); | ||
} | ||
|
||
@Override | ||
public Map<Integer, Double> getNeighbors(Integer e) { | ||
System.err.println( | ||
"neighborhood for an integer " + "activity (noun or verb) not yet implemented."); | ||
return null; | ||
} | ||
|
||
@Override | ||
public boolean simThreshold(Integer e1, Integer e2, double minthreshold) throws Exception { | ||
return sim(e1, e2) >= minthreshold; | ||
} | ||
|
||
// Use this function once the object is completely constructed. | ||
/** | ||
* Also checks for e2,e1 | ||
* | ||
* @param e1 | ||
* @param e2 | ||
* @return if (e1,e2) are similar, return true. otherwise returns false | ||
*/ | ||
public boolean simFromCache(int e1, int e2) { | ||
return sims.get(e1, e2); | ||
} | ||
|
||
/** | ||
* Also checks for e2,e1 <br> | ||
* <b>NOTE: ensure that word1 and word2 are either both verbs or both noun. | ||
* And modified according to the ISim func. provided to the constructor</b> | ||
* | ||
* @param word1 | ||
* go away => v_go (we only lookup "go" in word2vec) | ||
* @param word2 | ||
* move => v_move | ||
* @return if (e1,e2) are similar, return true. otherwise returns false | ||
*/ | ||
public boolean simFromCache(String word1, String word2) { | ||
int e1 = ids.getValueFromKey(word1); | ||
int e2 = ids.getValueFromKey(word2); | ||
return sims.get(e1, e2); | ||
} | ||
|
||
} |
202 changes: 202 additions & 0 deletions
202
src/main/java/kb/howtokb/clustering/DataForClustering.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,202 @@ | ||
package kb.howtokb.clustering; | ||
|
||
import java.util.List; | ||
|
||
import kb.howtokb.clustering.sim.CategorySimilarity; | ||
import kb.howtokb.clustering.sim.SimilarityComputation; | ||
import kb.howtokb.taskframe.WikiHowTaskFrame; | ||
import kb.howtokb.utils.AdjacencyBackedSparseMatrix; | ||
import kb.howtokb.utils.SparseSimMatrix; | ||
|
||
|
||
public class DataForClustering { | ||
|
||
/*Get similarity matrix of a cluster | ||
* s(i,j) is similarity between i and j | ||
* 0 at diagonal | ||
*/ | ||
public static double[][] getSimilarityMatrix(List<WikiHowTaskFrame> list, CategorySimilarity cs) throws Exception{ | ||
double [][] res = new double[list.size()][list.size()]; | ||
for (int i=0; i<list.size()-1; i++){ | ||
for (int j=i+1; j<list.size(); j++){ | ||
double sim = SimilarityComputation.getSimilarity(cs, list.get(i), list.get(j)); | ||
res[i][j] = sim; | ||
res[j][i] = sim; | ||
System.out.print(sim + " "); | ||
} | ||
System.out.println(); | ||
} | ||
for (int i=0; i<list.size(); i++){ | ||
res[i][i] = 0; | ||
} | ||
return res; | ||
} | ||
|
||
/*Get similarity matrix of a cluster | ||
* s(i,j) is similarity between i and j | ||
* 1 at diagonal | ||
*/ | ||
public static double[][] getFullSimilarityMatrix(List<WikiHowTaskFrame> list, CategorySimilarity cs) throws Exception{ | ||
double [][] res = new double[list.size()][list.size()]; | ||
for (int i=0; i<list.size()-1; i++){ | ||
for (int j=i+1; j<list.size(); j++){ | ||
double sim = SimilarityComputation.getSimilarity(cs, list.get(i), list.get(j)); | ||
res[i][j] = sim; | ||
res[j][i] = sim; | ||
System.out.print(sim + " "); | ||
} | ||
System.out.println(); | ||
} | ||
for (int i=0; i<list.size(); i++){ | ||
res[i][i] = 1; | ||
} | ||
return res; | ||
} | ||
|
||
/*Get sparse similarity matrix of a cluster | ||
* s(i,j) is similarity between i and j | ||
* | ||
*/ | ||
public static SparseSimMatrix getSparseSimilarityMatrix(List<WikiHowTaskFrame> list, CategorySimilarity cs, double threshold) throws Exception{ | ||
SparseSimMatrix res = new SparseSimMatrix((float) threshold); | ||
for (int i=0; i<list.size()-1; i++){ | ||
for (int j=i+1; j<list.size(); j++){ | ||
double sim = SimilarityComputation.getSimilarity(cs, list.get(i), list.get(j)); | ||
res.set(i, j, (float)sim); | ||
System.out.print(sim + " "); | ||
} | ||
System.out.println(); | ||
} | ||
for (int i=0; i<list.size(); i++){ | ||
res.set(i, i, (float) 1.0); | ||
} | ||
return res; | ||
} | ||
|
||
/*Get dynamic sparse similarity matrix of a cluster | ||
* s(i,j) is similarity between i and j | ||
* | ||
*/ | ||
public static AdjacencyBackedSparseMatrix getAdjacencyBackedSparseSimilarityMatrix(List<WikiHowTaskFrame> list, CategorySimilarity cs, double threshold) throws Exception{ | ||
int n = list.size(); | ||
AdjacencyBackedSparseMatrix res = new AdjacencyBackedSparseMatrix((float) threshold, n); | ||
for (int i=0; i<list.size()-1; i++){ | ||
for (int j=i+1; j<list.size(); j++){ | ||
double sim = SimilarityComputation.getSimilarity(cs, list.get(i), list.get(j)); | ||
res.set(i, j, (float)sim); | ||
System.out.print(sim + " "); | ||
} | ||
System.out.println(); | ||
} | ||
for (int i=0; i<list.size(); i++){ | ||
res.set(i, i, (float) 1.0); | ||
} | ||
return res; | ||
} | ||
|
||
// /* | ||
// * Get Unnormalized Laplacian Matrix | ||
// */ | ||
// public static double[][] getUnnormalizedLaplacianMatrix(double[][] simMatrix){ | ||
// double[][] res = new double[simMatrix.length][simMatrix.length]; | ||
// for (int i=0; i<simMatrix.length; i++){ | ||
// double temp = 0; | ||
// for (int j=0; j<simMatrix[i].length; j++){ | ||
// if (i != j){ | ||
// res[i][j] = 0 - simMatrix[i][j]; | ||
// temp += simMatrix[i][j]; | ||
// } | ||
// } | ||
// res[i][i] = temp; | ||
// } | ||
// return res; | ||
// } | ||
// | ||
// /* | ||
// * Get Normalized Laplacian Matrix | ||
// */ | ||
// public static double[][] getNormalizedLaplacianMatrix(double[][] simMatrix){ | ||
// double[][] res = getUnnormalizedLaplacianMatrix(simMatrix); | ||
// for (int i=0; i<res.length - 1; i++){ | ||
// for (int j=i+1; j < res.length; j++){ | ||
// double tmp = res[i][j] / Math.sqrt(res[i][i] * res[j][j]); | ||
// res[i][j] = tmp; | ||
// res[j][i] = tmp; | ||
// } | ||
// res[i][i] = 1; | ||
// } | ||
// res[res.length - 1][res.length - 1] = 1; | ||
// return res; | ||
// } | ||
// | ||
// /* | ||
// * EigenDecomposition | ||
// * Return matrix to do k-means | ||
// */ | ||
// public static double[][] getMatrixForKMean(double[][] laplacian, int k){ | ||
// double [][] res = new double[laplacian.length][k]; | ||
// | ||
// //Doing eigen decomposition | ||
// EigenvalueDecomposition eigen = new EigenvalueDecomposition( | ||
// new DenseDoubleMatrix2D(laplacian)); | ||
// double[][] U = eigen.getV().toArray(); | ||
// System.out.println(eigen.getV().toString()); | ||
// //Get only k eigenvectors corresponding to k smallest eigenvalues | ||
// // Y = (U_n U_n-1 ....U_n-k+1) where lamda_n <= lamda_n-1 <= ... <= lamda_n-k+1 | ||
// for (int i=0; i < U.length; i++){ | ||
// for (int j=U.length - 1; j >= (U.length - 1) - k + 1; j--){ | ||
// res[i][U.length - 1 - j] = U[i][j]; | ||
// } | ||
// } | ||
// | ||
//// //Normalize rows of Y to use for k-means on row of Y | ||
//// for (int i=0; i<res.length; i++){ | ||
//// double sum = Arrays.stream(res[i]).sum(); | ||
//// if (sum != 0){ | ||
//// for (int j=0; j<res[i].length; j++){ | ||
//// res[i][j] /= sum; | ||
//// System.out.print(res[i][j] + " "); | ||
//// } | ||
//// } | ||
//// System.out.println(); | ||
//// } | ||
// | ||
// //Norm 1 | ||
// double[] tmp = new double[res.length]; | ||
// for (int i=0; i<res.length; i++){ | ||
// tmp[i] = 0; | ||
// for (int j=0; j<res[i].length; j++){ | ||
// tmp[i] += Math.pow(res[i][j], 2); | ||
// } | ||
// tmp[i] = Math.sqrt(tmp[i]); | ||
// } | ||
// for (int i=0; i<res.length; i++){ | ||
// if (tmp[i] != 0){ | ||
// for (int j=0; j<res[i].length; j++){ | ||
// res[i][j] /= tmp[i]; | ||
// System.out.print(res[i][j] + " "); | ||
// } | ||
// } | ||
// System.out.println(); | ||
// } | ||
// | ||
// return res; | ||
// } | ||
// | ||
// | ||
// /* | ||
// * EigenDecomposition | ||
// * Return matrix to do k-means | ||
// */ | ||
// public static double[][] getMatrixForKMean(CategorySimilarity cs, List<WikiHowTaskFrame> list, boolean unnormalized, int k) throws Exception{ | ||
// //Get similarity matrix | ||
// double[][] simMatrix = getSimilarityMatrix(list, cs); | ||
// //Get laplacian matrix | ||
// double[][] laplacian; | ||
// if (unnormalized){ | ||
// laplacian = getUnnormalizedLaplacianMatrix(simMatrix); | ||
// }else laplacian = getNormalizedLaplacianMatrix(simMatrix); | ||
// | ||
// return getMatrixForKMean(laplacian, k); | ||
// } | ||
} |
Oops, something went wrong.