Skip to content
Permalink
Browse files

Update KB Organization: Clustering

  • Loading branch information
cxchu committed Mar 14, 2017
1 parent ce0c76d commit d63876f642d6a6e9e43d20b5f6f83b03b58291a2
Showing with 3,822 additions and 127 deletions.
  1. BIN jars/javatools-1.0.0.jar
  2. BIN jars/json-simple-1.1.1.jar
  3. +37 −0 src/main/java/kb/howtokb/clustering/ActivityCachedSim.java
  4. +86 −0 src/main/java/kb/howtokb/clustering/ActivityComponentSim.java
  5. +202 −0 src/main/java/kb/howtokb/clustering/DataForClustering.java
  6. +199 −0 src/main/java/kb/howtokb/clustering/HeuristicBottomupClustering.java
  7. +147 −0 src/main/java/kb/howtokb/clustering/HeuristicTopDownClustering.java
  8. +154 −0 src/main/java/kb/howtokb/clustering/HeuristicTopDownClusteringDynamicSparse.java
  9. +8 −0 src/main/java/kb/howtokb/clustering/IBottomUpClustering.java
  10. +17 −0 src/main/java/kb/howtokb/clustering/ISimilarity.java
  11. +28 −0 src/main/java/kb/howtokb/clustering/ITopDownClustering.java
  12. +171 −0 src/main/java/kb/howtokb/clustering/SimpleClusterSimilarity.java
  13. +141 −0 src/main/java/kb/howtokb/clustering/SimplePruningSimilarity.java
  14. +58 −0 src/main/java/kb/howtokb/clustering/basicobj/ActivityWordsCategory.java
  15. +6 −0 src/main/java/kb/howtokb/clustering/basicobj/BasicDataPt.java
  16. +85 −0 src/main/java/kb/howtokb/clustering/basicobj/CSKCluster.java
  17. +23 −0 src/main/java/kb/howtokb/clustering/basicobj/CSKSimpleCluster.java
  18. +40 −0 src/main/java/kb/howtokb/clustering/basicobj/Instance.java
  19. +83 −0 src/main/java/kb/howtokb/clustering/sim/ActivityWordCategorySim.java
  20. +174 −0 src/main/java/kb/howtokb/clustering/sim/CategorySimilarity.java
  21. +53 −0 src/main/java/kb/howtokb/clustering/sim/Coefficient.java
  22. +312 −0 src/main/java/kb/howtokb/clustering/sim/SimilarityComputation.java
  23. +224 −0 src/main/java/kb/howtokb/clustering/sim/StringSimilarity.java
  24. +451 −0 src/main/java/kb/howtokb/clustering/sim/w2v/Word2VecRunner.java
  25. +174 −0 src/main/java/kb/howtokb/clustering/sim/w2v/Word2VecSimilarity.java
  26. +59 −62 src/main/java/kb/howtokb/global/Global.java
  27. +7 −2 src/main/java/kb/howtokb/taskframe/extractor/TextToOpenIEResult.java
  28. +12 −6 src/main/java/kb/howtokb/taskframe/extractor/{TextToActivity.java → TextToWikiHowTaskFrame.java}
  29. +42 −1 src/main/java/kb/howtokb/tools/InformationExtraction.java
  30. +33 −22 src/main/java/kb/howtokb/tools/NormalizationText.java
  31. +26 −0 src/main/java/kb/howtokb/tools/StructureConverter.java
  32. +31 −0 src/main/java/kb/howtokb/utils/AdjacencyBackedSparseMatrix.java
  33. +49 −0 src/main/java/kb/howtokb/utils/BijectiveMap.java
  34. +25 −0 src/main/java/kb/howtokb/utils/IDMap.java
  35. +91 −0 src/main/java/kb/howtokb/utils/SQLiteJDBCConnector.java
  36. +356 −0 src/main/java/kb/howtokb/utils/SortedMultiMap.java
  37. +34 −0 src/main/java/kb/howtokb/utils/SparseSimMatrix.java
  38. +34 −34 src/test/java/kb/howtokb/{TextToActivityTest.java → TextToWikiHowTaskFrameTest.java}
  39. +58 −0 src/test/java/kb/howtokb/clustering/HeuristicBottomUpClusteringTest.java
  40. +41 −0 src/test/java/kb/howtokb/clustering/HeuristicTopDownClusteringDynamicSparseTest.java
  41. +51 −0 src/test/java/kb/howtokb/clustering/HeuristicTopDownClusteringTest.java
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,37 @@
package kb.howtokb.clustering;

import edu.stanford.nlp.util.Pair;
import kb.howtokb.utils.BijectiveMap;

/**
 * Answers "are these two activities similar?" purely from pre-computed caches.
 *
 * <p>An activity string is translated through {@code ids} into a pair of integer
 * component ids. The first id of each pair is compared via {@code vSim} and the
 * second via {@code nSim} (presumably the verb and the noun of the activity --
 * confirm against the code that builds {@code ids}).
 */
public class ActivityCachedSim {

    /** Activity string -> (first component id, second component id). */
    private BijectiveMap<String, Pair<Integer, Integer>> ids;

    /** Cached component similarities: {@code vSim} for the first id, {@code nSim} for the second. */
    private ActivityComponentSim vSim, nSim;

    /**
     * @param ids  lookup from activity string to its component id pair
     * @param vSim cached similarity consulted for the first id of each pair
     * @param nSim cached similarity consulted for the second id of each pair
     */
    public ActivityCachedSim(BijectiveMap<String, Pair<Integer, Integer>> ids, ActivityComponentSim vSim,
            ActivityComponentSim nSim) {
        this.ids = ids;
        this.vSim = vSim;
        this.nSim = nSim;
    }

    /**
     * Two id pairs are similar when at least one of their components is cached as
     * similar. {@code nSim} is only consulted when the {@code vSim} lookup fails.
     *
     * <p>An earlier revision required BOTH components to match:
     * {@code vSim.simFromCache(e1.first, e2.first) && nSim.simFromCache(e1.second, e2.second)}.
     */
    private boolean sim(Pair<Integer, Integer> e1, Pair<Integer, Integer> e2) {
        // v1/v2 are similar, or else n1/n2 are similar
        return vSim.simFromCache(e1.first, e2.first)
                || nSim.simFromCache(e1.second, e2.second);
    }

    /**
     * Looks both activities up in {@code ids} and compares their id pairs.
     *
     * @param a1 first activity string
     * @param a2 second activity string
     * @return {@code true} if either component pair is cached as similar
     */
    public boolean sim(String a1, String a2) {
        // NOTE(review): no guard for activities missing from ids -- behaviour then
        // depends on what BijectiveMap.getValueFromKey returns; confirm.
        return sim(this.ids.getValueFromKey(a1), this.ids.getValueFromKey(a2));
    }
}
@@ -0,0 +1,86 @@
package kb.howtokb.clustering;

import java.util.Map;

import kb.howtokb.clustering.sim.ActivityWordCategorySim.SparseSims;
import kb.howtokb.utils.IDMap;

/**
 * Similarity between activity components (words) addressed by integer id.
 *
 * <p>On construction every unordered pair of known ids (including each id with
 * itself) is scored through the supplied word-level similarity and written into
 * a {@code SparseSims} cache created with the given threshold. After that,
 * {@code simFromCache} answers from the cache while {@code sim} always
 * recomputes.
 */
public class ActivityComponentSim implements ISimilarity<Integer> {

    private double threshold = 0.0;
    /** Cache of pairwise results, filled once by the constructor. */
    private SparseSims sims;
    /** Word <-> integer id mapping. */
    private IDMap<String, Integer> ids;
    /** Word-level similarity that the id-level {@link #sim} delegates to. */
    private ISimilarity<String> word2vecSim;

    /**
     * Stores the collaborators and immediately fills the cache for all id pairs.
     *
     * @param threshold   value handed (as a float) to the {@code SparseSims} cache
     * @param ids         word/id mapping whose values define the set of ids to cache
     * @param word2vecSim similarity used to score two words
     */
    public ActivityComponentSim(double threshold, IDMap<String, Integer> ids, ISimilarity<String> word2vecSim) {
        this.threshold = threshold;
        this.sims = new SparseSims((float) this.threshold);
        this.ids = ids;
        this.word2vecSim = word2vecSim;
        computeAllPairsSim();
    }

    /** Scores every pair (i, j) with i <= j of the known ids and stores it in the cache. */
    private void computeAllPairsSim() {
        Integer[] componentIds = this.ids.values().toArray(new Integer[0]);
        final int count = componentIds.length;
        System.out.println("\n" + count + " activity components. (one dot per activity neighborhood)");
        for (int outer = 0; outer < count; outer++) {
            int left = componentIds[outer];
            // start at 'outer' so the diagonal (id with itself) is cached as well
            for (int inner = outer; inner < count; inner++) {
                int right = componentIds[inner];
                float score = (float) sim(left, right);
                sims.set(left, right, score);
            }
        }
    }

    /**
     * Recomputes (does not read the cache) the similarity of the two words that
     * the given ids stand for.
     */
    @Override
    public double sim(Integer e1, Integer e2) {
        String first = ids.getKeyFromValue(e1);
        String second = ids.getKeyFromValue(e2);
        return word2vecSim.sim(first, second);
    }

    /**
     * Not implemented: reports that on standard error and returns {@code null}.
     */
    @Override
    public Map<Integer, Double> getNeighbors(Integer e) {
        System.err.println("neighborhood for an integer activity (noun or verb) not yet implemented.");
        return null;
    }

    /**
     * @return {@code true} when the freshly computed similarity of the two ids is
     *         at least {@code minthreshold}
     */
    @Override
    public boolean simThreshold(Integer e1, Integer e2, double minthreshold) throws Exception {
        double score = sim(e1, e2);
        return score >= minthreshold;
    }

    /**
     * Cache lookup by id. Only meaningful once the constructor has finished.
     *
     * <p>The original author notes that the lookup also covers (e2, e1); that
     * symmetry is a property of {@code SparseSims} and is not visible here.
     *
     * @param e1 id of the first component
     * @param e2 id of the second component
     * @return whatever the cache reports for the pair ({@code true} = similar)
     */
    public boolean simFromCache(int e1, int e2) {
        return sims.get(e1, e2);
    }

    /**
     * Cache lookup by word: both words are translated to ids first.
     *
     * <p><b>NOTE: make sure word1 and word2 are either both verbs or both nouns,
     * and already normalised the way the similarity passed to the constructor
     * expects.</b> Author's example: "go away" is looked up as "go" (v_go),
     * "move" as v_move.
     *
     * @param word1 first word, in the form used as key in the id map
     * @param word2 second word, in the form used as key in the id map
     * @return whatever the cache reports for the pair ({@code true} = similar)
     */
    public boolean simFromCache(String word1, String word2) {
        // NOTE(review): the results are unboxed to int here; if the id map can
        // return null for an unknown word this throws -- confirm IDMap's contract.
        int firstId = ids.getValueFromKey(word1);
        int secondId = ids.getValueFromKey(word2);
        return sims.get(firstId, secondId);
    }

}
@@ -0,0 +1,202 @@
package kb.howtokb.clustering;

import java.util.List;

import kb.howtokb.clustering.sim.CategorySimilarity;
import kb.howtokb.clustering.sim.SimilarityComputation;
import kb.howtokb.taskframe.WikiHowTaskFrame;
import kb.howtokb.utils.AdjacencyBackedSparseMatrix;
import kb.howtokb.utils.SparseSimMatrix;


/**
 * Static builders that turn a list of task frames into pairwise similarity
 * matrices (dense arrays or sparse matrix objects) for clustering.
 *
 * <p>Every builder scores each unordered pair (i, j) with i &lt; j exactly once
 * via {@code SimilarityComputation.getSimilarity}, and echoes each score to
 * standard output followed by a space, ending the line after each row i.
 */
public class DataForClustering {

    /**
     * Dense symmetric similarity matrix with 0 on the diagonal.
     *
     * @param list task frames to compare; entry (i, j) refers to list positions i and j
     * @param cs   category similarity forwarded to the similarity computation
     * @return an n x n matrix, n = {@code list.size()}
     * @throws Exception whatever the similarity computation throws
     */
    public static double[][] getSimilarityMatrix(List<WikiHowTaskFrame> list, CategorySimilarity cs) throws Exception {
        return buildDenseMatrix(list, cs, 0);
    }

    /**
     * Dense symmetric similarity matrix with 1 on the diagonal.
     *
     * @param list task frames to compare; entry (i, j) refers to list positions i and j
     * @param cs   category similarity forwarded to the similarity computation
     * @return an n x n matrix, n = {@code list.size()}
     * @throws Exception whatever the similarity computation throws
     */
    public static double[][] getFullSimilarityMatrix(List<WikiHowTaskFrame> list, CategorySimilarity cs) throws Exception {
        return buildDenseMatrix(list, cs, 1);
    }

    /** Fills the off-diagonal entries pair by pair, then writes {@code diagonal} on the diagonal. */
    private static double[][] buildDenseMatrix(List<WikiHowTaskFrame> list, CategorySimilarity cs,
            double diagonal) throws Exception {
        final int n = list.size();
        final double[][] matrix = new double[n][n];
        for (int row = 0; row < n - 1; row++) {
            for (int col = row + 1; col < n; col++) {
                double score = SimilarityComputation.getSimilarity(cs, list.get(row), list.get(col));
                // mirror the score so the matrix stays symmetric
                matrix[row][col] = score;
                matrix[col][row] = score;
                System.out.print(score + " ");
            }
            System.out.println();
        }
        for (int d = 0; d < n; d++) {
            matrix[d][d] = diagonal;
        }
        return matrix;
    }

    /**
     * Sparse similarity matrix: each pair (i, j), i &lt; j, is stored once (as a
     * float) and every diagonal entry is set to 1.
     *
     * @param list      task frames to compare
     * @param cs        category similarity forwarded to the similarity computation
     * @param threshold handed (as a float) to the {@code SparseSimMatrix}
     *                  constructor; how it is applied is defined by that class
     * @throws Exception whatever the similarity computation throws
     */
    public static SparseSimMatrix getSparseSimilarityMatrix(List<WikiHowTaskFrame> list, CategorySimilarity cs, double threshold) throws Exception {
        final int n = list.size();
        SparseSimMatrix sparse = new SparseSimMatrix((float) threshold);
        for (int row = 0; row < n - 1; row++) {
            for (int col = row + 1; col < n; col++) {
                double score = SimilarityComputation.getSimilarity(cs, list.get(row), list.get(col));
                sparse.set(row, col, (float) score);
                System.out.print(score + " ");
            }
            System.out.println();
        }
        for (int d = 0; d < n; d++) {
            sparse.set(d, d, 1.0f);
        }
        return sparse;
    }

    /**
     * Same as the sparse builder, but backed by an
     * {@code AdjacencyBackedSparseMatrix} that is created with the threshold and
     * the number of frames.
     *
     * @param list      task frames to compare
     * @param cs        category similarity forwarded to the similarity computation
     * @param threshold handed (as a float) to the matrix constructor; how it is
     *                  applied is defined by that class
     * @throws Exception whatever the similarity computation throws
     */
    public static AdjacencyBackedSparseMatrix getAdjacencyBackedSparseSimilarityMatrix(List<WikiHowTaskFrame> list, CategorySimilarity cs, double threshold) throws Exception {
        final int n = list.size();
        AdjacencyBackedSparseMatrix sparse = new AdjacencyBackedSparseMatrix((float) threshold, n);
        for (int row = 0; row < n - 1; row++) {
            for (int col = row + 1; col < n; col++) {
                double score = SimilarityComputation.getSimilarity(cs, list.get(row), list.get(col));
                sparse.set(row, col, (float) score);
                System.out.print(score + " ");
            }
            System.out.println();
        }
        for (int d = 0; d < n; d++) {
            sparse.set(d, d, 1.0f);
        }
        return sparse;
    }

    // Disabled spectral-clustering helpers (Laplacian construction and
    // eigen-decomposition for k-means), kept for reference.
// /*
// * Get Unnormalized Laplacian Matrix
// */
// public static double[][] getUnnormalizedLaplacianMatrix(double[][] simMatrix){
// double[][] res = new double[simMatrix.length][simMatrix.length];
// for (int i=0; i<simMatrix.length; i++){
// double temp = 0;
// for (int j=0; j<simMatrix[i].length; j++){
// if (i != j){
// res[i][j] = 0 - simMatrix[i][j];
// temp += simMatrix[i][j];
// }
// }
// res[i][i] = temp;
// }
// return res;
// }
//
// /*
// * Get Normalized Laplacian Matrix
// */
// public static double[][] getNormalizedLaplacianMatrix(double[][] simMatrix){
// double[][] res = getUnnormalizedLaplacianMatrix(simMatrix);
// for (int i=0; i<res.length - 1; i++){
// for (int j=i+1; j < res.length; j++){
// double tmp = res[i][j] / Math.sqrt(res[i][i] * res[j][j]);
// res[i][j] = tmp;
// res[j][i] = tmp;
// }
// res[i][i] = 1;
// }
// res[res.length - 1][res.length - 1] = 1;
// return res;
// }
//
// /*
// * EigenDecomposition
// * Return matrix to do k-means
// */
// public static double[][] getMatrixForKMean(double[][] laplacian, int k){
// double [][] res = new double[laplacian.length][k];
//
// //Doing eigen decomposition
// EigenvalueDecomposition eigen = new EigenvalueDecomposition(
// new DenseDoubleMatrix2D(laplacian));
// double[][] U = eigen.getV().toArray();
// System.out.println(eigen.getV().toString());
// //Get only k eigenvectors corresponding to k smallest eigenvalues
// // Y = (U_n U_n-1 ....U_n-k+1) where lamda_n <= lamda_n-1 <= ... <= lamda_n-k+1
// for (int i=0; i < U.length; i++){
// for (int j=U.length - 1; j >= (U.length - 1) - k + 1; j--){
// res[i][U.length - 1 - j] = U[i][j];
// }
// }
//
//// //Normalize rows of Y to use for k-means on row of Y
//// for (int i=0; i<res.length; i++){
//// double sum = Arrays.stream(res[i]).sum();
//// if (sum != 0){
//// for (int j=0; j<res[i].length; j++){
//// res[i][j] /= sum;
//// System.out.print(res[i][j] + " ");
//// }
//// }
//// System.out.println();
//// }
//
// //Norm 1
// double[] tmp = new double[res.length];
// for (int i=0; i<res.length; i++){
// tmp[i] = 0;
// for (int j=0; j<res[i].length; j++){
// tmp[i] += Math.pow(res[i][j], 2);
// }
// tmp[i] = Math.sqrt(tmp[i]);
// }
// for (int i=0; i<res.length; i++){
// if (tmp[i] != 0){
// for (int j=0; j<res[i].length; j++){
// res[i][j] /= tmp[i];
// System.out.print(res[i][j] + " ");
// }
// }
// System.out.println();
// }
//
// return res;
// }
//
//
// /*
// * EigenDecomposition
// * Return matrix to do k-means
// */
// public static double[][] getMatrixForKMean(CategorySimilarity cs, List<WikiHowTaskFrame> list, boolean unnormalized, int k) throws Exception{
// //Get similarity matrix
// double[][] simMatrix = getSimilarityMatrix(list, cs);
// //Get laplacian matrix
// double[][] laplacian;
// if (unnormalized){
// laplacian = getUnnormalizedLaplacianMatrix(simMatrix);
// }else laplacian = getNormalizedLaplacianMatrix(simMatrix);
//
// return getMatrixForKMean(laplacian, k);
// }
}

0 comments on commit d63876f

Please sign in to comment.
You can’t perform that action at this time.