Permalink
Cannot retrieve contributors at this time
Name already in use
A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
HowToKB/src/kb/howtokb/clustering/HeuristicTopDownClusteringDynamicSparse.java
Go to fileThis commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
154 lines (130 sloc)
5.88 KB
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package kb.howtokb.clustering; | |
import java.sql.SQLException; | |
import java.util.ArrayList; | |
import java.util.List; | |
import kb.howtokb.clustering.basicobj.CSKSimpleCluster; | |
import kb.howtokb.clustering.basicobj.Instance; | |
import kb.howtokb.clustering.sim.CategorySimilarity; | |
import kb.howtokb.taskframe.WikiHowTaskFrame; | |
import kb.howtokb.utils.AdjacencyBackedSparseMatrix; | |
public class HeuristicTopDownClusteringDynamicSparse | |
implements ITopDownClustering<CSKSimpleCluster<WikiHowTaskFrame>, Instance<WikiHowTaskFrame>> { | |
private IStopping<CSKSimpleCluster<WikiHowTaskFrame>> stopper; | |
private AdjacencyBackedSparseMatrix simMatrix; | |
private CSKSimpleCluster<WikiHowTaskFrame> inputCluster; | |
public HeuristicTopDownClusteringDynamicSparse(List<WikiHowTaskFrame> pts, boolean isNormalizedCut, double threshold, int k, double thresforSparse) throws SQLException, Exception { | |
List<Instance<WikiHowTaskFrame>> ints = new ArrayList<>(); | |
for (int i=0; i<pts.size(); i++) | |
ints.add(new Instance<WikiHowTaskFrame>(i, pts.get(i))); | |
inputCluster = new CSKSimpleCluster<>(-1, ints); | |
simMatrix = DataForClustering.getAdjacencyBackedSparseSimilarityMatrix(pts, new CategorySimilarity(), thresforSparse); | |
System.out.println("================================================"); | |
this.stopper = new SimpleSimilarityStopping(threshold, simMatrix, isNormalizedCut); | |
} | |
public CSKSimpleCluster<WikiHowTaskFrame> getInputCluster() { | |
return inputCluster; | |
} | |
@Override | |
public boolean canSplitFrom(CSKSimpleCluster<WikiHowTaskFrame> c1, CSKSimpleCluster<WikiHowTaskFrame> c2, | |
ISimilarity<Instance<WikiHowTaskFrame>> simFunc, double simThreshold) { | |
// TODO Auto-generated method stub | |
return false; | |
} | |
@Override | |
public List<CSKSimpleCluster<WikiHowTaskFrame>> splitACluster(CSKSimpleCluster<WikiHowTaskFrame> c, int k) | |
throws Exception { | |
List<CSKSimpleCluster<WikiHowTaskFrame>> res = new ArrayList<>(); | |
List<Instance<WikiHowTaskFrame>> members = new ArrayList<>(c.getClusterMembers()); | |
// c.getClusterMembers(); | |
Instance<WikiHowTaskFrame> farthestPt = getFarthestPt(members); | |
CSKSimpleCluster<WikiHowTaskFrame> initCluster = new CSKSimpleCluster<>(-1, farthestPt); | |
members.remove(farthestPt); | |
CSKSimpleCluster<WikiHowTaskFrame> leftCluster = new CSKSimpleCluster<>(-1, members); | |
assignCluster(initCluster, leftCluster); | |
//System.out.println(initCluster.getClusterMembers().size() + " " + leftCluster.getClusterMembers().size()); | |
List<CSKSimpleCluster<WikiHowTaskFrame>> children = new ArrayList<>(); | |
children.add(initCluster); children.add(leftCluster); | |
if (stopper.split(c, children)){ | |
for (int i=0; i<children.size(); i++){ | |
if (children.get(i).getClusterMembers().size() > 1) | |
res.addAll(splitACluster(children.get(i), k)); | |
else res.add(children.get(i)); | |
} | |
}else{ | |
// System.out.println(c.getClusterMembers().size()); | |
res.add(c); | |
} | |
return res; | |
} | |
public Instance<WikiHowTaskFrame> getFarthestPt(List<Instance<WikiHowTaskFrame>> members){ | |
System.out.println("Find the farthest Pt...."); | |
Instance<WikiHowTaskFrame> farthestPt = new Instance<>(); | |
double min = Double.MAX_VALUE; | |
for (int i=0; i<members.size(); i++){ | |
double sum = 0; | |
for (int j=0; j<members.size(); j++){ | |
sum += simMatrix.get(members.get(i).getID(),members.get(j).getID()); | |
} | |
if (min > sum){ | |
min = sum; | |
farthestPt = members.get(i); | |
} | |
} | |
System.out.println("Finishing finding the farthest pt..."); | |
return farthestPt; | |
} | |
public void assignCluster(CSKSimpleCluster<WikiHowTaskFrame> initCluster, CSKSimpleCluster<WikiHowTaskFrame> leftCluster){ | |
System.out.println("Assign cluster....."); | |
List<Instance<WikiHowTaskFrame>> members = leftCluster.getClusterMembers(); | |
for (int i=0; i<members.size(); i++){ | |
if (simPtToCluster(members.get(i), initCluster) > simPtToCluster(members.get(i), leftCluster)){ | |
initCluster.addClusterMember(members.get(i)); | |
leftCluster.removeMember(members.get(i)); | |
i--; | |
} | |
} | |
System.out.println("Done assign....."); | |
} | |
public double simPtToCluster(Instance<WikiHowTaskFrame> pt, CSKSimpleCluster<WikiHowTaskFrame> cluster){ | |
List<Instance<WikiHowTaskFrame>> members = cluster.getClusterMembers(); | |
double sum = 0.0; | |
for (int i=0; i<members.size(); i++){ | |
sum += simMatrix.get(pt.getID(), members.get(i).getID()); | |
} | |
return sum/members.size(); | |
} | |
/** | |
* Stopping when inter-cluster similarity is greater than a given threshold | |
* @author cxchu | |
* | |
* @param <T> | |
*/ | |
public static class SimpleSimilarityStopping implements | |
IStopping<CSKSimpleCluster<WikiHowTaskFrame>> { | |
private double threshold; | |
private AdjacencyBackedSparseMatrix simMatrix; | |
private boolean isNormalizedCut; | |
public SimpleSimilarityStopping(double thres, AdjacencyBackedSparseMatrix simMatrix, boolean isNormalizedCut) { | |
this.threshold = thres; | |
this.simMatrix = simMatrix; | |
this.isNormalizedCut = isNormalizedCut; | |
} | |
@Override | |
public boolean split(CSKSimpleCluster<WikiHowTaskFrame> parent, List<CSKSimpleCluster<WikiHowTaskFrame>> children) { | |
if (parent.getClusterMembers().size() <= 1) return false; | |
if (isNormalizedCut) return split(children.get(0), children.get(1)); | |
return SimpleClusterSimilarity.averageIntraClusterSimilarity(parent, simMatrix) < this.threshold; | |
} | |
//Similar to minimizing min cut | |
public boolean split(CSKSimpleCluster<WikiHowTaskFrame> c1, CSKSimpleCluster<WikiHowTaskFrame> c2) { | |
double volC1 = SimpleClusterSimilarity.volOfCluster(c1, simMatrix); | |
double volC2 = SimpleClusterSimilarity.volOfCluster(c2, simMatrix); | |
double interC = SimpleClusterSimilarity.volOfInterClusters(c1, c2, simMatrix); | |
if (volC1 == 1 && volC2 == 1) return false; //Don't split into two singletons | |
if (volC1 == 1) return interC/volC2 < this.threshold; | |
if (volC2 == 1) return interC/volC1 < this.threshold; | |
System.out.println(volC1 + " " + volC2 + " " + interC); | |
return (interC/volC1 + interC/volC2) < this.threshold; | |
} | |
} | |
} |