Skip to content
Permalink
0d82ff1dc4
Switch branches/tags

Name already in use

A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
Go to file
 
 
Cannot retrieve contributors at this time
147 lines (123 sloc) 5.32 KB
package kb.howtokb.clustering;
import java.sql.SQLException;
import java.util.ArrayList;
import java.util.List;
import kb.howtokb.clustering.basicobj.CSKSimpleCluster;
import kb.howtokb.clustering.basicobj.Instance;
import kb.howtokb.clustering.sim.CategorySimilarity;
import kb.howtokb.taskframe.WikiHowTaskFrame;
public class HeuristicTopDownClustering
implements ITopDownClustering<CSKSimpleCluster<WikiHowTaskFrame>, Instance<WikiHowTaskFrame>> {
private IStopping<CSKSimpleCluster<WikiHowTaskFrame>> stopper;
private double[][] simMatrix;
private CSKSimpleCluster<WikiHowTaskFrame> inputCluster;
public HeuristicTopDownClustering(List<WikiHowTaskFrame> pts, boolean isNormalizedCut, double threshold, int k) throws SQLException, Exception {
List<Instance<WikiHowTaskFrame>> ints = new ArrayList<>();
for (int i=0; i<pts.size(); i++)
ints.add(new Instance<WikiHowTaskFrame>(i, pts.get(i)));
inputCluster = new CSKSimpleCluster<>(-1, ints);
simMatrix = DataForClustering.getFullSimilarityMatrix(pts, new CategorySimilarity());
this.stopper = new SimpleSimilarityStopping(threshold, simMatrix, isNormalizedCut);
}
public CSKSimpleCluster<WikiHowTaskFrame> getInputCluster() {
return inputCluster;
}
@Override
public boolean canSplitFrom(CSKSimpleCluster<WikiHowTaskFrame> c1, CSKSimpleCluster<WikiHowTaskFrame> c2,
ISimilarity<Instance<WikiHowTaskFrame>> simFunc, double simThreshold) {
// TODO Auto-generated method stub
return false;
}
@Override
public List<CSKSimpleCluster<WikiHowTaskFrame>> splitACluster(CSKSimpleCluster<WikiHowTaskFrame> c, int k)
throws Exception {
List<CSKSimpleCluster<WikiHowTaskFrame>> res = new ArrayList<>();
List<Instance<WikiHowTaskFrame>> members = new ArrayList<>(c.getClusterMembers());
// c.getClusterMembers();
Instance<WikiHowTaskFrame> farthestPt = getFarthestPt(members);
CSKSimpleCluster<WikiHowTaskFrame> initCluster = new CSKSimpleCluster<>(-1, farthestPt);
members.remove(farthestPt);
CSKSimpleCluster<WikiHowTaskFrame> leftCluster = new CSKSimpleCluster<>(-1, members);
assignCluster(initCluster, leftCluster);
List<CSKSimpleCluster<WikiHowTaskFrame>> children = new ArrayList<>();
children.add(initCluster); children.add(leftCluster);
if (stopper.split(c, children)){
for (int i=0; i<children.size(); i++){
if (children.get(i).getClusterMembers().size() > 1)
res.addAll(splitACluster(children.get(i), k));
else res.add(children.get(i));
}
}else{
// System.out.println(c.getClusterMembers().size());
res.add(c);
}
return res;
}
public Instance<WikiHowTaskFrame> getFarthestPt(List<Instance<WikiHowTaskFrame>> members){
Instance<WikiHowTaskFrame> farthestPt = new Instance<>();
double min = Double.MAX_VALUE;
for (int i=0; i<members.size(); i++){
double sum = 0;
for (int j=0; j<members.size(); j++){
sum += simMatrix[members.get(i).getID()][members.get(j).getID()];
}
if (min > sum){
min = sum;
farthestPt = members.get(i);
}
}
return farthestPt;
}
public void assignCluster(CSKSimpleCluster<WikiHowTaskFrame> initCluster, CSKSimpleCluster<WikiHowTaskFrame> leftCluster){
List<Instance<WikiHowTaskFrame>> members = leftCluster.getClusterMembers();
for (int i=0; i<members.size(); i++){
if (simPtToCluster(members.get(i), initCluster) > simPtToCluster(members.get(i), leftCluster)){
initCluster.addClusterMember(members.get(i));
leftCluster.removeMember(members.get(i));
i--;
}
}
}
public double simPtToCluster(Instance<WikiHowTaskFrame> pt, CSKSimpleCluster<WikiHowTaskFrame> cluster){
List<Instance<WikiHowTaskFrame>> members = cluster.getClusterMembers();
double sum = 0.0;
for (int i=0; i<members.size(); i++){
sum += simMatrix[pt.getID()][members.get(i).getID()];
}
return sum/members.size();
}
/**
* Stopping when inter-cluster similarity is greater than a given threshold
* @author cxchu
*
* @param <T>
*/
public static class SimpleSimilarityStopping implements
IStopping<CSKSimpleCluster<WikiHowTaskFrame>> {
private double threshold;
private double[][] simMatrix;
private boolean isNormalizedCut;
public SimpleSimilarityStopping(double thres, double [][] simMatrix, boolean isNormalizedCut) {
this.threshold = thres;
this.simMatrix = simMatrix;
this.isNormalizedCut = isNormalizedCut;
}
@Override
public boolean split(CSKSimpleCluster<WikiHowTaskFrame> parent, List<CSKSimpleCluster<WikiHowTaskFrame>> children) {
if (parent.getClusterMembers().size() <= 1) return false;
if (isNormalizedCut) return split(children.get(0), children.get(1));
return SimpleClusterSimilarity.averageIntraClusterSimilarity(parent, simMatrix) < this.threshold;
}
//Similar to minimizing min cut
public boolean split(CSKSimpleCluster<WikiHowTaskFrame> c1, CSKSimpleCluster<WikiHowTaskFrame> c2) {
double volC1 = SimpleClusterSimilarity.volOfCluster(c1, simMatrix);
double volC2 = SimpleClusterSimilarity.volOfCluster(c2, simMatrix);
double interC = SimpleClusterSimilarity.volOfInterClusters(c1, c2, simMatrix);
if (volC1 == 1 && volC2 == 1) return false; //Don't split into two singletons
if (volC1 == 1) return interC/volC2 < this.threshold;
if (volC2 == 1) return interC/volC1 < this.threshold;
System.out.println(volC1 + " " + volC2 + " " + interC);
return (interC/volC1 + interC/volC2) < this.threshold;
}
}
}