Skip to content
Permalink
3b99e145f5
Switch branches/tags

Name already in use

A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
Go to file
 
 
Cannot retrieve contributors at this time
201 lines (170 sloc) 6.96 KB
package clustering;
import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import activity.ActivityWordsCategory;
import clustering.ds.CSKCluster;
import clustering.sim.w2v.Word2VecRunner;
import javatools.administrative.Announce;
import util.AutoMap;
import util.Util.Progress;
// Merging clusters.
public class HeuristicBottomupClustering implements IBottomUpClustering<CSKCluster<Integer, Integer>, ActivityWordsCategory> {
// As we do hard-clustering, each cluster can be part of at-most one super
// cluster. Therefore, while merging, maintain a visited flag
private Map<CSKCluster<Integer, Integer>, Boolean> visited;
private List<CSKCluster<Integer, Integer>> potentiallyMergeableClusters;
// TODO a very costly map.
private Map<Integer, ActivityWordsCategory> elems;
public HeuristicBottomupClustering(String activityTb) throws IOException {
// just store the pointer, we don't need to modify this data.
elems = new HashMap<>();
loadElemsFromDb(activityTb);
Announce.message("# initial instances: " + elems.size());
// Group on the strong norm form (e.g. paint a wall and paint a long wall)
AutoMap<ActivityWordsCategory, List<ActivityWordsCategory>> lexicalClusters = new AutoMap<>();
for (Entry<Integer, ActivityWordsCategory> e : elems.entrySet())
lexicalClusters.addArrayValue(e.getValue(), e.getValue());
// Construct clusters from automap.
this.potentiallyMergeableClusters = new ArrayList<>();
for (Entry<ActivityWordsCategory, List<ActivityWordsCategory>> e : lexicalClusters.entrySet()) {
CSKCluster<Integer, Integer> smallCluster = new CSKCluster<Integer, Integer>(
e.getKey().getId());
for (ActivityWordsCategory clusterMember : e.getValue())
smallCluster.addClusterMember(clusterMember.getId());
potentiallyMergeableClusters.add(smallCluster);
}
// by default none will be unvisited or visited because map is empty.
visited = new HashMap<>();
for (CSKCluster<Integer, Integer> smallCluster : potentiallyMergeableClusters) {
visited.put(smallCluster, false);
}
}
public HeuristicBottomupClustering(Map<Integer, ActivityWordsCategory> activityTb) throws IOException {
// just store the pointer, we don't need to modify this data.
elems = new HashMap<>();
elems = activityTb;
Announce.message("# initial instances: " + elems.size());
// Group on the strong norm form (e.g. paint a wall and paint a long wall)
AutoMap<ActivityWordsCategory, List<ActivityWordsCategory>> lexicalClusters = new AutoMap<>();
for (Entry<Integer, ActivityWordsCategory> e : elems.entrySet())
lexicalClusters.addArrayValue(e.getValue(), e.getValue());
// Construct clusters from automap.
this.potentiallyMergeableClusters = new ArrayList<>();
for (Entry<ActivityWordsCategory, List<ActivityWordsCategory>> e : lexicalClusters.entrySet()) {
CSKCluster<Integer, Integer> smallCluster = new CSKCluster<Integer, Integer>(
e.getKey().getId());
for (ActivityWordsCategory clusterMember : e.getValue())
smallCluster.addClusterMember(clusterMember.getId());
potentiallyMergeableClusters.add(smallCluster);
}
// by default none will be unvisited or visited because map is empty.
visited = new HashMap<>();
for (CSKCluster<Integer, Integer> smallCluster : potentiallyMergeableClusters) {
visited.put(smallCluster, false);
}
}
private void loadElemsFromDb(String activityTb) throws IOException {
try (BufferedReader br = new BufferedReader(new FileReader(activityTb))) {
String sCurrentLine;
while ((sCurrentLine = br.readLine()) != null) {
String [] line = sCurrentLine.split("\t");
elems.put(Integer.parseInt(line[0]),
new ActivityWordsCategory(Integer.parseInt(line[0]), Integer.parseInt(line[1]), line[2]));
}
}
}
// Are these clusters brothers?
@Override
public boolean canMergeWith(CSKCluster<Integer, Integer> c1, CSKCluster<Integer, Integer> c2,
ISimilarity<ActivityWordsCategory> simFunc, double simThreshold) throws Exception {
// Cluster key = representative of the cluster.
return simFunc.simThreshold(elems.get(c1.getClusterKey()), elems.get(c2.getClusterKey()), simThreshold);
}
// create a super cluster.
// visit and merge.
public List<ActivitySuperCluster> cluster(ISimilarity<ActivityWordsCategory> simFunc, double simThreshold) throws Exception {
List<ActivitySuperCluster> merged = new ArrayList<>();
Announce.message("#initial clusters = " + potentiallyMergeableClusters.size());
Progress p = new Progress(10);
for (CSKCluster<Integer, Integer> c1 : potentiallyMergeableClusters) {
p.next();
// already associated with another supercluster?
if (visited.get(c1))
continue;
visited.put(c1, true);
// initialize the cluster.
ActivitySuperCluster unmerged = new ActivitySuperCluster();
unmerged.addCluster(c1);
for (CSKCluster<Integer, Integer> c2 : potentiallyMergeableClusters) {
if (visited.get(c2))
continue;
// Should c1 absorb the little cluster c2?
// TODO: think about a thresholding algorithm?
// TODO: think about early pruning to avoid canMergeWith for
// every pair.
if (canMergeWith(c1, c2, simFunc, simThreshold)) {
unmerged.addCluster(c2);
visited.put(c2, true);
}
}
// We are done with c1. No other cluster wants to associate with it
merged.add(unmerged);
}
return merged;
}
public static class ActivitySuperCluster {
// bunch of strongly normalized triples make the key.
private List<Integer> superClusterKeys;
// a list of all members of the supercluster
private List<Integer> superClusterMembers;
public List<Integer> getSuperClusterKeys() {
return superClusterKeys;
}
public List<Integer> getSuperClusterMembers() {
return superClusterMembers;
}
public ActivitySuperCluster() {
this.superClusterKeys = new ArrayList<>();
this.superClusterMembers = new ArrayList<>();
}
public void addCluster(CSKCluster<Integer, Integer> c) {
this.superClusterKeys.add(c.getClusterKey());
for (Integer smallClusterMember : c.getClusterMembers())
this.superClusterMembers.add(smallClusterMember);
}
@Override
public int hashCode() {
final int prime = 31;
int result = 1;
result = prime * result + ((superClusterKeys == null) ? 0 : superClusterKeys.hashCode());
return result;
}
@Override
public boolean equals(Object obj) {
if (this == obj)
return true;
if (obj == null)
return false;
if (getClass() != obj.getClass())
return false;
ActivitySuperCluster other = (ActivitySuperCluster) obj;
if (superClusterKeys == null) {
if (other.superClusterKeys != null)
return false;
} else if (!superClusterKeys.equals(other.superClusterKeys))
return false;
return true;
}
@Override
public String toString() {
return superClusterKeys.size() + "\tCSKSuperCluster [superClusterKeys=" + superClusterKeys
+ ", superClusterMembers=" + superClusterMembers + "]";
}
}
}