Skip to content
Permalink
ac4b295237
Switch branches/tags

Name already in use

A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
Go to file
 
 
Cannot retrieve contributors at this time
202 lines (171 sloc) 7.15 KB
package kb.howtokb.clustering;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import javatools.administrative.Announce;
import kb.howtokb.clustering.basicobj.ActivityWordsCategory;
import kb.howtokb.clustering.basicobj.CSKCluster;
import kb.howtokb.utils.AutoMap;
// Merging clusters.
public class HeuristicBottomupClustering implements IBottomUpClustering<CSKCluster<Integer, Integer>, ActivityWordsCategory> {
// As we do hard-clustering, each cluster can be part of at-most one super
// cluster. Therefore, while merging, maintain a visited flag
private Map<CSKCluster<Integer, Integer>, Boolean> visited;
private List<CSKCluster<Integer, Integer>> potentiallyMergeableClusters;
// TODO a very costly map.
private Map<Integer, ActivityWordsCategory> elems;
public HeuristicBottomupClustering(String activityTb) throws IOException {
// just store the pointer, we don't need to modify this data.
elems = new HashMap<>();
loadElemsFromDb(activityTb);
Announce.message("# initial instances: " + elems.size());
// Group on the strong norm form (e.g. paint a wall and paint a long wall)
AutoMap<ActivityWordsCategory, List<ActivityWordsCategory>> lexicalClusters = new AutoMap<>();
for (Entry<Integer, ActivityWordsCategory> e : elems.entrySet())
lexicalClusters.addArrayValue(e.getValue(), e.getValue());
// Construct clusters from automap.
this.potentiallyMergeableClusters = new ArrayList<>();
for (Entry<ActivityWordsCategory, List<ActivityWordsCategory>> e : lexicalClusters.entrySet()) {
CSKCluster<Integer, Integer> smallCluster = new CSKCluster<Integer, Integer>(
e.getKey().getId());
for (ActivityWordsCategory clusterMember : e.getValue())
smallCluster.addClusterMember(clusterMember.getId());
potentiallyMergeableClusters.add(smallCluster);
}
// by default none will be unvisited or visited because map is empty.
visited = new HashMap<>();
for (CSKCluster<Integer, Integer> smallCluster : potentiallyMergeableClusters) {
visited.put(smallCluster, false);
}
}
public HeuristicBottomupClustering(Map<Integer, ActivityWordsCategory> activityTb) throws IOException {
// just store the pointer, we don't need to modify this data.
elems = new HashMap<>();
elems = activityTb;
Announce.message("# initial instances: " + elems.size());
// Group on the strong norm form (e.g. paint a wall and paint a long wall)
AutoMap<ActivityWordsCategory, List<ActivityWordsCategory>> lexicalClusters = new AutoMap<>();
for (Entry<Integer, ActivityWordsCategory> e : elems.entrySet())
lexicalClusters.addArrayValue(e.getValue(), e.getValue());
// Construct clusters from automap.
this.potentiallyMergeableClusters = new ArrayList<>();
for (Entry<ActivityWordsCategory, List<ActivityWordsCategory>> e : lexicalClusters.entrySet()) {
CSKCluster<Integer, Integer> smallCluster = new CSKCluster<Integer, Integer>(
e.getKey().getId());
for (ActivityWordsCategory clusterMember : e.getValue())
smallCluster.addClusterMember(clusterMember.getId());
potentiallyMergeableClusters.add(smallCluster);
}
// by default none will be unvisited or visited because map is empty.
visited = new HashMap<>();
for (CSKCluster<Integer, Integer> smallCluster : potentiallyMergeableClusters) {
visited.put(smallCluster, false);
}
}
private void loadElemsFromDb(String activityTb) throws IOException {
ClassLoader classLoader = Thread.currentThread().getContextClassLoader();
InputStream inputs = classLoader.getResourceAsStream(activityTb);
try (BufferedReader br = new BufferedReader(new InputStreamReader(inputs, "UTF-8"))) {
String sCurrentLine;
while ((sCurrentLine = br.readLine()) != null) {
String [] line = sCurrentLine.split("\t");
elems.put(Integer.parseInt(line[0]),
new ActivityWordsCategory(Integer.parseInt(line[0]), Integer.parseInt(line[1]), line[2]));
}
}
}
// Are these clusters brothers?
@Override
public boolean canMergeWith(CSKCluster<Integer, Integer> c1, CSKCluster<Integer, Integer> c2,
ISimilarity<ActivityWordsCategory> simFunc, double simThreshold) throws Exception {
// Cluster key = representative of the cluster.
return simFunc.simThreshold(elems.get(c1.getClusterKey()), elems.get(c2.getClusterKey()), simThreshold);
}
// create a super cluster.
// visit and merge.
public List<ActivitySuperCluster> cluster(ISimilarity<ActivityWordsCategory> simFunc, double simThreshold) throws Exception {
List<ActivitySuperCluster> merged = new ArrayList<>();
Announce.message("#initial clusters = " + potentiallyMergeableClusters.size());
//Progress p = new Progress(10);
for (CSKCluster<Integer, Integer> c1 : potentiallyMergeableClusters) {
//p.next();
// already associated with another supercluster?
if (visited.get(c1))
continue;
visited.put(c1, true);
// initialize the cluster.
ActivitySuperCluster unmerged = new ActivitySuperCluster();
unmerged.addCluster(c1);
for (CSKCluster<Integer, Integer> c2 : potentiallyMergeableClusters) {
if (visited.get(c2))
continue;
// Should c1 absorb the little cluster c2?
// TODO: think about a thresholding algorithm?
// TODO: think about early pruning to avoid canMergeWith for
// every pair.
if (canMergeWith(c1, c2, simFunc, simThreshold)) {
unmerged.addCluster(c2);
visited.put(c2, true);
}
}
// We are done with c1. No other cluster wants to associate with it
merged.add(unmerged);
}
return merged;
}
public static class ActivitySuperCluster {
// bunch of strongly normalized triples make the key.
private List<Integer> superClusterKeys;
// a list of all members of the supercluster
private List<Integer> superClusterMembers;
public List<Integer> getSuperClusterKeys() {
return superClusterKeys;
}
public List<Integer> getSuperClusterMembers() {
return superClusterMembers;
}
public ActivitySuperCluster() {
this.superClusterKeys = new ArrayList<>();
this.superClusterMembers = new ArrayList<>();
}
public void addCluster(CSKCluster<Integer, Integer> c) {
this.superClusterKeys.add(c.getClusterKey());
for (Integer smallClusterMember : c.getClusterMembers())
this.superClusterMembers.add(smallClusterMember);
}
@Override
public int hashCode() {
final int prime = 31;
int result = 1;
result = prime * result + ((superClusterKeys == null) ? 0 : superClusterKeys.hashCode());
return result;
}
@Override
public boolean equals(Object obj) {
if (this == obj)
return true;
if (obj == null)
return false;
if (getClass() != obj.getClass())
return false;
ActivitySuperCluster other = (ActivitySuperCluster) obj;
if (superClusterKeys == null) {
if (other.superClusterKeys != null)
return false;
} else if (!superClusterKeys.equals(other.superClusterKeys))
return false;
return true;
}
@Override
public String toString() {
return superClusterKeys.size() + "\tCSKSuperCluster [superClusterKeys=" + superClusterKeys
+ ", superClusterMembers=" + superClusterMembers + "]";
}
}
}