Permalink
Cannot retrieve contributors at this time
Name already in use
A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
HowToKB/src/main/java/kb/howtokb/clustering/HeuristicBottomupClustering.java
Go to fileThis commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
202 lines (171 sloc)
7.15 KB
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package kb.howtokb.clustering; | |
import java.io.BufferedReader; | |
import java.io.IOException; | |
import java.io.InputStream; | |
import java.io.InputStreamReader; | |
import java.util.ArrayList; | |
import java.util.HashMap; | |
import java.util.List; | |
import java.util.Map; | |
import java.util.Map.Entry; | |
import javatools.administrative.Announce; | |
import kb.howtokb.clustering.basicobj.ActivityWordsCategory; | |
import kb.howtokb.clustering.basicobj.CSKCluster; | |
import kb.howtokb.utils.AutoMap; | |
// Merging clusters. | |
public class HeuristicBottomupClustering implements IBottomUpClustering<CSKCluster<Integer, Integer>, ActivityWordsCategory> { | |
// As we do hard-clustering, each cluster can be part of at-most one super | |
// cluster. Therefore, while merging, maintain a visited flag | |
private Map<CSKCluster<Integer, Integer>, Boolean> visited; | |
private List<CSKCluster<Integer, Integer>> potentiallyMergeableClusters; | |
// TODO a very costly map. | |
private Map<Integer, ActivityWordsCategory> elems; | |
public HeuristicBottomupClustering(String activityTb) throws IOException { | |
// just store the pointer, we don't need to modify this data. | |
elems = new HashMap<>(); | |
loadElemsFromDb(activityTb); | |
Announce.message("# initial instances: " + elems.size()); | |
// Group on the strong norm form (e.g. paint a wall and paint a long wall) | |
AutoMap<ActivityWordsCategory, List<ActivityWordsCategory>> lexicalClusters = new AutoMap<>(); | |
for (Entry<Integer, ActivityWordsCategory> e : elems.entrySet()) | |
lexicalClusters.addArrayValue(e.getValue(), e.getValue()); | |
// Construct clusters from automap. | |
this.potentiallyMergeableClusters = new ArrayList<>(); | |
for (Entry<ActivityWordsCategory, List<ActivityWordsCategory>> e : lexicalClusters.entrySet()) { | |
CSKCluster<Integer, Integer> smallCluster = new CSKCluster<Integer, Integer>( | |
e.getKey().getId()); | |
for (ActivityWordsCategory clusterMember : e.getValue()) | |
smallCluster.addClusterMember(clusterMember.getId()); | |
potentiallyMergeableClusters.add(smallCluster); | |
} | |
// by default none will be unvisited or visited because map is empty. | |
visited = new HashMap<>(); | |
for (CSKCluster<Integer, Integer> smallCluster : potentiallyMergeableClusters) { | |
visited.put(smallCluster, false); | |
} | |
} | |
public HeuristicBottomupClustering(Map<Integer, ActivityWordsCategory> activityTb) throws IOException { | |
// just store the pointer, we don't need to modify this data. | |
elems = new HashMap<>(); | |
elems = activityTb; | |
Announce.message("# initial instances: " + elems.size()); | |
// Group on the strong norm form (e.g. paint a wall and paint a long wall) | |
AutoMap<ActivityWordsCategory, List<ActivityWordsCategory>> lexicalClusters = new AutoMap<>(); | |
for (Entry<Integer, ActivityWordsCategory> e : elems.entrySet()) | |
lexicalClusters.addArrayValue(e.getValue(), e.getValue()); | |
// Construct clusters from automap. | |
this.potentiallyMergeableClusters = new ArrayList<>(); | |
for (Entry<ActivityWordsCategory, List<ActivityWordsCategory>> e : lexicalClusters.entrySet()) { | |
CSKCluster<Integer, Integer> smallCluster = new CSKCluster<Integer, Integer>( | |
e.getKey().getId()); | |
for (ActivityWordsCategory clusterMember : e.getValue()) | |
smallCluster.addClusterMember(clusterMember.getId()); | |
potentiallyMergeableClusters.add(smallCluster); | |
} | |
// by default none will be unvisited or visited because map is empty. | |
visited = new HashMap<>(); | |
for (CSKCluster<Integer, Integer> smallCluster : potentiallyMergeableClusters) { | |
visited.put(smallCluster, false); | |
} | |
} | |
private void loadElemsFromDb(String activityTb) throws IOException { | |
ClassLoader classLoader = Thread.currentThread().getContextClassLoader(); | |
InputStream inputs = classLoader.getResourceAsStream(activityTb); | |
try (BufferedReader br = new BufferedReader(new InputStreamReader(inputs, "UTF-8"))) { | |
String sCurrentLine; | |
while ((sCurrentLine = br.readLine()) != null) { | |
String [] line = sCurrentLine.split("\t"); | |
elems.put(Integer.parseInt(line[0]), | |
new ActivityWordsCategory(Integer.parseInt(line[0]), Integer.parseInt(line[1]), line[2])); | |
} | |
} | |
} | |
// Are these clusters brothers? | |
@Override | |
public boolean canMergeWith(CSKCluster<Integer, Integer> c1, CSKCluster<Integer, Integer> c2, | |
ISimilarity<ActivityWordsCategory> simFunc, double simThreshold) throws Exception { | |
// Cluster key = representative of the cluster. | |
return simFunc.simThreshold(elems.get(c1.getClusterKey()), elems.get(c2.getClusterKey()), simThreshold); | |
} | |
// create a super cluster. | |
// visit and merge. | |
public List<ActivitySuperCluster> cluster(ISimilarity<ActivityWordsCategory> simFunc, double simThreshold) throws Exception { | |
List<ActivitySuperCluster> merged = new ArrayList<>(); | |
Announce.message("#initial clusters = " + potentiallyMergeableClusters.size()); | |
//Progress p = new Progress(10); | |
for (CSKCluster<Integer, Integer> c1 : potentiallyMergeableClusters) { | |
//p.next(); | |
// already associated with another supercluster? | |
if (visited.get(c1)) | |
continue; | |
visited.put(c1, true); | |
// initialize the cluster. | |
ActivitySuperCluster unmerged = new ActivitySuperCluster(); | |
unmerged.addCluster(c1); | |
for (CSKCluster<Integer, Integer> c2 : potentiallyMergeableClusters) { | |
if (visited.get(c2)) | |
continue; | |
// Should c1 absorb the little cluster c2? | |
// TODO: think about a thresholding algorithm? | |
// TODO: think about early pruning to avoid canMergeWith for | |
// every pair. | |
if (canMergeWith(c1, c2, simFunc, simThreshold)) { | |
unmerged.addCluster(c2); | |
visited.put(c2, true); | |
} | |
} | |
// We are done with c1. No other cluster wants to associate with it | |
merged.add(unmerged); | |
} | |
return merged; | |
} | |
public static class ActivitySuperCluster { | |
// bunch of strongly normalized triples make the key. | |
private List<Integer> superClusterKeys; | |
// a list of all members of the supercluster | |
private List<Integer> superClusterMembers; | |
public List<Integer> getSuperClusterKeys() { | |
return superClusterKeys; | |
} | |
public List<Integer> getSuperClusterMembers() { | |
return superClusterMembers; | |
} | |
public ActivitySuperCluster() { | |
this.superClusterKeys = new ArrayList<>(); | |
this.superClusterMembers = new ArrayList<>(); | |
} | |
public void addCluster(CSKCluster<Integer, Integer> c) { | |
this.superClusterKeys.add(c.getClusterKey()); | |
for (Integer smallClusterMember : c.getClusterMembers()) | |
this.superClusterMembers.add(smallClusterMember); | |
} | |
@Override | |
public int hashCode() { | |
final int prime = 31; | |
int result = 1; | |
result = prime * result + ((superClusterKeys == null) ? 0 : superClusterKeys.hashCode()); | |
return result; | |
} | |
@Override | |
public boolean equals(Object obj) { | |
if (this == obj) | |
return true; | |
if (obj == null) | |
return false; | |
if (getClass() != obj.getClass()) | |
return false; | |
ActivitySuperCluster other = (ActivitySuperCluster) obj; | |
if (superClusterKeys == null) { | |
if (other.superClusterKeys != null) | |
return false; | |
} else if (!superClusterKeys.equals(other.superClusterKeys)) | |
return false; | |
return true; | |
} | |
@Override | |
public String toString() { | |
return superClusterKeys.size() + "\tCSKSuperCluster [superClusterKeys=" + superClusterKeys | |
+ ", superClusterMembers=" + superClusterMembers + "]"; | |
} | |
} | |
} |