Skip to content
Permalink
Browse files

Update clustering

  • Loading branch information
cxchu
cxchu committed Mar 30, 2017
1 parent 1bb5d45 commit ac4b2952370cd8352c331a0f30882a51d2885f49
57 pom.xml
@@ -1,13 +1,13 @@
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>

<groupId>kb</groupId>
<artifactId>howtokb</artifactId>
<version>0.0.1-SNAPSHOT</version>
<packaging>jar</packaging>
<build>
<groupId>kb</groupId>
<artifactId>howtokb</artifactId>
<version>0.0.1-SNAPSHOT</version>
<packaging>jar</packaging>

<build>
<sourceDirectory>src</sourceDirectory>
<plugins>
<plugin>
@@ -33,7 +33,6 @@
</descriptorRefs>
</configuration>


<executions>
<execution>
<phase>package</phase>
@@ -44,7 +43,7 @@
<transformers>
<transformer
implementation="org.apache.maven.plugins.shade.resource.ManifestResourceTransformer">
<mainClass>test.java.kb.howtokb.TextToWikiHowTaskFrameTest</mainClass>
<mainClass>kb.howtokb.clustering.HeuristicBottomUpClusteringTest</mainClass>
</transformer>
</transformers>
</configuration>
@@ -54,4 +53,42 @@
</plugin>
</plugins>
</build>
<properties>
<maven.compiler.source>1.7</maven.compiler.source>
<maven.compiler.target>1.7</maven.compiler.target>
</properties>
<dependencies>
<dependency>
<groupId>com.googlecode.json-simple</groupId>
<artifactId>json-simple</artifactId>
<version>1.1.1</version>
</dependency>
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.9.1</version>
</dependency>
<dependency>
<groupId>edu.stanford.nlp</groupId>
<artifactId>stanford-corenlp</artifactId>
<version>3.3.0</version>
</dependency>
<dependency>
<groupId>edu.washington.cs.knowitall.openie</groupId>
<artifactId>openie_2.10</artifactId>
<version>4.2.1</version>
</dependency>
<dependency>
<groupId>colt</groupId>
<artifactId>colt</artifactId>
<version>1.2.0</version>
</dependency>
<dependency>
<groupId>niket.tools</groupId>
<artifactId>javatools</artifactId>
<version>1.0.0</version>
</dependency>

</dependencies>

</project>
@@ -1,8 +1,9 @@
package kb.howtokb.clustering;

import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
@@ -81,7 +82,9 @@ public class HeuristicBottomupClustering implements IBottomUpClustering<CSKClust
}

private void loadElemsFromDb(String activityTb) throws IOException {
try (BufferedReader br = new BufferedReader(new FileReader(activityTb))) {
ClassLoader classLoader = Thread.currentThread().getContextClassLoader();
InputStream inputs = classLoader.getResourceAsStream(activityTb);
try (BufferedReader br = new BufferedReader(new InputStreamReader(inputs, "UTF-8"))) {
String sCurrentLine;
while ((sCurrentLine = br.readLine()) != null) {
String [] line = sCurrentLine.split("\t");
@@ -11,14 +11,17 @@ import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;

import org.json.simple.JSONObject;
import org.json.simple.parser.JSONParser;
import org.json.simple.parser.ParseException;

import kb.howtokb.reader.TaskFrameReader;
import kb.howtokb.taskframe.WikiHowTaskFrame;
import kb.howtokb.utils.AutoMap;
import kb.howtokb.utils.SQLiteJDBCConnector;
import kb.howtokb.wkhobject.Category_Json;

public class InformationExtraction {

@@ -141,6 +144,57 @@ public class InformationExtraction {
return allframe;
}

/**
 * Returns the frames whose activity belongs to one of the categories that
 * {@link #getListofAllChildren(int)} reports for the given category.
 *
 * <p>The category name is first resolved to an id with {@code getCategoryID}. A frame
 * is kept when the integer value of its activity's category id is among the reported
 * ids. The input list is not modified and the relative order of the kept frames is
 * the order in which they appear in {@code allframe}. Progress is printed to stdout.
 *
 * @param allframe frames to filter
 * @param category name of the parent category
 * @return a new list with the matching frames, in input order; empty when none match
 * @throws Exception if resolving the category or its children fails (propagated from
 *         the lookups), or if a frame's category id is not a parsable integer
 */
public List<WikiHowTaskFrame> getFrameFromCategory(List<WikiHowTaskFrame> allframe, String category) throws Exception {
    System.out.println("Get a list of activities in a same category " + category);
    List<WikiHowTaskFrame> list = new ArrayList<>();
    // Hash the ids once so every frame costs a single lookup instead of a scan
    // over the whole id list.
    Set<Integer> categoryIds = new HashSet<>(
            InformationExtraction.getListofAllChildren(InformationExtraction.getCategoryID(category)));

    for (WikiHowTaskFrame frame : allframe) {
        int frameCategoryId = Integer.parseInt(frame.getActivity().getCategoryID());
        if (categoryIds.contains(frameCategoryId)) {
            list.add(frame);
        }
    }
    System.out.println("Number of activities in a same category: " + list.size());
    return list;
}

/**
 * Lists the ids of every category whose root path contains the given category id.
 *
 * <p>Reads all rows of table {@code categoryjson}, parses each row's JSON with
 * {@code Category_Json.fromJson} and records its root path (a list of category ids,
 * e.g. {@code "rootpath":[57,54,52,150,1]}) under the row's id. Rows whose JSON cannot
 * be processed are reported on stdout and skipped. Categories without a root path are
 * ignored.
 *
 * @param id category id to look for in the root paths
 * @return ids of the categories whose root path contains {@code id}; empty when none do
 * @throws SQLException if iterating or closing the result set fails
 * @throws ClassNotFoundException declared by the original signature; presumably raised
 *         by the connector when the JDBC driver is missing (confirm against
 *         {@code SQLiteJDBCConnector})
 * @throws IOException declared by the original signature; presumably raised by the
 *         connector (confirm against {@code SQLiteJDBCConnector})
 */
public static List<Integer> getListofAllChildren(int id) throws SQLException, ClassNotFoundException, IOException {
    Map<Integer, List<Integer>> parentChains = new AutoMap<>();
    // "rootpath":[57,54,52,150,1]
    // try-with-resources: the result set is released even when iteration fails.
    try (ResultSet rs = SQLiteJDBCConnector.q("select id, json from categoryjson")) {
        while (rs.next()) {
            try {
                parentChains.put(rs.getInt(1), Category_Json.fromJson(rs.getString(2)).getRootpath());
            } catch (Exception parseFailure) {
                // Best effort: one malformed row must not abort the whole scan.
                System.out.print("\n---- JSONException in category: " + rs.getInt(1));
            }
        }
    }

    List<Integer> res = new ArrayList<>();
    for (Entry<Integer, List<Integer>> chain : parentChains.entrySet()) {
        List<Integer> rootPath = chain.getValue();
        // Map keys are unique, so no duplicate check on the result list is needed.
        if (rootPath != null && rootPath.contains(id)) {
            res.add(chain.getKey());
        }
    }
    return res;
}

/**
* get map <id, activity frame> from a file
* @param inputfile
@@ -75,7 +75,7 @@ public class SQLiteJDBCConnector {

rs = st.executeQuery("select json from categoryjson where id=1;");
if (!rs.next()){
String input = "/var/tmp/cxchu/data-server/For-Database/wikihow-id-category.json";
String input = "/var/tmp/cxchu/wikihow-id-category.json";
System.out.println("Updating data into table 'category'.....");
update(st, "categoryjson", input);
}
@@ -1,6 +1,10 @@
package kb.howtokb;

import java.io.BufferedWriter;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.util.ArrayList;

import org.json.simple.parser.ParseException;
@@ -10,15 +14,25 @@ import kb.howtokb.taskframe.WikiHowTaskFrame;

/**
 * Command-line driver that reads task frames from a JSON file and writes every frame
 * that has at least one sub-activity to a second file, one JSON object per line.
 */
public class TaskFrameReaderTest {
    /**
     * Reads the frames from the hard-coded input path, writes the frames with
     * sub-activities to the hard-coded output path (UTF-8), and prints the total
     * number of frames read.
     *
     * @param args unused
     * @throws ClassNotFoundException declared by the original signature; presumably
     *         propagated from the frame reader
     * @throws IOException if the output file cannot be opened, written or closed
     * @throws ParseException declared by the original signature; presumably raised
     *         by the frame reader on malformed JSON
     */
    public static void main(String[] args) throws ClassNotFoundException, IOException, ParseException {
        String input = "/var/tmp/cxchu/data-wordnet/act-frame.json";

        ArrayList<WikiHowTaskFrame> allframe = TaskFrameReader.extractWikiHowTaskFrameFromJSONFile(input);

        // try-with-resources: the output is flushed and closed even if a write fails.
        try (Writer textout = new BufferedWriter(new OutputStreamWriter(
                new FileOutputStream("/var/tmp/cxchu/data-wordnet/act-frame-wikihow-task.json"), "utf-8"))) {
            for (WikiHowTaskFrame f : allframe) {
                // Only frames that actually carry sub-activities are exported.
                if (f.getActivity().getSubActivities().size() > 0) {
                    textout.write(f.toJsonObject().toJSONString() + "\n");
                }
            }
        }

        System.out.println("Total of frames: " + allframe.size());
    }
}
@@ -19,19 +19,19 @@ import kb.howtokb.wkhobject.Question;
public class TextToWikiHowTaskFrameTest {

public static void main(String[] args) throws ClassNotFoundException, IOException, ParseException {
System.setOut(new PrintStream(new FileOutputStream("log.txt")));
//System.setOut(new PrintStream(new FileOutputStream("log.txt")));

TextToWikiHowTaskFrame extract = new TextToWikiHowTaskFrame();
// Extract all question
System.out.println("Reading json data file.....");
String input = "/var/tmp/cxchu/data-test/articles_test.json";
String input = "/var/tmp/cxchu/data-for-test-code/articles_test.json";
ArrayList<Question> allQuestions = WikiHowArticleReader.WikiHowArticleReaderFromJSONFile(input);
int frames = 0;
try {
Writer textout = new BufferedWriter(new OutputStreamWriter(
new FileOutputStream("/var/tmp/cxchu/data-test/act-frame.json"), "utf-8"));
new FileOutputStream("/var/tmp/cxchu/data-for-test-code/act-frame.json"), "utf-8"));
Writer idtextout = new BufferedWriter(new OutputStreamWriter(
new FileOutputStream("/var/tmp/cxchu/data-test/id-act-frame.json"), "utf-8"));
new FileOutputStream("/var/tmp/cxchu/data-for-test-code/id-act-frame.json"), "utf-8"));

int i = 1;

@@ -1,17 +1,23 @@
package kb.howtokb.clustering;

import java.io.BufferedWriter;
import java.io.FileOutputStream;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.util.List;

import kb.howtokb.clustering.HeuristicBottomupClustering.ActivitySuperCluster;
import kb.howtokb.clustering.sim.Coefficient;
import kb.howtokb.taskframe.WikiHowTaskFrame;
import kb.howtokb.tools.InformationExtraction;

public class HeuristicBottomUpClusteringTest {
public static void main(String[] args) throws Exception {

long startTime = System.currentTimeMillis();


String activityTb = "resources/all-words-category.txt";
String activityTb = "all-words-category.txt";
HeuristicBottomupClustering cluster = new HeuristicBottomupClustering(activityTb);

double threshold = Coefficient.VVNN_TRHES;
@@ -20,27 +26,27 @@ public class HeuristicBottomUpClusteringTest {
SimplePruningSimilarity simFunc = new SimplePruningSimilarity(threshold, model, allAct);
List<ActivitySuperCluster> results = cluster.cluster(simFunc, Coefficient.VVNN_TRHES);
System.out.println("Number of clusters: " + results.size());
// String output = "/var/tmp/cxchu/clustering-result/bottom-up-cluster-";
String output = "/var/tmp/cxchu/clustering-result-wikihow-task/bottom-up-cluster-";

// String input = "/var/tmp/cxchu/act-frame-test.json"; //original data point file
// List<WikiHowTaskFrame> allframe = InformationExtraction.getAllFrame(input);
String input = "/var/tmp/cxchu/data-server/For-Database/act-frame-wikihow-task.json"; //original data point file
List<WikiHowTaskFrame> allframe = InformationExtraction.getAllFrame(input);
int total = 0;
for (int i = 0; i < results.size(); i++) {
System.out.println("Cluster " + i + ": " + results.get(i).getSuperClusterMembers().size());
// Writer out = new BufferedWriter(new OutputStreamWriter(
// new FileOutputStream(output+i+".json"), "utf-8"));
// List<Integer> actitiviesID =
// results.get(i).getSuperClusterMembers();
// for (int j=0; j<allframe.size(); j++){
// if (actitiviesID.contains(allframe.get(j).getID())){
// out.write(allframe.get(j).toJsonObject().toJSONString() + "\n");
// allframe.remove(j);
// j--;
// total++;
// }
// }
//
// out.close();
Writer out = new BufferedWriter(new OutputStreamWriter(
new FileOutputStream(output+i+".json"), "utf-8"));
List<Integer> actitiviesID =
results.get(i).getSuperClusterMembers();
for (int j=0; j<allframe.size(); j++){
if (actitiviesID.contains(allframe.get(j).getID())){
out.write(allframe.get(j).toJsonObject().toJSONString() + "\n");
allframe.remove(j);
j--;
total++;
}
}

out.close();
}

long endTime = System.currentTimeMillis();

0 comments on commit ac4b295

Please sign in to comment.
You can’t perform that action at this time.