Skip to content

Commit

Permalink
Browse files Browse the repository at this point in the history
Update clustering
  • Loading branch information
cxchu committed Mar 30, 2017
1 parent 1bb5d45 commit ac4b295
Show file tree
Hide file tree
Showing 7 changed files with 151 additions and 37 deletions.
57 changes: 47 additions & 10 deletions pom.xml
@@ -1,13 +1,13 @@
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion> <modelVersion>4.0.0</modelVersion>


<groupId>kb</groupId> <groupId>kb</groupId>
<artifactId>howtokb</artifactId> <artifactId>howtokb</artifactId>
<version>0.0.1-SNAPSHOT</version> <version>0.0.1-SNAPSHOT</version>
<packaging>jar</packaging> <packaging>jar</packaging>

<build> <build>
<sourceDirectory>src</sourceDirectory> <sourceDirectory>src</sourceDirectory>
<plugins> <plugins>
<plugin> <plugin>
Expand All @@ -33,7 +33,6 @@
</descriptorRefs> </descriptorRefs>
</configuration> </configuration>



<executions> <executions>
<execution> <execution>
<phase>package</phase> <phase>package</phase>
Expand All @@ -44,7 +43,7 @@
<transformers> <transformers>
<transformer <transformer
implementation="org.apache.maven.plugins.shade.resource.ManifestResourceTransformer"> implementation="org.apache.maven.plugins.shade.resource.ManifestResourceTransformer">
<mainClass>test.java.kb.howtokb.TextToWikiHowTaskFrameTest</mainClass> <mainClass>kb.howtokb.clustering.HeuristicBottomUpClusteringTest</mainClass>
</transformer> </transformer>
</transformers> </transformers>
</configuration> </configuration>
Expand All @@ -54,4 +53,42 @@
</plugin> </plugin>
</plugins> </plugins>
</build> </build>
<properties>
<maven.compiler.source>1.7</maven.compiler.source>
<maven.compiler.target>1.7</maven.compiler.target>
</properties>
<dependencies>
<dependency>
<groupId>com.googlecode.json-simple</groupId>
<artifactId>json-simple</artifactId>
<version>1.1.1</version>
</dependency>
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.9.1</version>
</dependency>
<dependency>
<groupId>edu.stanford.nlp</groupId>
<artifactId>stanford-corenlp</artifactId>
<version>3.3.0</version>
</dependency>
<dependency>
<groupId>edu.washington.cs.knowitall.openie</groupId>
<artifactId>openie_2.10</artifactId>
<version>4.2.1</version>
</dependency>
<dependency>
<groupId>colt</groupId>
<artifactId>colt</artifactId>
<version>1.2.0</version>
</dependency>
<dependency>
<groupId>niket.tools</groupId>
<artifactId>javatools</artifactId>
<version>1.0.0</version>
</dependency>

</dependencies>

</project> </project>
@@ -1,8 +1,9 @@
package kb.howtokb.clustering; package kb.howtokb.clustering;


import java.io.BufferedReader; import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException; import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.HashMap; import java.util.HashMap;
import java.util.List; import java.util.List;
Expand Down Expand Up @@ -81,7 +82,9 @@ public HeuristicBottomupClustering(Map<Integer, ActivityWordsCategory> activityT
} }


private void loadElemsFromDb(String activityTb) throws IOException { private void loadElemsFromDb(String activityTb) throws IOException {
try (BufferedReader br = new BufferedReader(new FileReader(activityTb))) { ClassLoader classLoader = Thread.currentThread().getContextClassLoader();
InputStream inputs = classLoader.getResourceAsStream(activityTb);
try (BufferedReader br = new BufferedReader(new InputStreamReader(inputs, "UTF-8"))) {
String sCurrentLine; String sCurrentLine;
while ((sCurrentLine = br.readLine()) != null) { while ((sCurrentLine = br.readLine()) != null) {
String [] line = sCurrentLine.split("\t"); String [] line = sCurrentLine.split("\t");
Expand Down
54 changes: 54 additions & 0 deletions src/main/java/kb/howtokb/tools/InformationExtraction.java
Expand Up @@ -11,14 +11,17 @@
import java.util.HashMap; import java.util.HashMap;
import java.util.List; import java.util.List;
import java.util.Map; import java.util.Map;
import java.util.Map.Entry;


import org.json.simple.JSONObject; import org.json.simple.JSONObject;
import org.json.simple.parser.JSONParser; import org.json.simple.parser.JSONParser;
import org.json.simple.parser.ParseException; import org.json.simple.parser.ParseException;


import kb.howtokb.reader.TaskFrameReader; import kb.howtokb.reader.TaskFrameReader;
import kb.howtokb.taskframe.WikiHowTaskFrame; import kb.howtokb.taskframe.WikiHowTaskFrame;
import kb.howtokb.utils.AutoMap;
import kb.howtokb.utils.SQLiteJDBCConnector; import kb.howtokb.utils.SQLiteJDBCConnector;
import kb.howtokb.wkhobject.Category_Json;


public class InformationExtraction { public class InformationExtraction {


Expand Down Expand Up @@ -141,6 +144,57 @@ public static List<WikiHowTaskFrame> getAllFrame(String inputfile) throws IOExce
return allframe; return allframe;
} }


/**
 * Filters a list of task frames down to those belonging to a named category.
 *
 * <p>The category name is resolved to an id with {@code getCategoryID}, that id is
 * expanded with {@link #getListofAllChildren(int)}, and every frame whose activity
 * category id (parsed as an integer) appears in the expanded list is kept, in the
 * order it occurs in {@code allframe}. Progress is reported on standard output.
 *
 * @param allframe the frames to filter; not modified
 * @param category the name of the category to select
 * @return a new list holding the matching frames
 * @throws Exception if the category lookup fails or a frame's category id is not
 *         a parsable integer
 */
public List<WikiHowTaskFrame> getFrameFromCategory(List<WikiHowTaskFrame> allframe, String category) throws Exception {
    System.out.println("Get a list of activities in a same category " + category);

    int requestedId = InformationExtraction.getCategoryID(category);
    List<Integer> acceptedIds = InformationExtraction.getListofAllChildren(requestedId);

    List<WikiHowTaskFrame> matches = new ArrayList<>();
    for (WikiHowTaskFrame frame : allframe) {
        int frameCategoryId = Integer.parseInt(frame.getActivity().getCategoryID());
        if (acceptedIds.contains(frameCategoryId)) {
            matches.add(frame);
        }
    }

    System.out.println("Number of activities in a same category: " + matches.size());
    return matches;
}

/**
 * Collects the ids of all categories whose stored root path contains the given id.
 *
 * <p>Every row of the {@code categoryjson} table is read; its JSON column is parsed
 * with {@code Category_Json.fromJson} and the resulting root path (for example
 * {@code "rootpath":[57,54,52,150,1]}) is recorded per category id. A category is
 * reported when its root path contains {@code id}.
 *
 * <p>Rows whose JSON cannot be parsed are skipped after printing a notice to
 * standard output. Failures while reading the row itself are not treated as JSON
 * problems and propagate as {@link SQLException}.
 *
 * @param id the category id to look for in each root path
 * @return the ids of the categories whose root path contains {@code id}; empty if none
 * @throws SQLException if the query or reading a row fails
 * @throws ClassNotFoundException declared for the underlying connector call
 * @throws IOException declared for the underlying connector call
 */
public static List<Integer> getListofAllChildren(int id) throws SQLException, ClassNotFoundException, IOException {
    Map<Integer, List<Integer>> parentChains = new AutoMap<>();

    // Close the result set when done; the original left it open.
    // NOTE(review): the owning Statement/Connection is managed by SQLiteJDBCConnector
    // and is not visible from here — confirm it is released there.
    try (ResultSet rs = SQLiteJDBCConnector.q("select id, json from categoryjson")) {
        while (rs.next()) {
            // Read the columns before the best-effort block so that a database error
            // is not mislabelled as a JSON error, and so the handler below never has
            // to touch the result set again (which could itself throw).
            int categoryId = rs.getInt(1);
            String json = rs.getString(2);
            try {
                parentChains.put(categoryId, Category_Json.fromJson(json).getRootpath());
            } catch (Exception e) {
                // Deliberately best-effort: a malformed row is reported and skipped.
                System.out.print("\n---- JSONException in category: " + categoryId);
            }
        }
    }

    List<Integer> res = new ArrayList<>();
    for (Entry<Integer, List<Integer>> e : parentChains.entrySet()) {
        List<Integer> rootPath = e.getValue();
        // Map keys are unique, so no duplicate check on res is needed.
        // The null guard protects against a category stored without a root path.
        if (rootPath != null && rootPath.contains(id)) {
            res.add(e.getKey());
        }
    }
    return res;
}

/** /**
* get map <id, activity frame> from a file * get map <id, activity frame> from a file
* @param inputfile * @param inputfile
Expand Down
2 changes: 1 addition & 1 deletion src/main/java/kb/howtokb/utils/SQLiteJDBCConnector.java
Expand Up @@ -75,7 +75,7 @@ public static void createDB() throws SQLException, ClassNotFoundException, IOExc


rs = st.executeQuery("select json from categoryjson where id=1;"); rs = st.executeQuery("select json from categoryjson where id=1;");
if (!rs.next()){ if (!rs.next()){
String input = "/var/tmp/cxchu/data-server/For-Database/wikihow-id-category.json"; String input = "/var/tmp/cxchu/wikihow-id-category.json";
System.out.println("Updating data into table 'category'....."); System.out.println("Updating data into table 'category'.....");
update(st, "categoryjson", input); update(st, "categoryjson", input);
} }
Expand Down
18 changes: 16 additions & 2 deletions src/test/java/kb/howtokb/TaskFrameReaderTest.java
@@ -1,6 +1,10 @@
package kb.howtokb; package kb.howtokb;


import java.io.BufferedWriter;
import java.io.FileOutputStream;
import java.io.IOException; import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.util.ArrayList; import java.util.ArrayList;


import org.json.simple.parser.ParseException; import org.json.simple.parser.ParseException;
Expand All @@ -10,15 +14,25 @@


public class TaskFrameReaderTest { public class TaskFrameReaderTest {
public static void main(String[] args) throws ClassNotFoundException, IOException, ParseException { public static void main(String[] args) throws ClassNotFoundException, IOException, ParseException {
String input = "/var/tmp/cxchu/act-frame-test.json"; String input = "/var/tmp/cxchu/data-wordnet/act-frame.json";


ArrayList<WikiHowTaskFrame> allframe = TaskFrameReader.extractWikiHowTaskFrameFromJSONFile(input); ArrayList<WikiHowTaskFrame> allframe = TaskFrameReader.extractWikiHowTaskFrameFromJSONFile(input);


Writer textout = new BufferedWriter(new OutputStreamWriter(
new FileOutputStream("/var/tmp/cxchu/data-wordnet/act-frame-wikihow-task.json"), "utf-8"));

for (WikiHowTaskFrame f: allframe){ for (WikiHowTaskFrame f: allframe){
System.out.println(f.toString());
if (f.getActivity().getSubActivities().size() > 0){
textout.write(f.toJsonObject().toJSONString() + "\n");
}

//System.out.println(f.toString());
} }


System.out.println("Total of frames: " + allframe.size()); System.out.println("Total of frames: " + allframe.size());


textout.close();

} }
} }
8 changes: 4 additions & 4 deletions src/test/java/kb/howtokb/TextToWikiHowTaskFrameTest.java
Expand Up @@ -19,19 +19,19 @@
public class TextToWikiHowTaskFrameTest { public class TextToWikiHowTaskFrameTest {


public static void main(String[] args) throws ClassNotFoundException, IOException, ParseException { public static void main(String[] args) throws ClassNotFoundException, IOException, ParseException {
System.setOut(new PrintStream(new FileOutputStream("log.txt"))); //System.setOut(new PrintStream(new FileOutputStream("log.txt")));


TextToWikiHowTaskFrame extract = new TextToWikiHowTaskFrame(); TextToWikiHowTaskFrame extract = new TextToWikiHowTaskFrame();
// Extract all question // Extract all question
System.out.println("Reading json data file....."); System.out.println("Reading json data file.....");
String input = "/var/tmp/cxchu/data-test/articles_test.json"; String input = "/var/tmp/cxchu/data-for-test-code/articles_test.json";
ArrayList<Question> allQuestions = WikiHowArticleReader.WikiHowArticleReaderFromJSONFile(input); ArrayList<Question> allQuestions = WikiHowArticleReader.WikiHowArticleReaderFromJSONFile(input);
int frames = 0; int frames = 0;
try { try {
Writer textout = new BufferedWriter(new OutputStreamWriter( Writer textout = new BufferedWriter(new OutputStreamWriter(
new FileOutputStream("/var/tmp/cxchu/data-test/act-frame.json"), "utf-8")); new FileOutputStream("/var/tmp/cxchu/data-for-test-code/act-frame.json"), "utf-8"));
Writer idtextout = new BufferedWriter(new OutputStreamWriter( Writer idtextout = new BufferedWriter(new OutputStreamWriter(
new FileOutputStream("/var/tmp/cxchu/data-test/id-act-frame.json"), "utf-8")); new FileOutputStream("/var/tmp/cxchu/data-for-test-code/id-act-frame.json"), "utf-8"));


int i = 1; int i = 1;


Expand Down
@@ -1,17 +1,23 @@
package kb.howtokb.clustering; package kb.howtokb.clustering;


import java.io.BufferedWriter;
import java.io.FileOutputStream;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.util.List; import java.util.List;


import kb.howtokb.clustering.HeuristicBottomupClustering.ActivitySuperCluster; import kb.howtokb.clustering.HeuristicBottomupClustering.ActivitySuperCluster;
import kb.howtokb.clustering.sim.Coefficient; import kb.howtokb.clustering.sim.Coefficient;
import kb.howtokb.taskframe.WikiHowTaskFrame;
import kb.howtokb.tools.InformationExtraction;


public class HeuristicBottomUpClusteringTest { public class HeuristicBottomUpClusteringTest {
public static void main(String[] args) throws Exception { public static void main(String[] args) throws Exception {


long startTime = System.currentTimeMillis(); long startTime = System.currentTimeMillis();




String activityTb = "resources/all-words-category.txt"; String activityTb = "all-words-category.txt";
HeuristicBottomupClustering cluster = new HeuristicBottomupClustering(activityTb); HeuristicBottomupClustering cluster = new HeuristicBottomupClustering(activityTb);


double threshold = Coefficient.VVNN_TRHES; double threshold = Coefficient.VVNN_TRHES;
Expand All @@ -20,27 +26,27 @@ public static void main(String[] args) throws Exception {
SimplePruningSimilarity simFunc = new SimplePruningSimilarity(threshold, model, allAct); SimplePruningSimilarity simFunc = new SimplePruningSimilarity(threshold, model, allAct);
List<ActivitySuperCluster> results = cluster.cluster(simFunc, Coefficient.VVNN_TRHES); List<ActivitySuperCluster> results = cluster.cluster(simFunc, Coefficient.VVNN_TRHES);
System.out.println("Number of clusters: " + results.size()); System.out.println("Number of clusters: " + results.size());
// String output = "/var/tmp/cxchu/clustering-result/bottom-up-cluster-"; String output = "/var/tmp/cxchu/clustering-result-wikihow-task/bottom-up-cluster-";


// String input = "/var/tmp/cxchu/act-frame-test.json"; //original data point file String input = "/var/tmp/cxchu/data-server/For-Database/act-frame-wikihow-task.json"; //original data point file
// List<WikiHowTaskFrame> allframe = InformationExtraction.getAllFrame(input); List<WikiHowTaskFrame> allframe = InformationExtraction.getAllFrame(input);
int total = 0; int total = 0;
for (int i = 0; i < results.size(); i++) { for (int i = 0; i < results.size(); i++) {
System.out.println("Cluster " + i + ": " + results.get(i).getSuperClusterMembers().size()); System.out.println("Cluster " + i + ": " + results.get(i).getSuperClusterMembers().size());
// Writer out = new BufferedWriter(new OutputStreamWriter( Writer out = new BufferedWriter(new OutputStreamWriter(
// new FileOutputStream(output+i+".json"), "utf-8")); new FileOutputStream(output+i+".json"), "utf-8"));
// List<Integer> actitiviesID = List<Integer> actitiviesID =
// results.get(i).getSuperClusterMembers(); results.get(i).getSuperClusterMembers();
// for (int j=0; j<allframe.size(); j++){ for (int j=0; j<allframe.size(); j++){
// if (actitiviesID.contains(allframe.get(j).getID())){ if (actitiviesID.contains(allframe.get(j).getID())){
// out.write(allframe.get(j).toJsonObject().toJSONString() + "\n"); out.write(allframe.get(j).toJsonObject().toJSONString() + "\n");
// allframe.remove(j); allframe.remove(j);
// j--; j--;
// total++; total++;
// } }
// } }
//
// out.close(); out.close();
} }


long endTime = System.currentTimeMillis(); long endTime = System.currentTimeMillis();
Expand Down

0 comments on commit ac4b295

Please sign in to comment.