From ac4b2952370cd8352c331a0f30882a51d2885f49 Mon Sep 17 00:00:00 2001 From: cxchu Date: Thu, 30 Mar 2017 10:33:27 +0200 Subject: [PATCH] Update clustering --- pom.xml | 57 +++++++++++++++---- .../HeuristicBottomupClustering.java | 7 ++- .../howtokb/tools/InformationExtraction.java | 54 ++++++++++++++++++ .../kb/howtokb/utils/SQLiteJDBCConnector.java | 2 +- .../java/kb/howtokb/TaskFrameReaderTest.java | 18 +++++- .../howtokb/TextToWikiHowTaskFrameTest.java | 8 +-- .../HeuristicBottomUpClusteringTest.java | 42 ++++++++------ 7 files changed, 151 insertions(+), 37 deletions(-) diff --git a/pom.xml b/pom.xml index f96eb31..aa320ac 100644 --- a/pom.xml +++ b/pom.xml @@ -1,13 +1,13 @@ - 4.0.0 + xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> + 4.0.0 - kb - howtokb - 0.0.1-SNAPSHOT - jar - - + kb + howtokb + 0.0.1-SNAPSHOT + jar + + src @@ -33,7 +33,6 @@ - package @@ -44,7 +43,7 @@ - test.java.kb.howtokb.TextToWikiHowTaskFrameTest + kb.howtokb.clustering.HeuristicBottomUpClusteringTest @@ -54,4 +53,42 @@ + + 1.7 + 1.7 + + + + com.googlecode.json-simple + json-simple + 1.1.1 + + + org.jsoup + jsoup + 1.9.1 + + + edu.stanford.nlp + stanford-corenlp + 3.3.0 + + + edu.washington.cs.knowitall.openie + openie_2.10 + 4.2.1 + + + colt + colt + 1.2.0 + + + niket.tools + javatools + 1.0.0 + + + + diff --git a/src/main/java/kb/howtokb/clustering/HeuristicBottomupClustering.java b/src/main/java/kb/howtokb/clustering/HeuristicBottomupClustering.java index 4c3fd59..c4d48f9 100644 --- a/src/main/java/kb/howtokb/clustering/HeuristicBottomupClustering.java +++ b/src/main/java/kb/howtokb/clustering/HeuristicBottomupClustering.java @@ -1,8 +1,9 @@ package kb.howtokb.clustering; import java.io.BufferedReader; -import java.io.FileReader; import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; import java.util.ArrayList; import java.util.HashMap; import java.util.List; @@ -81,7 +82,9 @@ public HeuristicBottomupClustering(Map activityT } private void loadElemsFromDb(String activityTb) throws IOException { - try (BufferedReader br = new BufferedReader(new FileReader(activityTb))) { + ClassLoader classLoader = Thread.currentThread().getContextClassLoader(); + InputStream inputs = classLoader.getResourceAsStream(activityTb); + try (BufferedReader br = new BufferedReader(new InputStreamReader(inputs, "UTF-8"))) { String sCurrentLine; while ((sCurrentLine = br.readLine()) != null) { String [] line = sCurrentLine.split("\t"); diff --git a/src/main/java/kb/howtokb/tools/InformationExtraction.java b/src/main/java/kb/howtokb/tools/InformationExtraction.java index a59459d..90806d0 100644 --- a/src/main/java/kb/howtokb/tools/InformationExtraction.java +++ b/src/main/java/kb/howtokb/tools/InformationExtraction.java @@ -11,6 +11,7 @@ import java.util.HashMap; import java.util.List; import java.util.Map; +import java.util.Map.Entry; import org.json.simple.JSONObject; import org.json.simple.parser.JSONParser; @@ -18,7 +19,9 @@ import kb.howtokb.reader.TaskFrameReader; import kb.howtokb.taskframe.WikiHowTaskFrame; +import kb.howtokb.utils.AutoMap; import kb.howtokb.utils.SQLiteJDBCConnector; +import kb.howtokb.wkhobject.Category_Json; public class InformationExtraction { @@ -141,6 +144,57 @@ public static List getAllFrame(String inputfile) throws IOExce return allframe; } + // Get all activity frame in a same category + /** + * Get all frame which have the same category parent + * @param allframe + * @param category + * @return + * @throws Exception + */ + public List getFrameFromCategory(List allframe, String category) throws Exception { + System.out.println("Get a list of activities in a same category " + category); + List list = new ArrayList<>(); + List listCate = InformationExtraction.getListofAllChildren(InformationExtraction.getCategoryID(category)); + + for (int i = 0; i < allframe.size(); i++) { + if (listCate.contains(Integer.parseInt(allframe.get(i).getActivity().getCategoryID()))) { + list.add(allframe.get(i)); + } + } + System.out.println("Number of activities in a same category: " + list.size()); + return list; + } + + // get all children of a category + /** + * Get all categories which are children of a given category + * @param id + * @return + * @throws SQLException + * @throws ClassNotFoundException + * @throws IOException + */ + public static List getListofAllChildren(int id) throws SQLException, ClassNotFoundException, IOException { + Map> parentChains = new AutoMap<>(); + // "rootpath":[57,54,52,150,1] + ResultSet rs = SQLiteJDBCConnector.q("select id, json from categoryjson"); + while (rs.next()) { + try { + parentChains.put(rs.getInt(1), Category_Json.fromJson(rs.getString(2)).getRootpath()); + } catch (Exception e) { + System.out.print("\n---- JSONException in category: " + rs.getInt(1)); + } + } + List res = new ArrayList<>(); + for (Entry> e : parentChains.entrySet()) { + if (e.getValue().contains(id)) + if (!res.contains(e.getKey())) + res.add(e.getKey()); + } + return res; + } + /** * get map from a file * @param inputfile diff --git a/src/main/java/kb/howtokb/utils/SQLiteJDBCConnector.java b/src/main/java/kb/howtokb/utils/SQLiteJDBCConnector.java index f2bc130..785a3e7 100644 --- a/src/main/java/kb/howtokb/utils/SQLiteJDBCConnector.java +++ b/src/main/java/kb/howtokb/utils/SQLiteJDBCConnector.java @@ -75,7 +75,7 @@ public static void createDB() throws SQLException, ClassNotFoundException, IOExc rs = st.executeQuery("select json from categoryjson where id=1;"); if (!rs.next()){ - String input = "/var/tmp/cxchu/data-server/For-Database/wikihow-id-category.json"; + String input = "/var/tmp/cxchu/wikihow-id-category.json"; System.out.println("Updating data into table 'category'....."); update(st, "categoryjson", input); } diff --git a/src/test/java/kb/howtokb/TaskFrameReaderTest.java b/src/test/java/kb/howtokb/TaskFrameReaderTest.java index 70f1ed4..e8e0715 100644 --- a/src/test/java/kb/howtokb/TaskFrameReaderTest.java +++ b/src/test/java/kb/howtokb/TaskFrameReaderTest.java @@ -1,6 +1,10 @@ package kb.howtokb; +import java.io.BufferedWriter; +import java.io.FileOutputStream; import java.io.IOException; +import java.io.OutputStreamWriter; +import java.io.Writer; import java.util.ArrayList; import org.json.simple.parser.ParseException; @@ -10,15 +14,25 @@ public class TaskFrameReaderTest { public static void main(String[] args) throws ClassNotFoundException, IOException, ParseException { - String input = "/var/tmp/cxchu/act-frame-test.json"; + String input = "/var/tmp/cxchu/data-wordnet/act-frame.json"; ArrayList allframe = TaskFrameReader.extractWikiHowTaskFrameFromJSONFile(input); + Writer textout = new BufferedWriter(new OutputStreamWriter( + new FileOutputStream("/var/tmp/cxchu/data-wordnet/act-frame-wikihow-task.json"), "utf-8")); + for (WikiHowTaskFrame f: allframe){ - System.out.println(f.toString()); + + if (f.getActivity().getSubActivities().size() > 0){ + textout.write(f.toJsonObject().toJSONString() + "\n"); + } + + //System.out.println(f.toString()); } System.out.println("Total of frames: " + allframe.size()); + textout.close(); + } } diff --git a/src/test/java/kb/howtokb/TextToWikiHowTaskFrameTest.java b/src/test/java/kb/howtokb/TextToWikiHowTaskFrameTest.java index a37d707..1bfa35c 100644 --- a/src/test/java/kb/howtokb/TextToWikiHowTaskFrameTest.java +++ b/src/test/java/kb/howtokb/TextToWikiHowTaskFrameTest.java @@ -19,19 +19,19 @@ public class TextToWikiHowTaskFrameTest { public static void main(String[] args) throws ClassNotFoundException, IOException, ParseException { - System.setOut(new PrintStream(new FileOutputStream("log.txt"))); + //System.setOut(new PrintStream(new FileOutputStream("log.txt"))); TextToWikiHowTaskFrame extract = new TextToWikiHowTaskFrame(); // Extract all question System.out.println("Reading json data file....."); - String input = "/var/tmp/cxchu/data-test/articles_test.json"; + String input = "/var/tmp/cxchu/data-for-test-code/articles_test.json"; ArrayList allQuestions = WikiHowArticleReader.WikiHowArticleReaderFromJSONFile(input); int frames = 0; try { Writer textout = new BufferedWriter(new OutputStreamWriter( - new FileOutputStream("/var/tmp/cxchu/data-test/act-frame.json"), "utf-8")); + new FileOutputStream("/var/tmp/cxchu/data-for-test-code/act-frame.json"), "utf-8")); Writer idtextout = new BufferedWriter(new OutputStreamWriter( - new FileOutputStream("/var/tmp/cxchu/data-test/id-act-frame.json"), "utf-8")); + new FileOutputStream("/var/tmp/cxchu/data-for-test-code/id-act-frame.json"), "utf-8")); int i = 1; diff --git a/src/test/java/kb/howtokb/clustering/HeuristicBottomUpClusteringTest.java b/src/test/java/kb/howtokb/clustering/HeuristicBottomUpClusteringTest.java index 3672209..c6b1146 100644 --- a/src/test/java/kb/howtokb/clustering/HeuristicBottomUpClusteringTest.java +++ b/src/test/java/kb/howtokb/clustering/HeuristicBottomUpClusteringTest.java @@ -1,9 +1,15 @@ package kb.howtokb.clustering; +import java.io.BufferedWriter; +import java.io.FileOutputStream; +import java.io.OutputStreamWriter; +import java.io.Writer; import java.util.List; import kb.howtokb.clustering.HeuristicBottomupClustering.ActivitySuperCluster; import kb.howtokb.clustering.sim.Coefficient; +import kb.howtokb.taskframe.WikiHowTaskFrame; +import kb.howtokb.tools.InformationExtraction; public class HeuristicBottomUpClusteringTest { public static void main(String[] args) throws Exception { @@ -11,7 +17,7 @@ public static void main(String[] args) throws Exception { long startTime = System.currentTimeMillis(); - String activityTb = "resources/all-words-category.txt"; + String activityTb = "all-words-category.txt"; HeuristicBottomupClustering cluster = new HeuristicBottomupClustering(activityTb); double threshold = Coefficient.VVNN_TRHES; @@ -20,27 +26,27 @@ public static void main(String[] args) throws Exception { SimplePruningSimilarity simFunc = new SimplePruningSimilarity(threshold, model, allAct); List results = cluster.cluster(simFunc, Coefficient.VVNN_TRHES); System.out.println("Number of clusters: " + results.size()); -// String output = "/var/tmp/cxchu/clustering-result/bottom-up-cluster-"; + String output = "/var/tmp/cxchu/clustering-result-wikihow-task/bottom-up-cluster-"; -// String input = "/var/tmp/cxchu/act-frame-test.json"; //original data point file -// List allframe = InformationExtraction.getAllFrame(input); + String input = "/var/tmp/cxchu/data-server/For-Database/act-frame-wikihow-task.json"; //original data point file + List allframe = InformationExtraction.getAllFrame(input); int total = 0; for (int i = 0; i < results.size(); i++) { System.out.println("Cluster " + i + ": " + results.get(i).getSuperClusterMembers().size()); -// Writer out = new BufferedWriter(new OutputStreamWriter( -// new FileOutputStream(output+i+".json"), "utf-8")); -// List actitiviesID = -// results.get(i).getSuperClusterMembers(); -// for (int j=0; j actitiviesID = + results.get(i).getSuperClusterMembers(); + for (int j=0; j