Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Browse files
Browse the repository at this point in the history
Update WikiHow Task Frame Reader
- Loading branch information
Showing
12 changed files
with
172,714 additions
and
250 deletions.
There are no files selected for viewing
Large diffs are not rendered by default.
Oops, something went wrong.
Large diffs are not rendered by default.
Oops, something went wrong.
140 changes: 140 additions & 0 deletions
140
src/main/java/kb/howtokb/extractor/TaskFrameReader.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,140 @@ | ||
package kb.howtokb.extractor; | ||
|
||
import java.io.BufferedReader; | ||
import java.io.FileReader; | ||
import java.io.IOException; | ||
import java.util.ArrayList; | ||
import java.util.Iterator; | ||
|
||
import org.json.simple.JSONArray; | ||
import org.json.simple.JSONObject; | ||
import org.json.simple.parser.JSONParser; | ||
import org.json.simple.parser.ParseException; | ||
|
||
import kb.howtokb.taskframe.WikiHowTask; | ||
import kb.howtokb.taskframe.WikiHowTaskFrame; | ||
|
||
public class TaskFrameReader { | ||
|
||
static JSONParser parser = new JSONParser(); | ||
|
||
// Read json file and return all activity frame object | ||
// TO DO | ||
// ====================================================================== | ||
/** | ||
* Return list of activity frame extract from file with json format | ||
* | ||
* @param directory | ||
* that saves activity frame in json format | ||
* @return a list of activity frame | ||
* @throws IOException | ||
* @throws ClassNotFoundException | ||
* @throws ParseException | ||
*/ | ||
public static ArrayList<WikiHowTaskFrame> extractWikiHowTaskFrameFromJSONFile(String directory) | ||
throws IOException, ClassNotFoundException, ParseException { | ||
ArrayList<WikiHowTaskFrame> allframe = new ArrayList<>(); | ||
|
||
try (BufferedReader br = new BufferedReader(new FileReader(directory))) { | ||
|
||
String sCurrentLine; | ||
|
||
while ((sCurrentLine = br.readLine()) != null) { | ||
WikiHowTaskFrame newframe = jsonToWikiHowTaskFrame(sCurrentLine); | ||
allframe.add(newframe); | ||
} | ||
} | ||
return allframe; | ||
} | ||
|
||
//Read a string in json format and return an activity frame | ||
public static WikiHowTaskFrame jsonToWikiHowTaskFrame(String jsonStr) throws ParseException { | ||
return jsonToWikiHowTaskFrame((JSONObject) parser.parse(jsonStr)); | ||
} | ||
|
||
// Read an Json object to transfer to activity frame | ||
@SuppressWarnings("unchecked") | ||
public static WikiHowTaskFrame jsonToWikiHowTaskFrame(JSONObject obj) { | ||
// Extract id | ||
int idframe = (int) (long) obj.get("id"); | ||
|
||
// Extract list of location | ||
ArrayList<String> location = new ArrayList<>(); | ||
JSONArray locationJ = (JSONArray) obj.get("location"); | ||
Iterator<String> iteratorC = locationJ.iterator(); | ||
while (iteratorC.hasNext()) { | ||
location.add(iteratorC.next()); | ||
} | ||
|
||
// Extract list of temporal | ||
ArrayList<String> temporal = new ArrayList<>(); | ||
JSONArray temporalJ = (JSONArray) obj.get("temporal"); | ||
Iterator<String> iteratorT = temporalJ.iterator(); | ||
while (iteratorT.hasNext()) { | ||
temporal.add(iteratorT.next()); | ||
} | ||
|
||
// Extract list of part A | ||
ArrayList<String> partA = new ArrayList<>(); | ||
JSONArray partAJ = (JSONArray) obj.get("part-agent"); | ||
Iterator<String> iteratorA = partAJ.iterator(); | ||
while (iteratorA.hasNext()) { | ||
partA.add(iteratorA.next()); | ||
} | ||
|
||
// Extract list of part O | ||
ArrayList<String> partO = new ArrayList<>(); | ||
JSONArray partOJ = (JSONArray) obj.get("part-object"); | ||
Iterator<String> iteratorO = partOJ.iterator(); | ||
while (iteratorO.hasNext()) { | ||
partO.add(iteratorO.next()); | ||
} | ||
// Activity extraction | ||
|
||
JSONObject actJ = (JSONObject) obj.get("activity"); | ||
// id | ||
int id = (int) (long) actJ.get("id"); | ||
// verb | ||
String verb = (String) actJ.get("verb"); | ||
// object | ||
String object = (String) actJ.get("object"); | ||
// ori verb | ||
String oriverb = (String) actJ.get("ori-verb"); | ||
// ori-object | ||
String oriobject = (String) actJ.get("ori-object"); | ||
// image | ||
String image = (String) actJ.get("image"); | ||
// cate id | ||
String cateid = (String) actJ.get("categoryid"); | ||
// link id | ||
String linkid = (String) actJ.get("linkid"); | ||
// view | ||
int view = (int) (long) actJ.get("view"); | ||
// rate | ||
double rate = (double) (double) actJ.get("rating"); | ||
// cluster id | ||
int clusterid = (int) (long) actJ.get("clusterid"); | ||
// cluster name | ||
String clustername = (String) actJ.get("clustername"); | ||
// video | ||
String video = (String) actJ.get("video"); | ||
// parent | ||
String parent = (String) actJ.get("parent"); | ||
// prev | ||
String prev = (String) actJ.get("prev"); | ||
// next | ||
String next = (String) actJ.get("next"); | ||
// list of sub-activity | ||
ArrayList<Integer> children = new ArrayList<>(); | ||
JSONArray childrenJ = (JSONArray) actJ.get("sub-activity"); | ||
Iterator<Long> iteratorS = childrenJ.iterator(); | ||
while (iteratorS.hasNext()) { | ||
children.add((int) (long) iteratorS.next()); | ||
} | ||
|
||
WikiHowTask newact = new WikiHowTask(id, verb, object, oriverb, oriobject, cateid, linkid, rate, view, image, | ||
clusterid, clustername, video, children, parent, prev, next); | ||
|
||
return (new WikiHowTaskFrame(idframe, newact, location, temporal, partA, partO)); | ||
} | ||
} |
208 changes: 208 additions & 0 deletions
208
src/main/java/kb/howtokb/extractor/WikiHowArticleReader.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,208 @@ | ||
package kb.howtokb.extractor; | ||
|
||
import java.io.BufferedReader; | ||
import java.io.File; | ||
import java.io.FileInputStream; | ||
import java.io.FileNotFoundException; | ||
import java.io.FileReader; | ||
import java.io.IOException; | ||
import java.io.ObjectInputStream; | ||
import java.util.ArrayList; | ||
import java.util.HashSet; | ||
import java.util.Iterator; | ||
import java.util.List; | ||
import java.util.Set; | ||
import java.util.zip.GZIPInputStream; | ||
|
||
import org.json.simple.JSONArray; | ||
import org.json.simple.JSONObject; | ||
import org.json.simple.parser.JSONParser; | ||
import org.json.simple.parser.ParseException; | ||
|
||
import kb.howtokb.global.Global; | ||
import kb.howtokb.wkhobject.Category; | ||
import kb.howtokb.wkhobject.Ingredients; | ||
import kb.howtokb.wkhobject.Method; | ||
import kb.howtokb.wkhobject.Part; | ||
import kb.howtokb.wkhobject.Question; | ||
import kb.howtokb.wkhobject.Step; | ||
import kb.howtokb.wkhobject.Things; | ||
|
||
public class WikiHowArticleReader { | ||
|
||
/** | ||
* Return list of articles extract from file with json format | ||
* @param directory that saves articles in json format | ||
* @return a list of articles | ||
* @throws IOException | ||
* @throws ClassNotFoundException | ||
* @throws ParseException | ||
*/ | ||
public static ArrayList<Question> WikiHowArticleReaderFromJSONFile(String directory) throws IOException, ClassNotFoundException, ParseException{ | ||
ArrayList<Question> allQuestions = new ArrayList<>(); | ||
|
||
JSONParser parser = new JSONParser(); | ||
try (BufferedReader br = new BufferedReader(new FileReader(directory))) { | ||
|
||
String sCurrentLine; | ||
|
||
while ((sCurrentLine = br.readLine()) != null) { | ||
Object obj = parser.parse(sCurrentLine); | ||
JSONObject jsonObject = (JSONObject) obj; | ||
Question newQuestion = jsonToQuestion(jsonObject); | ||
allQuestions.add(newQuestion); | ||
} | ||
} | ||
return allQuestions; | ||
} | ||
|
||
|
||
// ====================================================================== | ||
/** | ||
* Convert a Json object to Question Object | ||
* @param JSON Object | ||
* @return Question Object | ||
*/ | ||
@SuppressWarnings("unchecked") | ||
public static Question jsonToQuestion(JSONObject jsonobj){ | ||
//Extract link | ||
String link = (String) jsonobj.get("Link"); | ||
|
||
//Extract title | ||
String title = (String) jsonobj.get("Title"); | ||
|
||
//Extract explanation | ||
String exp = (String) jsonobj.get("Explanation"); | ||
|
||
//Extract views | ||
int views = (int) (long) jsonobj.get("Views"); | ||
|
||
//Extract rate | ||
double rate = (double) (double) jsonobj.get("Rate"); | ||
|
||
//Extract tips | ||
String tips = (String) jsonobj.get("Tips"); | ||
|
||
//Extract warnings | ||
String warnings = (String) jsonobj.get("Warnings"); | ||
|
||
//Extract link of video | ||
String video = (String) jsonobj.get("Video"); | ||
|
||
//Extract category | ||
ArrayList<Category> category = new ArrayList<Category>(); | ||
ArrayList<String> cate_string = new ArrayList<String>(); | ||
JSONArray categoryJ = (JSONArray) jsonobj.get("Category"); | ||
Iterator<String> iteratorC = categoryJ.iterator(); | ||
while (iteratorC.hasNext()) { | ||
cate_string.add(iteratorC.next()); | ||
} | ||
cate_string.add(0, ""); | ||
cate_string.add(cate_string.size(), ""); | ||
for (int k=1; k<cate_string.size()-1; k++){ | ||
Category newCate = new Category(cate_string.get(k), cate_string.get(k-1), cate_string.get(k+1)); | ||
category.add(newCate); | ||
} | ||
|
||
//Extract things | ||
ArrayList<Things> things = new ArrayList<Things>(); | ||
JSONArray thingJ = (JSONArray) jsonobj.get("Things"); | ||
Iterator<JSONObject> iteratorT = thingJ.iterator(); | ||
while (iteratorT.hasNext()) { | ||
JSONObject thing = iteratorT.next(); | ||
//Title of method | ||
String title_thing = (String) thing.get("Title"); | ||
//List of things of this method | ||
ArrayList<String> listthing = new ArrayList<String>(); | ||
//List things of a method | ||
JSONArray thing_method = (JSONArray) thing.get("Things"); | ||
Iterator<String> iteratorTM = thing_method.iterator(); | ||
while (iteratorTM.hasNext()) { | ||
listthing.add(iteratorTM.next()); | ||
} | ||
Things newThing = new Things(title_thing, listthing); | ||
things.add(newThing); | ||
} | ||
|
||
|
||
//Extract ingredients | ||
ArrayList<Ingredients> ingredients = new ArrayList<Ingredients>(); | ||
JSONArray ingredientJ = (JSONArray) jsonobj.get("Ingredients"); | ||
Iterator<JSONObject> iteratorI = ingredientJ.iterator(); | ||
while (iteratorI.hasNext()) { | ||
JSONObject ingre = iteratorI.next(); | ||
//Title of method | ||
String title_ingre = (String) ingre.get("Title"); | ||
//List of things of this method | ||
ArrayList<String> listingre = new ArrayList<String>(); | ||
//List things of a method | ||
JSONArray ingre_method = (JSONArray) ingre.get("Ingredients"); | ||
Iterator<String> iteratorIM = ingre_method.iterator(); | ||
while (iteratorIM.hasNext()) { | ||
listingre.add(iteratorIM.next()); | ||
} | ||
Ingredients newIngre = new Ingredients(title_ingre, listingre); | ||
ingredients.add(newIngre); | ||
} | ||
|
||
//Extract answer | ||
// loop array | ||
//Extract list of methods | ||
ArrayList<Method> answerJ = new ArrayList<Method>(); | ||
JSONArray answer = (JSONArray) jsonobj.get("Answer"); | ||
Iterator<JSONObject> iterator = answer.iterator(); | ||
while (iterator.hasNext()) { | ||
// loop method array | ||
JSONObject method = iterator.next(); | ||
//Extract name of method | ||
String title_method = (String) method.get("Title"); | ||
//Extract order of method | ||
int order_method = (int) (long) method.get("Order"); | ||
|
||
//extract list of part | ||
ArrayList<Part> listPart = new ArrayList<>(); | ||
JSONArray listofPart = (JSONArray) method.get("Method"); | ||
|
||
Iterator<JSONObject> iteratorPart = listofPart.iterator(); | ||
while(iteratorPart.hasNext()){ | ||
// loop part array | ||
JSONObject part = iteratorPart.next(); | ||
//Extract name of part | ||
String title_part = (String) part.get("Title"); | ||
//Extract order of part | ||
int order_part = (int) (long) part.get("Order"); | ||
|
||
//Extract list of steps | ||
ArrayList<Step> listStep = new ArrayList<>(); | ||
JSONArray listofStep = (JSONArray) part.get("Part"); | ||
|
||
Iterator<JSONObject> iteratorStep = listofStep.iterator(); | ||
while(iteratorStep.hasNext()){ | ||
//loop step array | ||
JSONObject step = iteratorStep.next(); | ||
//Extract main action | ||
String main_act = (String) step.get("Main_act"); | ||
//Extract order of step | ||
int order_step = (int) (long) step.get("Order"); | ||
//Extract detail action | ||
String detail_act = (String) step.get("Detail_act"); | ||
//Extract link of image | ||
String image = (String) step.get("Image"); | ||
|
||
Step newStep = new Step(order_step, main_act, detail_act, image); | ||
listStep.add(newStep); | ||
} | ||
Part newPart = new Part(order_part, title_part, listStep); | ||
listPart.add(newPart); | ||
} | ||
Method newMethod = new Method(order_method, title_method, listPart); | ||
answerJ.add(newMethod); | ||
} | ||
|
||
Question newQuestion = new Question(title, exp, answerJ, | ||
category, link, tips, warnings, video, things, ingredients, views, rate); | ||
return newQuestion; | ||
} | ||
|
||
|
||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,6 @@ | ||
package kb.howtokb.taskframe; | ||
|
||
|
||
/**
 * Contract for an object that exposes an integer identifier.
 */
public interface BasicDataPt {

    /**
     * Returns the identifier of this object.
     *
     * @return the identifier
     */
    int getID();
}
Oops, something went wrong.