Skip to content

Commit

Permalink
Browse files Browse the repository at this point in the history
Update WikiHow Task Frame Reader
  • Loading branch information
cxchu committed Feb 23, 2017
1 parent a45ba1c commit d956641
Show file tree
Hide file tree
Showing 12 changed files with 172,714 additions and 250 deletions.
3,071 changes: 3,071 additions & 0 deletions data/wikihow-id-category.txt

Large diffs are not rendered by default.

168,697 changes: 168,697 additions & 0 deletions data/wikihow-id-url

Large diffs are not rendered by default.

140 changes: 140 additions & 0 deletions src/main/java/kb/howtokb/extractor/TaskFrameReader.java
@@ -0,0 +1,140 @@
package kb.howtokb.extractor;

import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Iterator;

import org.json.simple.JSONArray;
import org.json.simple.JSONObject;
import org.json.simple.parser.JSONParser;
import org.json.simple.parser.ParseException;

import kb.howtokb.taskframe.WikiHowTask;
import kb.howtokb.taskframe.WikiHowTaskFrame;

/**
 * Reads WikiHow task frames that were serialized as JSON.
 *
 * <p>The file-based entry point expects one JSON object per line; the two
 * {@code jsonToWikiHowTaskFrame} overloads convert a single JSON string or an
 * already parsed {@link JSONObject} into a {@link WikiHowTaskFrame}.
 */
public class TaskFrameReader {

    // Parser shared by every call to jsonToWikiHowTaskFrame(String).
    // NOTE(review): this is a single mutable instance reused across calls;
    // json-simple's JSONParser presumably keeps per-parse state, so concurrent
    // callers may need their own parser - confirm before using from threads.
    static JSONParser parser = new JSONParser();

    // ======================================================================
    /**
     * Returns the list of task frames read from a file that holds one JSON
     * object per line.
     *
     * <p>The file is decoded as UTF-8 (the encoding JSON interchange uses)
     * instead of the platform default charset, so the result no longer depends
     * on the machine the code runs on. Lines that are empty or contain only
     * whitespace are skipped, which makes a trailing newline at the end of the
     * file harmless.
     *
     * @param directory
     *            path of the file that stores the task frames in JSON format
     * @return a list with one task frame per non-blank line, in file order
     * @throws IOException
     *             if the file cannot be opened or read
     * @throws ClassNotFoundException
     *             never thrown by this implementation; kept in the signature
     *             so existing callers keep compiling
     * @throws ParseException
     *             if a non-blank line is not valid JSON
     */
    public static ArrayList<WikiHowTaskFrame> extractWikiHowTaskFrameFromJSONFile(String directory)
            throws IOException, ClassNotFoundException, ParseException {
        ArrayList<WikiHowTaskFrame> allframe = new ArrayList<>();

        try (BufferedReader br = new BufferedReader(new java.io.InputStreamReader(
                new java.io.FileInputStream(directory), java.nio.charset.StandardCharsets.UTF_8))) {
            String line;
            while ((line = br.readLine()) != null) {
                if (line.trim().isEmpty()) {
                    // Nothing to parse; an empty string is not a JSON document.
                    continue;
                }
                allframe.add(jsonToWikiHowTaskFrame(line));
            }
        }
        return allframe;
    }

    /**
     * Parses a JSON string and converts it into a task frame.
     *
     * @param jsonStr
     *            a JSON object in textual form
     * @return the task frame described by {@code jsonStr}
     * @throws ParseException
     *             if {@code jsonStr} is not valid JSON
     */
    public static WikiHowTaskFrame jsonToWikiHowTaskFrame(String jsonStr) throws ParseException {
        return jsonToWikiHowTaskFrame((JSONObject) parser.parse(jsonStr));
    }

    /**
     * Converts a parsed JSON object into a task frame.
     *
     * <p>The frame level keys read are {@code id}, {@code location},
     * {@code temporal}, {@code part-agent}, {@code part-object} and
     * {@code activity}. An array key that is absent is treated as an empty
     * list. Numeric fields accept any JSON number, so an integer-valued
     * {@code rating} such as {@code 5} is read as {@code 5.0} instead of
     * failing with a {@link ClassCastException}.
     *
     * @param obj
     *            the JSON object describing one task frame
     * @return the corresponding task frame
     * @throws NullPointerException
     *             if the {@code activity} object or one of the numeric fields
     *             is missing
     * @throws ClassCastException
     *             if a field holds a value of an unexpected JSON type
     */
    public static WikiHowTaskFrame jsonToWikiHowTaskFrame(JSONObject obj) {
        int idframe = intField(obj, "id");

        // Frame level participant lists.
        ArrayList<String> location = stringList(obj, "location");
        ArrayList<String> temporal = stringList(obj, "temporal");
        ArrayList<String> partA = stringList(obj, "part-agent");
        ArrayList<String> partO = stringList(obj, "part-object");

        // The nested activity object carries the task itself.
        JSONObject actJ = (JSONObject) obj.get("activity");
        int id = intField(actJ, "id");
        String verb = (String) actJ.get("verb");
        String object = (String) actJ.get("object");
        String oriverb = (String) actJ.get("ori-verb");
        String oriobject = (String) actJ.get("ori-object");
        String image = (String) actJ.get("image");
        String cateid = (String) actJ.get("categoryid");
        String linkid = (String) actJ.get("linkid");
        int view = intField(actJ, "view");
        double rate = doubleField(actJ, "rating");
        int clusterid = intField(actJ, "clusterid");
        String clustername = (String) actJ.get("clustername");
        String video = (String) actJ.get("video");
        String parent = (String) actJ.get("parent");
        String prev = (String) actJ.get("prev");
        String next = (String) actJ.get("next");

        // Ids of the sub-activities.
        ArrayList<Integer> children = new ArrayList<>();
        for (Object child : arrayOrEmpty(actJ, "sub-activity")) {
            children.add(((Number) child).intValue());
        }

        WikiHowTask newact = new WikiHowTask(id, verb, object, oriverb, oriobject, cateid, linkid, rate, view,
                image, clusterid, clustername, video, children, parent, prev, next);

        return new WikiHowTaskFrame(idframe, newact, location, temporal, partA, partO);
    }

    /** Returns the array stored under {@code key}, or an empty array when the key is absent. */
    private static JSONArray arrayOrEmpty(JSONObject obj, String key) {
        JSONArray array = (JSONArray) obj.get(key);
        return array != null ? array : new JSONArray();
    }

    /**
     * Copies the array stored under {@code key} into a list of strings.
     * Elements are copied without a per-element type check, exactly as the
     * previous iterator based loops did.
     */
    @SuppressWarnings("unchecked")
    private static ArrayList<String> stringList(JSONObject obj, String key) {
        return new ArrayList<String>(arrayOrEmpty(obj, key));
    }

    /** Reads a numeric field as {@code int}; accepts any JSON number. */
    private static int intField(JSONObject obj, String key) {
        return ((Number) obj.get(key)).intValue();
    }

    /** Reads a numeric field as {@code double}; accepts integer-valued JSON numbers too. */
    private static double doubleField(JSONObject obj, String key) {
        return ((Number) obj.get(key)).doubleValue();
    }
}
208 changes: 208 additions & 0 deletions src/main/java/kb/howtokb/extractor/WikiHowArticleReader.java
@@ -0,0 +1,208 @@
package kb.howtokb.extractor;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.io.ObjectInputStream;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Set;
import java.util.zip.GZIPInputStream;

import org.json.simple.JSONArray;
import org.json.simple.JSONObject;
import org.json.simple.parser.JSONParser;
import org.json.simple.parser.ParseException;

import kb.howtokb.global.Global;
import kb.howtokb.wkhobject.Category;
import kb.howtokb.wkhobject.Ingredients;
import kb.howtokb.wkhobject.Method;
import kb.howtokb.wkhobject.Part;
import kb.howtokb.wkhobject.Question;
import kb.howtokb.wkhobject.Step;
import kb.howtokb.wkhobject.Things;

/**
 * Reads WikiHow articles that were serialized as JSON, one article per line,
 * and converts them into {@link Question} objects.
 */
public class WikiHowArticleReader {

    /**
     * Returns the list of articles read from a file that holds one JSON object
     * per line.
     *
     * <p>The file is decoded as UTF-8 (the encoding JSON interchange uses)
     * instead of the platform default charset. Lines that are empty or contain
     * only whitespace are skipped, so a trailing newline does not abort the
     * read.
     *
     * @param directory path of the file that stores the articles in JSON format
     * @return a list with one article per non-blank line, in file order
     * @throws IOException if the file cannot be opened or read
     * @throws ClassNotFoundException never thrown by this implementation; kept
     *         in the signature so existing callers keep compiling
     * @throws ParseException if a non-blank line is not valid JSON
     */
    public static ArrayList<Question> WikiHowArticleReaderFromJSONFile(String directory)
            throws IOException, ClassNotFoundException, ParseException {
        ArrayList<Question> allQuestions = new ArrayList<>();

        JSONParser parser = new JSONParser();
        try (BufferedReader br = new BufferedReader(new java.io.InputStreamReader(
                new FileInputStream(directory), java.nio.charset.StandardCharsets.UTF_8))) {
            String line;
            while ((line = br.readLine()) != null) {
                if (line.trim().isEmpty()) {
                    // Nothing to parse; an empty string is not a JSON document.
                    continue;
                }
                allQuestions.add(jsonToQuestion((JSONObject) parser.parse(line)));
            }
        }
        return allQuestions;
    }

    // ======================================================================
    /**
     * Converts a JSON object into a {@link Question}.
     *
     * <p>The keys read are {@code Link}, {@code Title}, {@code Explanation},
     * {@code Views}, {@code Rate}, {@code Tips}, {@code Warnings},
     * {@code Video}, {@code Category}, {@code Things}, {@code Ingredients} and
     * {@code Answer}. An array key that is absent is treated as an empty list.
     * Numeric fields accept any JSON number, so an integer-valued {@code Rate}
     * such as {@code 5} is read as {@code 5.0} instead of failing with a
     * {@link ClassCastException}.
     *
     * @param jsonobj the JSON object describing one article
     * @return the corresponding question
     * @throws NullPointerException if a numeric field is missing
     * @throws ClassCastException if a field holds a value of an unexpected JSON type
     */
    public static Question jsonToQuestion(JSONObject jsonobj) {
        String link = (String) jsonobj.get("Link");
        String title = (String) jsonobj.get("Title");
        String exp = (String) jsonobj.get("Explanation");
        int views = intField(jsonobj, "Views");
        double rate = doubleField(jsonobj, "Rate");
        String tips = (String) jsonobj.get("Tips");
        String warnings = (String) jsonobj.get("Warnings");
        String video = (String) jsonobj.get("Video");

        ArrayList<Category> category = readCategories(jsonobj);
        ArrayList<Things> things = readThings(jsonobj);
        ArrayList<Ingredients> ingredients = readIngredients(jsonobj);
        ArrayList<Method> methods = readMethods(jsonobj);

        return new Question(title, exp, methods,
                category, link, tips, warnings, video, things, ingredients, views, rate);
    }

    /**
     * Builds one Category per entry of the "Category" array. Each category is
     * created from its own name, the name of the entry before it and the name
     * of the entry after it; the empty string stands in where there is no
     * neighbour.
     */
    private static ArrayList<Category> readCategories(JSONObject jsonobj) {
        ArrayList<String> names = stringList(jsonobj, "Category");
        ArrayList<Category> categories = new ArrayList<Category>();
        int last = names.size() - 1;
        for (int k = 0; k <= last; k++) {
            String before = k > 0 ? names.get(k - 1) : "";
            String after = k < last ? names.get(k + 1) : "";
            categories.add(new Category(names.get(k), before, after));
        }
        return categories;
    }

    /** Reads the "Things" array: each entry has a "Title" and its own "Things" string list. */
    private static ArrayList<Things> readThings(JSONObject jsonobj) {
        ArrayList<Things> things = new ArrayList<Things>();
        for (Object element : arrayOrEmpty(jsonobj, "Things")) {
            JSONObject thing = (JSONObject) element;
            String heading = (String) thing.get("Title");
            things.add(new Things(heading, stringList(thing, "Things")));
        }
        return things;
    }

    /** Reads the "Ingredients" array: each entry has a "Title" and its own "Ingredients" string list. */
    private static ArrayList<Ingredients> readIngredients(JSONObject jsonobj) {
        ArrayList<Ingredients> ingredients = new ArrayList<Ingredients>();
        for (Object element : arrayOrEmpty(jsonobj, "Ingredients")) {
            JSONObject ingredient = (JSONObject) element;
            String heading = (String) ingredient.get("Title");
            ingredients.add(new Ingredients(heading, stringList(ingredient, "Ingredients")));
        }
        return ingredients;
    }

    /** Reads the "Answer" array: each entry is a method with "Title", "Order" and a "Method" array of parts. */
    private static ArrayList<Method> readMethods(JSONObject jsonobj) {
        ArrayList<Method> methods = new ArrayList<Method>();
        for (Object element : arrayOrEmpty(jsonobj, "Answer")) {
            JSONObject method = (JSONObject) element;
            String heading = (String) method.get("Title");
            int order = intField(method, "Order");
            methods.add(new Method(order, heading, readParts(method)));
        }
        return methods;
    }

    /** Reads the "Method" array of a method: each entry is a part with "Title", "Order" and a "Part" array of steps. */
    private static ArrayList<Part> readParts(JSONObject method) {
        ArrayList<Part> parts = new ArrayList<>();
        for (Object element : arrayOrEmpty(method, "Method")) {
            JSONObject part = (JSONObject) element;
            String heading = (String) part.get("Title");
            int order = intField(part, "Order");
            parts.add(new Part(order, heading, readSteps(part)));
        }
        return parts;
    }

    /** Reads the "Part" array of a part: each entry is a step with "Main_act", "Order", "Detail_act" and "Image". */
    private static ArrayList<Step> readSteps(JSONObject part) {
        ArrayList<Step> steps = new ArrayList<>();
        for (Object element : arrayOrEmpty(part, "Part")) {
            JSONObject step = (JSONObject) element;
            String mainAct = (String) step.get("Main_act");
            int order = intField(step, "Order");
            String detailAct = (String) step.get("Detail_act");
            String image = (String) step.get("Image");
            steps.add(new Step(order, mainAct, detailAct, image));
        }
        return steps;
    }

    /** Returns the array stored under {@code key}, or an empty array when the key is absent. */
    private static JSONArray arrayOrEmpty(JSONObject obj, String key) {
        JSONArray array = (JSONArray) obj.get(key);
        return array != null ? array : new JSONArray();
    }

    /**
     * Copies the array stored under {@code key} into a list of strings.
     * Elements are copied without a per-element type check, exactly as the
     * previous iterator based loops did.
     */
    @SuppressWarnings("unchecked")
    private static ArrayList<String> stringList(JSONObject obj, String key) {
        return new ArrayList<String>(arrayOrEmpty(obj, key));
    }

    /** Reads a numeric field as {@code int}; accepts any JSON number. */
    private static int intField(JSONObject obj, String key) {
        return ((Number) obj.get(key)).intValue();
    }

    /** Reads a numeric field as {@code double}; accepts integer-valued JSON numbers too. */
    private static double doubleField(JSONObject obj, String key) {
        return ((Number) obj.get(key)).doubleValue();
    }

}
6 changes: 6 additions & 0 deletions src/main/java/kb/howtokb/taskframe/BasicDataPt.java
@@ -0,0 +1,6 @@
package kb.howtokb.taskframe;


/**
 * Contract for objects that expose an integer identifier.
 */
public interface BasicDataPt {

    /**
     * Returns the identifier of this object.
     *
     * @return the identifier as an {@code int}
     */
    int getID();
}

0 comments on commit d956641

Please sign in to comment.