/*
 * NOTE: this listing was captured from a GitHub file view (permalink commit ac4b295237).
 * The page navigation text that preceded the code has been folded into this comment so
 * that the file is valid Java. Size of the original listing: 254 lines (226 sloc), 7.68 KB.
 */
package kb.howtokb.tools;
import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import org.json.simple.JSONObject;
import org.json.simple.parser.JSONParser;
import org.json.simple.parser.ParseException;
import kb.howtokb.reader.TaskFrameReader;
import kb.howtokb.taskframe.WikiHowTaskFrame;
import kb.howtokb.utils.AutoMap;
import kb.howtokb.utils.SQLiteJDBCConnector;
import kb.howtokb.wkhobject.Category_Json;
/**
 * Lookup and loading helpers for the WikiHow knowledge base.
 *
 * <p>The class offers three groups of operations:
 * <ul>
 *   <li>id/category/url lookups backed by tab-separated classpath resources, loaded lazily
 *       on first use and cached in static maps;</li>
 *   <li>readers that turn a file with one JSON object per line into
 *       {@link WikiHowTaskFrame} instances;</li>
 *   <li>queries issued through {@link SQLiteJDBCConnector#q}.</li>
 * </ul>
 *
 * <p>The lazy caches are populated without any synchronization.
 */
public class InformationExtraction {

    /** Classpath resource whose lines are {@code <id>\t<category>}. */
    private static final String CATEGORY_RESOURCE = "wikihow-id-category.txt";

    /** Classpath resource whose lines are {@code <id>\t<url>}. */
    private static final String URL_RESOURCE = "wikihow-id-url";

    /** URL prefix that {@link #linkToTitle(String)} strips from article links. */
    private static final String WIKIHOW_URL_PREFIX = "http://www.wikihow.com/";

    private static Map<Integer, String> idtoCate;
    private static Map<String, Integer> catetoID;
    private static Map<Integer, String> idtoWikiURL;

    /**
     * Returns the category name registered for a category id.
     *
     * @param id category id
     * @return the category name, or {@code null} when the id is not listed in the resource
     * @throws NumberFormatException if an id column of the resource is not an integer
     * @throws IOException if the resource is missing, unreadable or has a malformed line
     */
    public static String getCategory(int id) throws NumberFormatException, IOException {
        if (idtoCate == null) {
            // Fill a local map first: the cache is only published after a complete, successful
            // load, so a failed load is retried instead of leaving an empty cache behind.
            Map<Integer, String> loaded = new HashMap<>();
            for (String[] row : readTwoColumnResource(CATEGORY_RESOURCE)) {
                loaded.put(Integer.parseInt(row[0]), row[1]);
            }
            idtoCate = loaded;
        }
        return idtoCate.get(id);
    }

    /**
     * Returns the id registered for a category name.
     *
     * @param cate category name
     * @return the category id
     * @throws IllegalArgumentException if the category name is not listed in the resource
     * @throws NumberFormatException if an id column of the resource is not an integer
     * @throws IOException if the resource is missing, unreadable or has a malformed line
     */
    public static int getCategoryID(String cate) throws NumberFormatException, IOException {
        if (catetoID == null) {
            Map<String, Integer> loaded = new HashMap<>();
            // Rows are applied in file order, so for a repeated name the last row wins.
            for (String[] row : readTwoColumnResource(CATEGORY_RESOURCE)) {
                loaded.put(row[1], Integer.parseInt(row[0]));
            }
            catetoID = loaded;
        }
        Integer id = catetoID.get(cate);
        if (id == null) {
            // Unboxing a missing entry would otherwise fail with a message-less NullPointerException.
            throw new IllegalArgumentException("Unknown category: " + cate);
        }
        return id;
    }

    /**
     * Returns the WikiHow URL registered for an id.
     *
     * @param id id to look up
     * @return the url, or {@code null} when the id is not listed in the resource
     * @throws NumberFormatException if an id column of the resource is not an integer
     * @throws IOException if the resource is missing, unreadable or has a malformed line
     */
    public static String getWikiURLStringFromFile(int id) throws NumberFormatException, IOException {
        if (idtoWikiURL == null) {
            Map<Integer, String> loaded = new HashMap<>();
            for (String[] row : readTwoColumnResource(URL_RESOURCE)) {
                loaded.put(Integer.parseInt(row[0]), row[1]);
            }
            idtoWikiURL = loaded;
        }
        return idtoWikiURL.get(id);
    }

    /**
     * Reads every task frame from a file that holds one JSON object per line.
     *
     * @param inputfile path of the file to read (decoded as UTF-8)
     * @return the frames in file order
     * @throws IOException if the file cannot be opened or read
     * @throws ParseException if a line is not valid JSON
     */
    public static List<WikiHowTaskFrame> getAllFrame(String inputfile) throws IOException, ParseException {
        System.out.println("Reading json file.......");
        List<WikiHowTaskFrame> allframe = readFrames(inputfile);
        System.out.println("Done! Total number of instances: " + allframe.size());
        return allframe;
    }

    /**
     * Selects the frames whose activity category id is among the ids returned by
     * {@link #getListofAllChildren(int)} for the given category.
     *
     * @param allframe frames to filter
     * @param category category name, resolved through {@link #getCategoryID(String)}
     * @return the matching frames, in the order they appear in {@code allframe}
     * @throws Exception if the category cannot be resolved, the database query fails, or a
     *         frame's category id is not an integer
     */
    public List<WikiHowTaskFrame> getFrameFromCategory(List<WikiHowTaskFrame> allframe, String category) throws Exception {
        System.out.println("Get a list of activities in a same category " + category);
        // A hash set keeps the per-frame membership test constant-time.
        Set<Integer> categoryIds = new HashSet<>(getListofAllChildren(getCategoryID(category)));
        List<WikiHowTaskFrame> list = new ArrayList<>();
        for (WikiHowTaskFrame frame : allframe) {
            if (categoryIds.contains(Integer.parseInt(frame.getActivity().getCategoryID()))) {
                list.add(frame);
            }
        }
        System.out.println("Number of activities in a same category: " + list.size());
        return list;
    }

    /**
     * Returns the ids of all categories whose stored root path contains the given id.
     *
     * <p>Root paths are read from the {@code json} column of table {@code categoryjson},
     * e.g. {@code "rootpath":[57,54,52,150,1]}. Rows whose JSON cannot be converted are
     * reported on standard output and skipped.
     *
     * @param id category id to search for in the root paths
     * @return ids of the matching categories (no duplicates, since they are map keys)
     * @throws SQLException if the query or reading its result fails
     * @throws ClassNotFoundException propagated from {@link SQLiteJDBCConnector#q}
     * @throws IOException propagated from {@link SQLiteJDBCConnector#q}
     */
    public static List<Integer> getListofAllChildren(int id) throws SQLException, ClassNotFoundException, IOException {
        Map<Integer, List<Integer>> parentChains = new AutoMap<>();
        // Close the result set once all rows are consumed instead of leaking it.
        try (ResultSet rs = SQLiteJDBCConnector.q("select id, json from categoryjson")) {
            while (rs.next()) {
                try {
                    parentChains.put(rs.getInt(1), Category_Json.fromJson(rs.getString(2)).getRootpath());
                } catch (Exception e) {
                    // Best effort: one bad row must not abort the whole scan.
                    System.out.print("\n---- JSONException in category: " + rs.getInt(1));
                }
            }
        }
        List<Integer> res = new ArrayList<>();
        for (Entry<Integer, List<Integer>> e : parentChains.entrySet()) {
            List<Integer> rootpath = e.getValue();
            // Map keys are unique, so no duplicate check on the result list is needed.
            if (rootpath != null && rootpath.contains(id)) {
                res.add(e.getKey());
            }
        }
        return res;
    }

    /**
     * Reads every task frame from a file that holds one JSON object per line and indexes
     * the frames by their id. When several frames share an id, the last one read is kept.
     *
     * @param inputfile path of the file to read (decoded as UTF-8)
     * @return map from frame id to frame
     * @throws IOException if the file cannot be opened or read
     * @throws ParseException if a line is not valid JSON
     */
    public static Map<Integer, WikiHowTaskFrame> getMapFrame(String inputfile) throws IOException, ParseException {
        System.out.println("Reading json file.......");
        Map<Integer, WikiHowTaskFrame> res = new HashMap<>();
        for (WikiHowTaskFrame frame : readFrames(inputfile)) {
            res.put(frame.getID(), frame);
        }
        System.out.println("Done! Total number of instances: " + res.size());
        return res;
    }

    /**
     * Converts a WikiHow article link into a lower-case title: everything up to and including
     * the first {@code http://www.wikihow.com/} is dropped and dashes become spaces.
     *
     * @param s link or arbitrary text; may be {@code null}
     * @return the derived title; {@code s} unchanged when it does not contain the prefix;
     *         {@code null} when {@code s} is {@code null}
     */
    public static String linkToTitle(String s) {
        if (s == null) {
            return null;
        }
        int prefixStart = s.indexOf(WIKIHOW_URL_PREFIX);
        if (prefixStart < 0) {
            return s;
        }
        // Cut relative to where the prefix actually occurs; it is not guaranteed to be at index 0.
        String slug = s.substring(prefixStart + WIKIHOW_URL_PREFIX.length());
        // Locale.ROOT keeps the lower-casing independent of the JVM's default locale.
        return slug.replace('-', ' ').toLowerCase(Locale.ROOT);
    }

    /**
     * Looks up the {@code task} column of table {@code frameidtostrongactsurface} for each id.
     *
     * @param ids ids to look up; {@code null} elements are skipped
     * @return the task strings found, in the order of {@code ids}; ids without a row
     *         contribute nothing
     * @throws SQLException if a query or reading its result fails
     * @throws ClassNotFoundException propagated from {@link SQLiteJDBCConnector#q}
     * @throws IOException propagated from {@link SQLiteJDBCConnector#q}
     */
    public static List<String> getListofActivitySurfaceFromDb(List<Integer> ids) throws SQLException, ClassNotFoundException, IOException {
        List<String> res = new ArrayList<>();
        for (Integer id : ids) {
            if (id == null) {
                // "id=null" can never match a row in SQL, so the query would be wasted.
                continue;
            }
            // The concatenated value is an Integer, so it cannot carry SQL text.
            try (ResultSet rs = SQLiteJDBCConnector.q("select task from frameidtostrongactsurface where id=" + id + ";")) {
                if (rs.next()) {
                    res.add(rs.getString("task"));
                }
            }
        }
        return res;
    }

    /**
     * Reads a UTF-8 classpath resource of tab-separated lines and returns the split columns
     * of every non-blank line, in file order. Each returned row has at least two columns.
     */
    private static List<String[]> readTwoColumnResource(String resourceName) throws IOException {
        ClassLoader classLoader = Thread.currentThread().getContextClassLoader();
        if (classLoader == null) {
            // The context class loader may be unset; fall back to the loader of this class.
            classLoader = InformationExtraction.class.getClassLoader();
        }
        InputStream inputs = classLoader.getResourceAsStream(resourceName);
        if (inputs == null) {
            throw new FileNotFoundException("Classpath resource not found: " + resourceName);
        }
        List<String[]> rows = new ArrayList<>();
        try (BufferedReader reader = new BufferedReader(new InputStreamReader(inputs, StandardCharsets.UTF_8))) {
            String currentLine;
            int lineNumber = 0;
            while ((currentLine = reader.readLine()) != null) {
                lineNumber++;
                if (currentLine.trim().isEmpty()) {
                    continue;
                }
                String[] columns = currentLine.split("\t");
                if (columns.length < 2) {
                    throw new IOException("Malformed line " + lineNumber + " in " + resourceName
                            + ": expected two tab-separated columns");
                }
                rows.add(columns);
            }
        }
        return rows;
    }

    /**
     * Parses every non-blank line of a UTF-8 file as a JSON object and converts it to a
     * task frame; frames are returned in file order.
     */
    private static List<WikiHowTaskFrame> readFrames(String inputfile) throws IOException, ParseException {
        List<WikiHowTaskFrame> frames = new ArrayList<>();
        JSONParser parser = new JSONParser();
        try (BufferedReader br = new BufferedReader(
                new InputStreamReader(new FileInputStream(inputfile), StandardCharsets.UTF_8))) {
            String currentLine;
            while ((currentLine = br.readLine()) != null) {
                if (currentLine.trim().isEmpty()) {
                    // A blank line (e.g. a trailing newline) carries no JSON object.
                    continue;
                }
                JSONObject jsonObject = (JSONObject) parser.parse(currentLine);
                frames.add(TaskFrameReader.jsonToWikiHowTaskFrame(jsonObject));
            }
        }
        return frames;
    }
}