Skip to content
Permalink
0d82ff1dc4
Switch branches/tags

Name already in use

A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
Go to file
 
 
Cannot retrieve contributors at this time
229 lines (206 sloc) 8.09 KB
package kb.howtokb.taskframe.extractor;
import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.List;
import java.util.Locale;
import java.util.regex.Pattern;
import edu.knowitall.openie.Extraction;
import edu.knowitall.openie.Instance;
import edu.knowitall.openie.OpenIE;
import edu.knowitall.tool.parse.ClearParser;
import edu.knowitall.tool.postag.ClearPostagger;
import edu.knowitall.tool.srl.ClearSrl;
import edu.knowitall.tool.tokenize.ClearTokenizer;
import edu.stanford.nlp.util.Pair;
import kb.howtokb.global.Global;
import scala.collection.JavaConversions;
import scala.collection.Seq;
/**
 * Runs OpenIE over a sentence and keeps the extractions whose confidence
 * reaches {@link #threshold}. When a sentence yields no confident extraction
 * it is retried once with a dummy subject prepended (see
 * {@link #addSubject(String)}).
 */
public class TextToOpenIEResult {

    /** Minimum OpenIE confidence an extraction needs in order to be kept (chosen from experience). */
    static double threshold = 0.45;

    /** Matches a single ASCII letter; not referenced inside this class. */
    static Pattern pattern = Pattern.compile("[a-zA-Z]");

    /** Noun/verb pairs read lazily from the {@code noun-verb.txt} classpath resource. */
    static ArrayList<NounVerb> nvList = new ArrayList<>();

    static StanfordPOSTagger tagger = new StanfordPOSTagger();

    static OpenIE openIE = new OpenIE(new ClearParser(new ClearPostagger(new ClearTokenizer())), new ClearSrl(), false, false);

    /**
     * Extracts the original triples of a sentence using the OpenIE tool.
     *
     * @param sent the sentence to process; {@code null} or empty yields an empty result
     * @return a pair whose first element is the number of raw OpenIE instances produced by
     *         the pass whose results were used (the first pass when it gave at least one
     *         confident extraction, otherwise the dummy-subject retry) and whose second
     *         element is the list of extractions with confidence {@code >= threshold}
     * @throws IOException if the noun/verb dictionary needed for the retry cannot be read
     */
    public Pair<Integer, ArrayList<Extraction>> extractOriTriple(String sent) throws IOException {
        ArrayList<Extraction> kept = new ArrayList<>();
        int rawCount = extractConfident(sent, kept);
        return new Pair<Integer, ArrayList<Extraction>>(rawCount, kept);
    }

    /**
     * Extracts triples of a sentence using the OpenIE tool and renders them as strings.
     *
     * @param sent the sentence to process; {@code null} or empty yields an empty list
     * @return the {@code tripleString()} of every extraction with confidence
     *         {@code >= threshold}
     * @throws IOException if the noun/verb dictionary needed for the retry cannot be read
     */
    public static ArrayList<String> extractTriple(String sent) throws IOException {
        ArrayList<Extraction> kept = new ArrayList<>();
        extractConfident(sent, kept);
        ArrayList<String> triples = new ArrayList<>(kept.size());
        for (Extraction extraction : kept) {
            triples.add(extraction.tripleString());
        }
        return triples;
    }

    /**
     * Shared extraction pipeline: normalises the sentence, runs OpenIE, and retries once
     * with a dummy subject when the first pass leaves {@code kept} empty.
     *
     * @param sent the raw sentence
     * @param kept empty list that receives the confident extractions
     * @return number of raw instances of the pass whose results ended up in {@code kept}
     *         (0 for a {@code null} or empty sentence)
     */
    private static int extractConfident(String sent, List<Extraction> kept) throws IOException {
        if (sent == null || sent.isEmpty()) {
            // Nothing to extract; also avoids substring(0, 1) failing on "".
            return 0;
        }
        // Keep the first character as written, lowercase the rest.
        String normalized = sent.substring(0, 1) + sent.substring(1).toLowerCase(Locale.ROOT);
        int rawCount = collectAboveThreshold(normalized, kept);
        if (kept.isEmpty()) {
            // Either OpenIE returned nothing or everything was below the threshold:
            // retry with a dummy subject and report the retry's raw count instead.
            rawCount = collectAboveThreshold(addSubject(normalized), kept);
        }
        return rawCount;
    }

    /**
     * Runs OpenIE once and appends every extraction with confidence {@code >= threshold}.
     *
     * @return the number of instances OpenIE returned, before filtering
     */
    private static int collectAboveThreshold(String sent, List<Extraction> kept) {
        Seq<Instance> extractions = openIE.extract(sent);
        List<Instance> instances = JavaConversions.seqAsJavaList(extractions);
        for (Instance instance : instances) {
            if (instance.confidence() >= threshold) {
                kept.add(instance.extr());
            }
        }
        return instances.size();
    }

    /**
     * Adds a dummy subject ({@code Global.DUMMY_SUBJECT}) in front of a sentence, based on
     * the POS tag of its first token.
     * <ul>
     * <li>Tag containing {@code VB}, or equal to {@code MD} or {@code RB}: the lowercased
     *     sentence is prefixed with the dummy subject.</li>
     * <li>Tag containing {@code NN}, or equal to {@code JJ}: the first word is first mapped
     *     through {@link #nounToVerb(String)}, then the lowercased sentence is prefixed.</li>
     * <li>Any other tag: the sentence is returned unchanged.</li>
     * </ul>
     *
     * @param sent the sentence; {@code null} or empty is returned as is
     * @return the sentence with the dummy subject added, or the input when no rule applies
     * @throws IOException if the noun/verb dictionary cannot be read
     */
    public static String addSubject(String sent) throws IOException {
        if (sent == null || sent.isEmpty()) {
            return sent;
        }
        String tagSent = tagger.tag(sent);
        String firstTag = tagger.getTag(tagSent);
        String firstWord = tagger.getToken(tagSent);

        if (firstTag.contains("VB") || firstTag.equals("MD") || firstTag.equals("RB")) {
            return Global.DUMMY_SUBJECT + " " + sent.toLowerCase(Locale.ROOT);
        }
        if (firstTag.contains("NN") || firstTag.equals("JJ")) {
            // The sentence does not start with a verb; it may start with the noun form
            // of one, so swap that first word for its verb form before prefixing.
            String rest = sent.substring(firstWord.length());
            String rewritten = nounToVerb(firstWord) + rest;
            return Global.DUMMY_SUBJECT + " " + rewritten.toLowerCase(Locale.ROOT);
        }
        return sent;
    }

    /**
     * Converts a noun to its verb form using the {@code noun-verb.txt} dictionary.
     * A dictionary entry is used only when its noun equals the lowercased input, its verb
     * differs from the noun, and noun and verb share their first two characters.
     *
     * @param noun the word to convert; {@code null} or empty is returned as is
     * @return the verb of the first matching dictionary entry, or {@code noun} unchanged
     *         when there is none
     * @throws IOException if the dictionary resource is missing or cannot be read
     */
    public static String nounToVerb(String noun) throws IOException {
        if (noun == null || noun.isEmpty()) {
            return noun;
        }
        if (nvList.isEmpty()) {
            // Load into a temporary list first so a failed read never leaves a
            // half-filled cache behind.
            nvList.addAll(loadNounVerbPairs());
        }
        String lowered = noun.toLowerCase(Locale.ROOT);
        for (NounVerb nv : nvList) {
            String nounForm = nv.getNoun();
            String verbForm = nv.getVerb();
            // regionMatches is false (rather than throwing) when either form is
            // shorter than two characters.
            if (nounForm.equals(lowered)
                    && !nounForm.equals(verbForm)
                    && nounForm.regionMatches(0, verbForm, 0, 2)) {
                return verbForm;
            }
        }
        return noun;
    }

    /**
     * Reads the tab-separated {@code noun-verb.txt} resource (noun in column one, verb in
     * column two) through the context class loader. Lines with fewer than two columns are
     * skipped.
     *
     * @throws IOException if the resource is not on the classpath or reading fails
     */
    private static List<NounVerb> loadNounVerbPairs() throws IOException {
        ClassLoader classLoader = Thread.currentThread().getContextClassLoader();
        List<NounVerb> pairs = new ArrayList<>();
        try (InputStream inputs = classLoader.getResourceAsStream("noun-verb.txt")) {
            if (inputs == null) {
                throw new IOException("Resource noun-verb.txt not found on the classpath");
            }
            BufferedReader br = new BufferedReader(new InputStreamReader(inputs, "UTF-8"));
            String line;
            while ((line = br.readLine()) != null) {
                String[] couple = line.split("\t");
                if (couple.length < 2) {
                    continue;
                }
                pairs.add(new NounVerb(couple[0], couple[1]));
            }
        }
        return pairs;
    }
}