Permalink
Cannot retrieve contributors at this time
Name already in use
A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
HowToKB/src/kb/howtokb/taskframe/extractor/TextToOpenIEResult.java
Go to fileThis commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
229 lines (206 sloc)
8.09 KB
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package kb.howtokb.taskframe.extractor; | |
import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;
import java.util.Locale;
import java.util.regex.Pattern;
import edu.knowitall.openie.Extraction;
import edu.knowitall.openie.Instance;
import edu.knowitall.openie.OpenIE;
import edu.knowitall.tool.parse.ClearParser;
import edu.knowitall.tool.postag.ClearPostagger;
import edu.knowitall.tool.srl.ClearSrl;
import edu.knowitall.tool.tokenize.ClearTokenizer;
import edu.stanford.nlp.util.Pair;
import kb.howtokb.global.Global;
import scala.collection.JavaConversions;
import scala.collection.Seq;
public class TextToOpenIEResult { | |
static double threshold = 0.45; //learning from experience | |
static Pattern pattern = Pattern.compile("[a-zA-Z]"); | |
static ArrayList<NounVerb> nvList = new ArrayList<>(); | |
static StanfordPOSTagger tagger = new StanfordPOSTagger(); | |
static OpenIE openIE = new OpenIE(new ClearParser(new ClearPostagger(new ClearTokenizer())), new ClearSrl(), false, false); | |
// public static void main(String[] args) throws FileNotFoundException, IOException { | |
// | |
//// String input = Global.ALL_ACT_FILE; | |
//// String result = Global.ACT_EXTRACT_FILE; | |
//// String onlytriple = Global.ONLY_ACT_EXTRACT_FILE; | |
//// tripleToFile(input, result, onlytriple); | |
// | |
// String inputtag = Global.ALL_ACT_FILE_TAG; | |
// String resulttag = Global.ACT_EXTRACT_FILE_TAG; | |
// String onlytripletag = Global.ONLY_ACT_EXTRACT_FILE_TAG; | |
// tripleToFileTag(inputtag, resulttag, onlytripletag); | |
// | |
// } | |
//Running extraction, return list of extraction | |
/** | |
* Extracting original triple using Open Ie tool | |
* @param sentence | |
* @return pair: number of extractions and a list of extractions | |
* @throws IOException | |
*/ | |
public Pair<Integer, ArrayList<Extraction>> extractOriTriple(String sent) throws IOException{ | |
sent = sent.substring(0, 1) + sent.substring(1).toLowerCase(); | |
ArrayList<Extraction> listTriple = new ArrayList<>(); | |
Seq<Instance> extractions = openIE.extract(sent); | |
List<Instance> list_extractions = JavaConversions.seqAsJavaList(extractions); | |
int num_ext = 0; | |
//If we got extraction from OpenIE | |
if (list_extractions.size() > 0){ | |
for(Instance instance : list_extractions) { | |
if (instance.confidence() >= threshold){ | |
listTriple.add(instance.extr()); | |
} | |
} | |
//Even we got triple from OpenIE, maybe all triples has conf < threshold | |
//We can add subject | |
if (listTriple.size() == 0){ | |
sent = addSubject(sent); | |
Seq<Instance> extractions2 = openIE.extract(sent); | |
List<Instance> list_extractions2 = JavaConversions.seqAsJavaList(extractions2); | |
num_ext += list_extractions2.size(); | |
if (list_extractions2.size() > 0){ | |
for(Instance instance : list_extractions2) { | |
if (instance.confidence() >= threshold){ | |
listTriple.add(instance.extr()); | |
} | |
} | |
} | |
}else num_ext += list_extractions.size(); | |
}else{ | |
//We dont get extraction from OpenIE | |
//We have to add dummy subject | |
sent = addSubject(sent); | |
Seq<Instance> extractions2 = openIE.extract(sent); | |
List<Instance> list_extractions2 = JavaConversions.seqAsJavaList(extractions2); | |
num_ext += list_extractions2.size(); | |
if (list_extractions2.size() > 0){ | |
for(Instance instance : list_extractions2) { | |
if (instance.confidence() >= threshold){ | |
listTriple.add(instance.extr()); | |
} | |
} | |
} | |
} | |
//System.out.println(list_extractions.size() + "\t" + listTriple.size()); | |
return new Pair<Integer, ArrayList<Extraction>>(num_ext, listTriple); | |
} | |
//Running extraction, return list of extraction to string | |
/** | |
* Extracting triple using open IE tools | |
* @param sentence | |
* @return list of triples which have confidence greater than the give threshold | |
* @throws IOException | |
*/ | |
public static ArrayList<String> extractTriple(String sent) throws IOException{ | |
sent = sent.substring(0, 1) + sent.substring(1).toLowerCase(); | |
ArrayList<String> listTriple = new ArrayList<>(); | |
Seq<Instance> extractions = openIE.extract(sent); | |
List<Instance> list_extractions = JavaConversions.seqAsJavaList(extractions); | |
//If we got extraction from OpenIE | |
if (list_extractions.size() > 0){ | |
for(Instance instance : list_extractions) { | |
if (instance.confidence() >= threshold){ | |
listTriple.add(instance.extr().tripleString()); | |
} | |
} | |
//Even we got triple from OpenIE, maybe all triples has conf < threshold | |
//We can add subject | |
if (listTriple.size() == 0){ | |
sent = addSubject(sent); | |
Seq<Instance> extractions2 = openIE.extract(sent); | |
List<Instance> list_extractions2 = JavaConversions.seqAsJavaList(extractions2); | |
if (list_extractions2.size() > 0){ | |
for(Instance instance : list_extractions2) { | |
if (instance.confidence() >= threshold){ | |
listTriple.add(instance.extr().tripleString()); | |
} | |
} | |
} | |
} | |
}else{ | |
//We dont get extraction from OpenIE | |
//We have to add dummy subject | |
sent = addSubject(sent); | |
Seq<Instance> extractions2 = openIE.extract(sent); | |
List<Instance> list_extractions2 = JavaConversions.seqAsJavaList(extractions2); | |
if (list_extractions2.size() > 0){ | |
for(Instance instance : list_extractions2) { | |
if (instance.confidence() >= threshold){ | |
listTriple.add(instance.extr().tripleString()); | |
} | |
} | |
} | |
} | |
return listTriple; | |
} | |
//Add dummy subject | |
/** | |
* Add dummy subject | |
* @param sentence | |
* @return sentence with dummy subject added | |
* @throws IOException | |
*/ | |
public static String addSubject(String sent) throws IOException{ | |
String tagSent = tagger.tag(sent); | |
String firstTag = tagger.getTag(tagSent); | |
String firstWord = tagger.getToken(tagSent); | |
//System.out.println(firstWord); | |
//If tagger starts with VB | |
if (firstTag.contains("VB") || firstTag.equals("MD") || firstTag.equals("RB")){ | |
sent = Global.DUMMY_SUBJECT + " " + sent.toLowerCase(); | |
}else if (firstTag.contains("NN") || firstTag.equals("JJ")){ | |
//The sentence dont have verb at start | |
//Maybe it start with noun form of verb | |
//We need transfer this form to verb form | |
String tempSent = sent.substring(firstWord.length()); | |
sent = nounToVerb(firstWord) + tempSent; | |
sent = Global.DUMMY_SUBJECT + " " + sent.toLowerCase(); | |
} | |
return sent; | |
} | |
//Transfer nounform to verbform | |
/** | |
* Convert noun to verb | |
* @param noun | |
* @return | |
* @throws IOException | |
*/ | |
public static String nounToVerb(String noun) throws IOException{ | |
String verb = noun; | |
//Read dictionary file | |
if (nvList.size() == 0){ | |
ClassLoader classLoader = Thread.currentThread().getContextClassLoader(); | |
InputStream inputs = classLoader.getResourceAsStream("noun-verb.txt"); | |
BufferedReader br = new BufferedReader(new InputStreamReader(inputs, "UTF-8")); | |
String sCurrentLine; | |
while ((sCurrentLine = br.readLine()) != null) { | |
//nounList.add(sCurrentLine); | |
String[] couple = sCurrentLine.split("\t"); | |
//System.out.println(sCurrentLine); | |
nvList.add(new NounVerb(couple[0], couple[1])); | |
} | |
br.close(); | |
} | |
for (NounVerb nv: nvList){ | |
if (nv.getNoun().equals(noun.toLowerCase())){ | |
if (!nv.getNoun().equals(nv.getVerb()) | |
&& nv.getNoun().substring(0, 2).equals(nv.getVerb().substring(0, 2))){ | |
verb = nv.getVerb(); | |
break; | |
} | |
// else if (!nv.getNoun().equals(nv.getVerb())){ | |
// tmpV2 = nv.getVerb(); | |
// } | |
} | |
} | |
// if (verb.equals(noun.toLowerCase())) | |
// verb = tmpV2; | |
return verb; | |
} | |
} |