src/kb/howtokb/taskframe/extractor/TextToOpenIEResult.java

package kb.howtokb.taskframe.extractor;

import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;
import java.util.Locale;
import java.util.regex.Pattern;

import edu.knowitall.openie.Extraction;
import edu.knowitall.openie.Instance;
import edu.knowitall.openie.OpenIE;
import edu.knowitall.tool.parse.ClearParser;
import edu.knowitall.tool.postag.ClearPostagger;
import edu.knowitall.tool.srl.ClearSrl;
import edu.knowitall.tool.tokenize.ClearTokenizer;
import edu.stanford.nlp.util.Pair;
import kb.howtokb.global.Global;
import scala.collection.JavaConversions;
import scala.collection.Seq;

/**
 * Runs OpenIE over a sentence and keeps the extractions whose confidence reaches
 * {@link #threshold}. When the first pass yields nothing usable, the sentence is
 * rewritten with a dummy subject ({@link #addSubject(String)}) and extracted again.
 */
public class TextToOpenIEResult {

	/** Minimum OpenIE confidence an extraction needs in order to be kept (chosen empirically). */
	static double threshold = 0.45;
	/** Matches a single ASCII letter; not referenced inside this class. */
	static Pattern pattern = Pattern.compile("[a-zA-Z]");
	/** Noun/verb dictionary, filled lazily from the {@code noun-verb.txt} resource by {@link #nounToVerb(String)}. */
	static ArrayList<NounVerb> nvList = new ArrayList<>();

	static StanfordPOSTagger tagger = new StanfordPOSTagger();
	static OpenIE openIE = new OpenIE(new ClearParser(new ClearPostagger(new ClearTokenizer())), new ClearSrl(), false, false);

	/**
	 * Extracts the original triples of a sentence using the OpenIE tool.
	 *
	 * @param sent the sentence to process; {@code null}, empty or whitespace-only input
	 *             produces an empty result instead of an exception
	 * @return a pair whose first element is the number of instances OpenIE returned in the
	 *         pass that was finally used (the dummy-subject pass if the first pass had no
	 *         extraction reaching the threshold) and whose second element is the list of
	 *         extractions with confidence greater than or equal to the threshold
	 * @throws IOException if the noun/verb dictionary cannot be read while adding a subject
	 */
	public Pair<Integer, ArrayList<Extraction>> extractOriTriple(String sent) throws IOException {
		ArrayList<Extraction> listTriple = new ArrayList<>();
		int numExt = extractInto(sent, listTriple);
		return new Pair<Integer, ArrayList<Extraction>>(numExt, listTriple);
	}

	/**
	 * Extracts triples of a sentence using the OpenIE tool and renders them as strings.
	 *
	 * @param sent the sentence to process; {@code null}, empty or whitespace-only input
	 *             produces an empty list instead of an exception
	 * @return the triple strings of all extractions whose confidence is greater than or
	 *         equal to the threshold, in the order OpenIE returned them
	 * @throws IOException if the noun/verb dictionary cannot be read while adding a subject
	 */
	public static ArrayList<String> extractTriple(String sent) throws IOException {
		List<Extraction> kept = new ArrayList<>();
		extractInto(sent, kept);
		ArrayList<String> listTriple = new ArrayList<>(kept.size());
		for (Extraction extraction : kept) {
			listTriple.add(extraction.tripleString());
		}
		return listTriple;
	}

	/**
	 * Adds a dummy subject to a sentence, based on the tag and token the tagger reports
	 * for it (presumably those of the first word — confirm against StanfordPOSTagger).
	 * <ul>
	 * <li>Tag containing {@code VB}, or equal to {@code MD} or {@code RB}: the lower-cased
	 *     sentence is prefixed with {@code Global.DUMMY_SUBJECT} and a space.</li>
	 * <li>Tag containing {@code NN}, or equal to {@code JJ}: the reported token is first
	 *     replaced by its verb form via {@link #nounToVerb(String)}, then the sentence is
	 *     lower-cased and prefixed the same way.</li>
	 * <li>Any other tag: the sentence is returned unchanged.</li>
	 * </ul>
	 *
	 * @param sent the sentence to rewrite
	 * @return the sentence with the dummy subject added, or the input itself when no rule applies
	 * @throws IOException if the noun/verb dictionary cannot be read
	 */
	public static String addSubject(String sent) throws IOException {
		String tagSent = tagger.tag(sent);
		String firstTag = tagger.getTag(tagSent);
		String firstWord = tagger.getToken(tagSent);

		if (firstTag.contains("VB") || firstTag.equals("MD") || firstTag.equals("RB")) {
			return Global.DUMMY_SUBJECT + " " + sent.toLowerCase(Locale.ROOT);
		}
		if (firstTag.contains("NN") || firstTag.equals("JJ")) {
			// The sentence does not start with a verb; it may start with the noun form
			// of a verb, so try to turn that word back into its verb form.
			String remainder = sent.substring(firstWord.length());
			String rewritten = nounToVerb(firstWord) + remainder;
			return Global.DUMMY_SUBJECT + " " + rewritten.toLowerCase(Locale.ROOT);
		}
		return sent;
	}

	/**
	 * Converts a noun to its verb form using the {@code noun-verb.txt} dictionary.
	 * The dictionary is loaded on the first call that finds {@link #nvList} empty.
	 * An entry is used only when its noun equals the lower-cased input, its verb differs
	 * from the noun, and noun and verb share their first two characters.
	 *
	 * @param noun the word to convert
	 * @return the verb of the first matching dictionary entry, or {@code noun} unchanged
	 *         when no entry matches
	 * @throws IOException if the dictionary resource is missing or cannot be read
	 */
	public static String nounToVerb(String noun) throws IOException {
		if (nvList.isEmpty()) {
			loadNounVerbDictionary();
		}

		// Locale.ROOT keeps the lookup key stable regardless of the JVM's default locale.
		String key = noun.toLowerCase(Locale.ROOT);
		for (NounVerb nv : nvList) {
			String entryNoun = nv.getNoun();
			String entryVerb = nv.getVerb();
			if (entryNoun.equals(key) && !entryNoun.equals(entryVerb) && sharePrefix(entryNoun, entryVerb, 2)) {
				return entryVerb;
			}
		}
		return noun;
	}

	/**
	 * Shared extraction routine: runs OpenIE on the normalized sentence, and if no
	 * extraction reaches the threshold, retries once on the sentence with a dummy subject.
	 *
	 * @param sent the raw sentence
	 * @param kept receives the extractions whose confidence reaches the threshold
	 * @return the number of instances OpenIE returned in the pass that was finally used
	 */
	private static int extractInto(String sent, List<Extraction> kept) throws IOException {
		if (sent == null || sent.trim().isEmpty()) {
			// Nothing to extract; also avoids substring(0, 1) failing on an empty string.
			return 0;
		}
		// Keep the first character as written and lower-case the rest.
		String normalized = sent.substring(0, 1) + sent.substring(1).toLowerCase(Locale.ROOT);

		List<Instance> instances = runOpenIE(normalized);
		collectConfident(instances, kept);
		if (kept.isEmpty()) {
			// Either OpenIE returned nothing or every instance was below the threshold:
			// retry with a dummy subject added.
			instances = runOpenIE(addSubject(normalized));
			collectConfident(instances, kept);
		}
		return instances.size();
	}

	/** Runs OpenIE on a sentence and exposes the resulting Scala sequence as a Java list. */
	private static List<Instance> runOpenIE(String sent) {
		Seq<Instance> extractions = openIE.extract(sent);
		return JavaConversions.seqAsJavaList(extractions);
	}

	/** Appends to {@code kept} the extraction of every instance whose confidence reaches the threshold. */
	private static void collectConfident(List<Instance> instances, List<Extraction> kept) {
		for (Instance instance : instances) {
			if (instance.confidence() >= threshold) {
				kept.add(instance.extr());
			}
		}
	}

	/**
	 * Reads the tab-separated {@code noun-verb.txt} resource (noun, then verb, per line)
	 * into {@link #nvList}. Lines with fewer than two fields are skipped. Entries are
	 * published only after the whole file has been read, so a failure does not leave a
	 * partially filled dictionary behind.
	 */
	private static void loadNounVerbDictionary() throws IOException {
		ClassLoader classLoader = Thread.currentThread().getContextClassLoader();
		InputStream inputs = classLoader.getResourceAsStream("noun-verb.txt");
		if (inputs == null) {
			throw new IOException("Dictionary resource noun-verb.txt not found on the class path");
		}

		List<NounVerb> loaded = new ArrayList<>();
		try (BufferedReader br = new BufferedReader(new InputStreamReader(inputs, StandardCharsets.UTF_8))) {
			String currentLine;
			while ((currentLine = br.readLine()) != null) {
				String[] couple = currentLine.split("\t");
				if (couple.length < 2) {
					continue;
				}
				loaded.add(new NounVerb(couple[0], couple[1]));
			}
		}
		nvList.addAll(loaded);
	}

	/** Returns true when both strings have at least {@code length} characters and agree on them. */
	private static boolean sharePrefix(String a, String b, int length) {
		return a.length() >= length && b.length() >= length && a.regionMatches(0, b, 0, length);
	}
}