Skip to content
Permalink
0d82ff1dc4
Switch branches/tags

Name already in use

A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
Go to file
 
 
Cannot retrieve contributors at this time
379 lines (337 sloc) 11.7 KB
package kb.howtokb.taskframe.extractor;
import java.io.IOException;
import java.sql.SQLException;
import java.util.Collection;
import java.util.Locale;
import java.util.regex.Pattern;
import kb.howtokb.global.Global;
import kb.howtokb.utils.Triple;
import uk.ac.susx.informatics.Morpha;
/**
 * Wrapper over OpenIE tuple-based results. It checks the semantic role labels
 * (SRLs) for appropriateness and normalizes the tuples, reducing noun phrases
 * to their head noun and verb phrases to their head verb.
 * <p>
 * Sample call:
 * {@code normalizeOpenIEResult("(You; Categorize; a wikiHow Article)", ";", true);}
 *
 * @author ntandon
 *
 * TODO: create test cases based on email exchange
 */
public class OpenIE4Activities {

    /**
     * Normalizes a verb (phrase): stems it, optionally reduces it to its head
     * verb, and filters out infrequent verbs and stop words.
     *
     * @param v
     *            the verb or verb phrase to normalize
     * @param pickHeadWords
     *            if true and the stemmed phrase is not itself a frequent verb,
     *            keep only the word selected by
     *            {@code HelperForOpenIE4Activities.headVerb}
     * @param freqCheck
     *            if true, reject verbs that are not in the frequent-verb list
     * @return the normalized verb, or the empty string if it was rejected by
     *         the frequency check or is a stop word
     */
    public static String normalizeV(String v, boolean pickHeadWords, boolean freqCheck) {
        v = HelperForOpenIE4Activities.judiciousStemming(v, Morpha.verb);
        // A phrase that is itself listed as a frequent verb is kept whole.
        if (pickHeadWords && !isCommonVerb(v)) {
            String[] verbWords = v.split(" ");
            v = verbWords[HelperForOpenIE4Activities.headVerb(verbWords)];
        }
        // Open IE attracts a lot of noise, remove it by checking if the verb
        // is frequent w.r.t. WordNet
        if (freqCheck && !isCommonVerb(v)) {
            return "";
        }
        return isStopWord(v) ? "" : v;
    }

    /**
     * Normalizes an object (noun phrase): stems it, drops leading articles,
     * optionally reduces it to its head word, and filters out infrequent nouns
     * and stop words.
     *
     * @param o
     *            the object phrase to normalize
     * @param pickHeadWords
     *            if true, reduce the phrase via
     *            {@code HelperForOpenIE4Activities.headWordStr}
     * @param freqCheck
     *            if true, reject nouns that are not in the frequent-noun list
     * @return the normalized object, or the empty string if it was rejected by
     *         the frequency check or is a stop word
     * @throws IOException
     *             if head-word extraction fails; the underlying exception is
     *             attached as the cause
     */
    public static String normalizeO(String o, boolean pickHeadWords, boolean freqCheck) throws IOException {
        o = HelperForOpenIE4Activities.judiciousStemming(o, Morpha.noun);
        o = dropLeadingArticles(o);
        if (pickHeadWords) {
            try {
                o = HelperForOpenIE4Activities.headWordStr(o.split(" "));
            } catch (SQLException | IOException e) {
                // Keep the original failure as the cause so it can be diagnosed.
                throw new IOException("Some of the WordNet related files are not set, check NLPUtil", e);
            }
        }
        // Open IE attracts a lot of noise, remove it by checking if the noun
        // is frequent w.r.t. WordNet
        if (freqCheck && !isCommonNoun(o)) {
            return "";
        }
        return isStopWord(o) ? "" : o;
    }

    /**
     * Input:(You; Categorize; a wikiHow Article) <br>
     * Output:you;categorize;article
     *
     * @param input
     *            tuple format result from openIE4.2
     * @param SVOSeparator
     *            separator between tuple components, usually semicolon(;); it
     *            is treated as a literal string, not as a regular expression
     * @param pickHeadWords
     *            usually set to true; enables extracting the head word from
     *            the verb and object arguments of openIE
     * @param freqCheck
     *            if true, the noun and verb must be frequent enough
     * @return normalized result, or the empty string if the triple contains no
     *         meaningful activity (including tuples with fewer than three
     *         components)
     * @throws IOException
     *             WordNet files must be appropriately loaded
     */
    public static String normalizeOpenIEResult(String input, String SVOSeparator, boolean pickHeadWords,
            boolean freqCheck) throws IOException {
        Triple<String, String, String> voTriple = voFromOpenIETriple(input, SVOSeparator);
        String vo = voTriple.second;
        int voBoundary = vo.indexOf(SVOSeparator);
        // No separator, or an empty verb in front of it: nothing to normalize.
        if (voBoundary <= 0) {
            return "";
        }
        String v = vo.substring(0, voBoundary);
        // Skip the whole separator, which may be longer than one character.
        String o = vo.substring(voBoundary + SVOSeparator.length());
        String[] owords = o.split(" ");
        if (owords.length == 0) {
            return "";
        }
        // A leading preposition belongs with the verb ("look at; picture").
        String prep = "";
        if (startsWithPreposition(owords)) {
            prep = owords[0];
            o = dropLeadingPrep(owords);
        }
        if (o.length() == 0) {
            return "";
        }
        o = normalizeO(o, pickHeadWords, freqCheck);
        v = normalizeV(v, pickHeadWords, freqCheck);
        // Either part may have been rejected as noise by the normalizers.
        if (o.length() == 0 || v.length() == 0) {
            return "";
        }
        StringBuilder result = new StringBuilder();
        result.append(voTriple.first).append(SVOSeparator).append(v);
        if (prep.length() > 0) {
            result.append(' ').append(prep);
        }
        result.append(SVOSeparator).append(o);
        if (voTriple.third.length() > 0) {
            result.append(SVOSeparator).append(voTriple.third);
        }
        return result.toString();
    }

    /**
     * Same as
     * {@linkplain #normalizeOpenIEResult(String, String, boolean, boolean)}
     * but without the frequency check.
     */
    public static String normalizeOpenIEResult(String input, String SVOSeparator, boolean pickHeadWords)
            throws IOException {
        return normalizeOpenIEResult(input, SVOSeparator, pickHeadWords, false);
    }

    /**
     * Re-estimates the subject, the verb and main object, and the trailing
     * objects from an OpenIE result.
     *
     * @param tupleFromOpenIE
     *            the raw tuple, e.g. (You; Categorize; a wikiHow Article)
     * @param sVOSeparator
     *            literal separator between the tuple components
     * @return a triple whose first element is the subject, whose second
     *         element is the verb and main object joined by the separator, and
     *         whose third element holds the remaining normalized roles joined
     *         by the separator (possibly empty); all three elements are empty
     *         if the tuple has fewer than three components
     */
    private static Triple<String, String, String> voFromOpenIETriple(String tupleFromOpenIE, String sVOSeparator) {
        // Locale-independent lower-casing, then clean up the tuple.
        String input = tupleFromOpenIE.toLowerCase(Locale.ROOT).replace("(", "").replace(")", "");
        // Quote the separator: String.split interprets its argument as a regex.
        String[] components = input.split(Pattern.quote(sVOSeparator));
        if (components.length < 3) {
            // Malformed tuple; the empty second element makes callers return "".
            return new Triple<String, String, String>("", "", "");
        }
        String lead = contentOfRole(components[0]); // the subject
        String svo = contentOfRole(components[1]) + sVOSeparator + contentOfRole(components[2]);
        StringBuilder trail = new StringBuilder(); // the several objects/ SRLs
        for (int i = 3; i < components.length; i++) {
            String normSRL = normalizeSRL(components[i].trim());
            if (normSRL.length() > 0) {
                if (trail.length() > 0) {
                    trail.append(sVOSeparator);
                }
                trail.append(normSRL);
            }
        }
        return new Triple<String, String, String>(lead, svo, trail.toString());
    }

    /**
     * Takes a time or location role+word and normalizes it.
     *
     * @param srl
     *            e.g. l:your brother's room
     * @return the input unchanged if it carries no {@code t:} or {@code l:}
     *         marker; the normalized role otherwise; the empty string (len=0)
     *         if the role is inappropriate (e.g. t:at that road is
     *         inappropriate)
     */
    private static String normalizeSRL(String srl) {
        boolean isTime = srl.startsWith("t:");
        boolean isLoc = srl.startsWith("l:");
        // Return if no SRL marker is present
        if (!isTime && !isLoc) {
            return srl;
        }
        // Drop the SRL marker and surrounding whitespace, as contentOfRole does.
        String content = srl.substring(2).trim();
        // Covers both a correctly labeled time and a time that was wrongly
        // labeled as location.
        if (isAppropriateAsTime(content)) {
            return "t:" + content;
        }
        if (isLoc && isAppropriateAsLoc(content)) {
            return "l:" + normalizeLocation(content);
        }
        return ""; // neither a valid time nor a valid location
    }

    /**
     * Checks if the content of an OpenIE location role is appropriate: it must
     * start with a preposition.
     *
     * @param s
     *            role content without the {@code l:} marker, e.g. in your
     *            house
     * @return true if the input indicates location
     */
    private static boolean isAppropriateAsLoc(String s) {
        return startsWithPreposition(s.split(" "));
    }

    /**
     * Normalizes a location string that can be arbitrarily long. The leading
     * preposition is kept; the remainder has its leading articles dropped and
     * is stemmed as a noun.
     *
     * @param l
     *            location phrase starting with a preposition, e.g. at the room
     * @return the preposition followed by the normalized remainder, e.g. at
     *         room; only the preposition if nothing else remains
     */
    private static String normalizeLocation(String l) {
        String[] ls = l.split(" ");
        String rest = dropLeadingPrep(ls);
        rest = dropLeadingArticles(rest);
        rest = HelperForOpenIE4Activities.judiciousStemming(rest, Morpha.noun);
        // Avoid a dangling trailing space when only the preposition is left.
        if (rest == null || rest.isEmpty()) {
            return ls[0];
        }
        return ls[0] + " " + rest;
    }

    // TODO can set the noun wordnet-frequency threshold to >=0 or >=1.
    /** Time-related words, see {@link #isAppropriateAsTime(String)}. */
    private static final Collection<String> wnTime = HelperForOpenIE4Activities
            .readFileAsSet("resources/time-related.txt", false, false);
    /** Frequent verbs, see {@link #isCommonVerb(String)}. */
    private static final Collection<String> freqverbs = HelperForOpenIE4Activities
            .readFileAsSet("resources/freqgt0-verbs-wn.txt", false, false);
    /** Frequent nouns, see {@link #isCommonNoun(String)}. */
    private static final Collection<String> freqnouns = HelperForOpenIE4Activities
            .readFileAsSet("resources/freqgt0-nouns-wn.txt", false, false);
    /** Living-being nouns, see {@link #isLivingBeing(String)}. */
    private static final Collection<String> livingBeings = HelperForOpenIE4Activities
            .readFileAsSet("resources/freqgte1-living-beings.txt", false, false);
    // Database-backed alternative to the resource files above:
    // private static Collection<String> wnTime = DBConnector.singletonColumnToSet(1, "select * from "+Global.WNTIME),
    // freqverbs = DBConnector.singletonColumnToSet(1, "select * from "+Global.WNVERB),
    // freqnouns = DBConnector.singletonColumnToSet(1, "select * from "+Global.WNNOUN),
    // livingBeings = DBConnector.singletonColumnToSet(1, "select * from "+Global.WNAGENT);

    /**
     * Time is usually a closed set of WordNet words, look up that list <br>
     * <p>
     * This query checks for inherited hyponymy for time unit, time period and
     * event synsets:
     *
     * <PRE>
     select regexp_replace(word,'_',' ') from nounid where synsetid in
     (select distinct on (synset_id) synset_id from wordnet.wn_hyponymy_tree_complete
     where hyponymy like '%,100029378%' or hyponymy like '%,115154774%' or
     hyponymy like '%,115113229%' ) intersect select distinct(word)
     from wordnet.wn_synsets where tag_count >=1 and ss_type='n' and lower(word)=word
     * </PRE>
     *
     * @param s
     *            role content without the {@code t:} marker, e.g. during
     *            easter
     * @return true if the input starts with a preposition and the remainder is
     *         a time related word
     */
    private static boolean isAppropriateAsTime(String s) {
        String[] owords = s.split(" ");
        // there must be a preposition marker otherwise time is ill-defined.
        return startsWithPreposition(owords) && wnTime.contains(dropLeadingPrep(owords));
    }

    /**
     * Checks whether the first word of a phrase is a known preposition.
     *
     * @param words
     *            the phrase split into words; may be empty
     * @return true if there is a first word and it is a preposition
     */
    private static boolean startsWithPreposition(String[] words) {
        // " ".split(" ") yields an empty array, so guard the index.
        return words.length > 0 && HelperForOpenIE4Activities.prepositions.contains(words[0]);
    }

    /**
     * Input = [during, easter] Output = easter
     *
     * @param words
     *            an array is usually precomputed, avoids splitting the string
     *            twice.
     * @return all words except the first, joined by single spaces; the empty
     *         string if there are fewer than two words
     */
    private static String dropLeadingPrep(String[] words) {
        StringBuilder o = new StringBuilder();
        for (int i = 1; i < words.length; i++) {
            o.append(words[i]);
            if (i < words.length - 1) {
                o.append(' ');
            }
        }
        return o.toString();
    }

    /**
     * Input = t:during easter Output = during easter
     *
     * @param s
     *            (role label and content from OpenIE's preposition based SRL)
     * @return the trimmed content with a leading {@code l:} or {@code t:}
     *         marker removed
     */
    private static String contentOfRole(String s) {
        // Trim first: components split from "a; l:b" carry a leading space
        // that would otherwise hide the marker.
        String role = s.trim();
        // drop t: or l:
        if (role.startsWith("l:") || role.startsWith("t:")) {
            role = role.substring(2).trim();
        }
        return role;
    }

    /**
     * FreqVerbs are preloaded from files constructed using this query.
     *
     * <PRE>
     *
     select distinct(word) from wordnet.wn_synsets where tag_count >=1 and // can consider higher cutoff
     (ss_type='v') and // verbs
     lower(word)=word // this avoids instances like Angela Merkel to show up as a noun
     * </PRE>
     *
     * Note: Additional constraint for verb in constructing the file from
     * WordNet:<br>
     * length(word)>1 // don't need len=1 weird words
     *
     * @param v
     *            a verb e.g. move
     * @return true if the verb is acceptable.
     */
    private static boolean isCommonVerb(String v) {
        return freqverbs.contains(v);
    }

    /**
     * Similar to {@link #isCommonVerb(String)}, except that restriction on noun
     * length >2
     *
     * @param n
     *            a noun e.g. tree (not a phrase because usually this function
     *            is invoked as the last stage of normalization)
     * @return true if the noun is acceptable.
     */
    private static boolean isCommonNoun(String n) {
        return freqnouns.contains(n);
    }

    /**
     * Uses wordnet tops (top level category against a synset) and >=1 freq
     * threshold and len >2 to construct a list of living beings nouns.<br>
     * <br>
     * select word from wordnet.wn_synsets where tag_count >=1 and ss_type='n'
     * and length(word)>2 and lower(word)=word and synset_id in( select
     * synset_id from wordnet.wn_domain_tops where tops='animal' or
     * tops='person'
     *
     * <br>
     * <br>
     * NOTE: plant is another top level category but we don't consider it living
     * for this purpose because in the KB, people would not expect plants to be
     * living agents
     *
     * @param n
     *            e.g. fish
     * @return true if living being
     */
    public static boolean isLivingBeing(String n) {
        return livingBeings.contains(n);
    }

    /**
     * Input = a lion, Output = lion
     *
     * @param w
     *            a phrase; may be null or empty
     * @return the phrase without its leading articles; the empty string for
     *         null or empty input
     */
    private static String dropLeadingArticles(String w) {
        if (w == null || w.isEmpty()) {
            return "";
        }
        StringBuilder result = new StringBuilder();
        for (String word : w.split(" ")) {
            // Articles are skipped only until the first word has been kept.
            if (result.length() == 0 && isArticle(word)) {
                continue;
            }
            if (result.length() > 0) {
                result.append(' ');
            }
            result.append(word);
        }
        return result.toString();
    }

    /**
     * Input = a/an/the/his/her (see
     * {@code HelperForOpenIE4Activities.articles}).., Output= true
     */
    private static boolean isArticle(String s) {
        return HelperForOpenIE4Activities.articles.contains(s);
    }

    /** Checks whether the given word is a stop word. */
    private static boolean isStopWord(String v) {
        return HelperForOpenIE4Activities.STOPWORDS.contains(v);
    }
}