Skip to content
Permalink
0d82ff1dc4
Switch branches/tags

Name already in use

A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
Go to file
 
 
Cannot retrieve contributors at this time
465 lines (419 sloc) 15.1 KB
package kb.howtokb.taskframe.extractor;
import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.StringReader;
import java.nio.charset.StandardCharsets;
import java.sql.SQLException;
import java.util.Arrays;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import edu.stanford.nlp.util.Pair;
import kb.howtokb.utils.FileLines;
import kb.howtokb.utils.IDHelper;
import uk.ac.susx.informatics.Morpha;
public class HelperForOpenIE4Activities {
/**
 * Reads a text file into a set of its non-empty lines.
 *
 * <p>The file is decoded as UTF-8. Lines of length zero are skipped; the
 * emptiness check happens before trimming, so a whitespace-only line read
 * with {@code trim} set contributes the empty string. I/O failures are
 * reported on standard output and whatever was read so far is returned.
 *
 * @param filePath path of the file to read
 * @param toLowerCase whether each line is lower-cased (locale-independent)
 * @param trim whether leading/trailing whitespace is removed from each line
 * @return the distinct lines read; empty if the file could not be opened
 */
public static Set<String> readFileAsSet(String filePath,
        boolean toLowerCase, boolean trim) {
    Set<String> lines = new HashSet<String>();
    // try-with-resources closes the reader even when readLine() throws.
    try (BufferedReader in = new BufferedReader(new InputStreamReader(
            new FileInputStream(filePath), StandardCharsets.UTF_8))) {
        String line;
        while ((line = in.readLine()) != null) {
            if (line.isEmpty()) {
                continue;
            }
            if (trim) {
                line = line.trim();
            }
            lines.add(toLowerCase ? line.toLowerCase(Locale.ROOT) : line);
        }
    } catch (FileNotFoundException e) {
        System.out.println("File not found, in reading file as list: "
                + e.getMessage());
    } catch (IOException e) {
        System.out.println("IOException in reading file as list: "
                + e.getMessage());
    }
    return lines;
}
/*public static String headWord(String s) {
String[] words = s.split(" ");
return words[headWord(words)];
}*/
/**
 * Extracts the head word(s) of a phrase given as a single string.
 *
 * <p>The phrase is split on single spaces and the resulting tokens are
 * handed to {@code headWordStr(String[])}.
 *
 * @param s space-separated phrase
 * @return the value produced by {@code headWordStr} for the tokens of {@code s}
 * @throws SQLException propagated from {@code headWordStr}
 * @throws IOException propagated from {@code headWordStr}
 */
public static String headWord(String s) throws SQLException, IOException {
    final String[] tokens = s.split(" ");
    return headWordStr(tokens);
}
/**
 * Returns the position of the head word in a tokenised phrase.
 *
 * <p>The head is the token immediately before the first preposition that
 * occurs after some non-empty text; e.g. for "tower of hanoi" the head is
 * "tower" (index 0). When no such preposition exists the last index is
 * returned, which is -1 for an empty array.
 *
 * @param s tokens of the phrase
 * @return index of the head word within {@code s}
 */
public static int headWord(String[] s) {
    // True once any non-empty text has been consumed (a null token counts,
    // matching how it would be rendered when appended to a buffer).
    boolean sawText = false;
    int idx = 0;
    for (String token : s) {
        if (sawText && prepositions.contains(token)) {
            return idx - 1;
        }
        sawText = sawText || token == null || !token.isEmpty();
        idx++;
    }
    // No preposition after text: the last token is the head.
    return s.length - 1;
}
/**
 * Returns the head word(s) of a tokenised phrase as text.
 *
 * <p>Tokens are collected up to (not including) the first preposition that
 * follows some non-empty text, e.g. "tower of hanoi" keeps only "tower".
 * The collected text is re-split on spaces and passed to {@code wnHeadWord}.
 *
 * @param s tokens of the phrase
 * @return the value produced by {@code wnHeadWord} for the collected tokens
 * @throws SQLException propagated from {@code wnHeadWord}
 * @throws IOException propagated from {@code wnHeadWord}
 */
public static String headWordStr(String[] s) throws SQLException,
        IOException {
    StringBuilder prefix = new StringBuilder();
    for (String token : s) {
        boolean hasText = prefix.length() > 0;
        if (hasText && prepositions.contains(token)) {
            break;
        }
        if (hasText) {
            prefix.append(" ");
        }
        prefix.append(token);
    }
    String[] candidate = prefix.toString().split(" ");
    return wnHeadWord(candidate);
}
/**
 * Picks the longest suffix of the tokens that is known to
 * {@code IDHelper.WNWords}, e.g. "san francisco botanical garden" ->
 * "botanical garden".
 *
 * <p>Suffixes are tried from the full phrase down to the last token alone;
 * the first one for which {@code IDHelper.WNWords.inWN} returns non-null is
 * returned. If none matches, the last token is returned.
 *
 * @param s tokens of the input phrase
 * @return the matching suffix joined by single spaces, or the last token
 * @throws IOException propagated from the lookup
 * @throws SQLException propagated from the lookup
 */
private static String wnHeadWord(String[] s) throws SQLException,
        IOException {
    for (int start = 0; start < s.length; start++) {
        StringBuilder suffix = new StringBuilder();
        for (int k = start; k < s.length; k++) {
            if (suffix.length() > 0) {
                suffix.append(" ");
            }
            suffix.append(s[k]);
        }
        String candidate = suffix.toString();
        if (IDHelper.WNWords.inWN(candidate) != null) {
            return candidate;
        }
    }
    // No suffix was found by the lookup: fall back to the last token.
    return s[s.length - 1];
}
/************************************************************************
* Stems only head noun (e.g. will stem "schools boxes" to "schools box")
*
* @param MorphaDotPOS
* morpha code : Morpha.noun , Morpha.verb , or Morpha.any
************************************************************************/
/*public static String judiciousStemming(String phrase, int MorphaDotPOS) {
// instance e.g. Los Angeles doesn't need to be stemmed
// check if it's in WN
// things is a WN entry. how to stem things to thing? --> keep instance as it is
List<String> inWNList = new ArrayList<>();
if (phrase.indexOf(" ") > 0 && MorphaDotPOS == Morpha.noun) {
try {
inWNList = IDHelper.WNWords.inWN(phrase).second;
} catch (Exception e) {
}
}
if (MorphaDotPOS == Morpha.noun && inWNList != null
&& inWNList.size() > 0) {
return phrase;
} else {
// Only for noun
StringBuilder sb = new StringBuilder();
String[] splitted = phrase.split(" ");
if (MorphaDotPOS == Morpha.noun) {
int headWordIndex = headWord(splitted);
// The blue darts to blue darts.
for (int i = 0; i < splitted.length; i++) {
// if (!Util.isStopWord(splitted[i]))
sb.append(i > 0 ? ' ' : "");
if (i == headWordIndex)
sb.append(stem(splitted[i], MorphaDotPOS));
else
sb.append(splitted[i]);
}
} else {
for (int i = 0; i < splitted.length; i++) {
// if (!Util.isStopWord(splitted[i]))
sb.append(i > 0 ? ' ' : "").append(
stem(splitted[i], MorphaDotPOS));
}
}
if (sb.length() == 0)
sb.append(sb.length() > 0 ? ' ' : "").append(
stem(splitted[splitted.length - 1], MorphaDotPOS));
return sb.toString();
}
}*/
/**
 * Stems a space-separated phrase while leaving as much of it intact as
 * possible.
 *
 * <p>For {@code Morpha.noun}: a phrase that {@code isAnInstance} accepts is
 * returned unchanged; otherwise only the token at the index reported by
 * {@code headWord(String[])} is stemmed and all other tokens are copied
 * verbatim. For any other POS code every token is stemmed.
 *
 * <p>A phrase that yields no tokens at all (for example one consisting only
 * of spaces) is returned unchanged.
 *
 * @param phrase the phrase to stem
 * @param MorphaDotPOS Morpha POS code, e.g. {@code Morpha.noun} or
 *        {@code Morpha.verb}
 * @return the phrase with the selected token(s) stemmed
 */
public static String judiciousStemming(String phrase, int MorphaDotPOS) {
    if (MorphaDotPOS == Morpha.noun && isAnInstance(phrase)) {
        return phrase;
    }
    String[] splitted = phrase.split(" ");
    if (splitted.length == 0) {
        // String.split drops trailing empty strings, so a phrase made only
        // of spaces produces no tokens; indexing the last token would throw.
        return phrase;
    }
    StringBuilder sb = new StringBuilder();
    if (MorphaDotPOS == Morpha.noun) {
        int headWordIndex = headWord(splitted);
        // Only the head is stemmed, e.g. "blue darts" -> "blue dart".
        for (int i = 0; i < splitted.length; i++) {
            if (i > 0) {
                sb.append(' ');
            }
            sb.append(i == headWordIndex
                    ? stem(splitted[i], MorphaDotPOS)
                    : splitted[i]);
        }
    } else {
        for (int i = 0; i < splitted.length; i++) {
            if (i > 0) {
                sb.append(' ');
            }
            sb.append(stem(splitted[i], MorphaDotPOS));
        }
    }
    if (sb.length() == 0) {
        // Nothing was produced (e.g. an empty token): retry on the last token.
        sb.append(stem(splitted[splitted.length - 1], MorphaDotPOS));
    }
    return sb.toString();
}
/**
 * Stems a single word.
 *
 * <p>For {@code Morpha.noun} the irregular-plural table is consulted first;
 * if it has a non-empty singular for {@code w}, that is returned. In every
 * other case the result of {@code stemExceptWN} is returned.
 *
 * @param w the word to stem
 * @param MorphaDotPOS Morpha POS code, e.g. {@code Morpha.noun}
 * @return the stemmed form of {@code w}
 */
public static String stem(String w, int MorphaDotPOS) {
    if (MorphaDotPOS == Morpha.noun) {
        String singular = IrregularPlurals.PLURAL.getSingular(w);
        if (singular != null && !singular.isEmpty()) {
            return singular;
        }
    }
    return stemExceptWN(w, MorphaDotPOS);
}
/**
 * Tells whether the word types reported by
 * {@code IDHelper.WNWords.getWNWordTypes} consist of exactly the single
 * type {@code 'i'} (presumably "instance", going by this method's name).
 *
 * @param w word or phrase to look up
 * @return {@code true} only for a sole type {@code 'i'}; {@code false} when
 *         the lookup returns null, returns other types, or fails
 */
private static boolean isAnInstance(String w) {
    Set<Character> types;
    try {
        types = IDHelper.WNWords.getWNWordTypes(w);
    } catch (IOException | SQLException ignored) {
        // Best effort: a failed lookup is treated as "not an instance".
        return false;
    }
    return types != null && types.size() == 1 && types.contains('i');
}
/**
 * Stems a word with Morpha but keeps the stem only when the
 * {@code IDHelper.WNWords.inWN} lookup supports it.
 *
 * <p>The original word is returned when the lookup of the stem yields
 * nothing (null pair, or null/empty second component). Otherwise the stem is
 * returned if the POS code is neither noun nor verb, or if the second
 * component contains the matching tag ("n" for noun, "v" for verb); else the
 * original word is returned.
 *
 * @param w the word to stem
 * @param MorphaDotPOS Morpha POS code
 * @return the accepted stem, or {@code w} unchanged
 */
private static String stemExceptWN(String w, int MorphaDotPOS) {
    // "_s" is the placeholder that originally was 's; never stem it.
    if (w.equals("_s")) {
        return w;
    }
    String stemmed = stemMorpha(w, MorphaDotPOS);

    String pos;
    if (MorphaDotPOS == Morpha.noun) {
        pos = "n";
    } else if (MorphaDotPOS == Morpha.verb) {
        pos = "v";
    } else {
        pos = "";
    }

    Pair<String, Set<String>> inWN = null;
    try {
        inWN = IDHelper.WNWords.inWN(stemmed);
    } catch (Exception e) {
        e.printStackTrace();
    }

    if (inWN == null || inWN.second == null || inWN.second.isEmpty()) {
        return w;
    }
    boolean accepted = pos.isEmpty() || inWN.second.contains(pos);
    return accepted ? stemmed : w;
}
/**
 * Counts how often a character occurs in a string.
 *
 * @param s the text to scan
 * @param ec the character to count
 * @param shouldTrim whether {@code s} is trimmed before counting
 * @return number of occurrences of {@code ec}
 */
private static int countChar(String s, char ec, boolean shouldTrim) {
    final String text = shouldTrim ? s.trim() : s;
    int hits = 0;
    // Hop from one occurrence to the next instead of visiting every char.
    for (int at = text.indexOf(ec); at >= 0; at = text.indexOf(ec, at + 1)) {
        hits++;
    }
    return hits;
}
/*************************************************************************
 * Stems the last space-separated word of {@code w} with the Morpha lexer.
 *
 * According to the original notes, Morpha only handles noun plurals,
 * pronoun case and verb endings, not comparative adjectives or derived
 * nominals; it is based on a finite-state transducer by John Carroll et
 * al., see
 * http://www.informatics.susx.ac.uk/research/nlp/carroll/morph.html .
 *
 * Only the final word is fed to the lexer; preceding words are copied
 * unchanged (so "fighter jets" gives "fighter jet" and "fighters jets"
 * gives "fighters jet"). Any Exception or Error raised along the way is
 * swallowed and the input is returned as is.
 *
 * @param w
 *            word or phrase, e.g. fighter jets; null/empty is returned as is
 * @param morphaPOSNum
 *            lexer start state (use the Morpha static members, e.g.
 *            Morpha.noun, Morpha.verb)
 * @return w with its last word stemmed, or w itself on failure
 ************************************************************************/
private static String stemMorpha(String w, int morphaPOSNum) {
    try {
        if (w == null || w.length() == 0) {
            return w;
        }
        final boolean single = countChar(w, ' ', false) == 0;
        final String[] tokens = single ? null : w.split(" ");
        // The shared lexer is created on first use and re-targeted per call.
        if (lexer == null) {
            lexer = new Morpha(System.in);
        }
        final String target = single ? w : tokens[tokens.length - 1];
        lexer.yyreset(new StringReader(target));
        lexer.yybegin(morphaPOSNum);
        if (single) {
            return lexer.next();
        }
        StringBuilder out = new StringBuilder();
        for (int i = 0; i < tokens.length - 1; i++) {
            out.append(tokens[i]).append(" ");
        }
        return out.append(lexer.next()).toString();
    } catch (Exception e) {
        // Deliberately ignored: fall through and return the input.
    } catch (Error e) {
        // Morpha sometimes throws java.lang.Error ("could not match
        // input"); ignored as well.
    }
    return w;
}
// Shared Morpha lexer: stemMorpha creates it on first use and afterwards
// re-targets the same instance via yyreset for every call.
static Morpha lexer;
/** Forms of "be" and "have" that headVerb skips when they lead a verb phrase. */
public static Collection<String> copularVerbs = new HashSet<String>(
        Arrays.asList("be", "has", "have", "had", "is", "was", "are", "were"));
/** Articles together with a few possessive determiners. */
public static Collection<String> articles = new HashSet<String>(
        Arrays.asList("a", "an", "the", "your", "my", "our", "his", "her"));
/**
 * Prepositions consulted by headWord, headWordStr and headVerb. The literal
 * contains repeated entries; the backing set stores each only once.
 */
public static Collection<String> prepositions = new HashSet<String>(
        Arrays.asList("in", "on", "at", "with", "into", "across",
        "opposite", "toward", "towards", "through", "beyond", "aboard",
        "amid", "past", "by", "near", "nearby", "above", "below", "over",
        "under", "up", "down", "around", "through", "inside", "out", "outside",
        "outside of", "between", "beside", "besides", "beyond",
        "in front of", "in back of", "behind", "next to", "on top of",
        "within", "beneath", "underneath", "among", "along", "against",
        "aboard", "about", "above", "across", "after", "against", "along",
        "amid", "among", "anti", "around", "as", "at", "before", "behind",
        "below", "beneath", "beside", "besides", "between", "beyond",
        "but", "by", "concerning", "considering", "despite", "down",
        "during", "except", "excepting", "excluding", "following", "for",
        "from", "in", "inside", "into", "in front of", "like", "minus",
        "near", "of", "off", "on", "onto", "opposite", "outside", "over",
        "past", "per", "plus", "regarding", "round", "save", "since",
        "than", "through", "to", "toward", "towards", "under",
        "underneath", "unlike", "until", "up", "upon", "versus", "via",
        "with", "within", "without"));
/** Modal auxiliaries that headVerb skips when they lead a verb phrase. */
public static Collection<String> MODAL_VERBS = new HashSet<String>(
        Arrays.asList("can", "could", "may", "might", "will", "would",
        "must", "shall", "should", "ought to"));
/**
 * Stop words; headVerb skips a leading token found here. The literal is two
 * lists appended to each other, so many entries repeat; the backing set
 * stores each only once.
 *
 * "ours" and "ourselves" are separate entries: they used to be fused into
 * the single string "ours ourselves", which no single token could match.
 */
public static Collection<String> STOPWORDS = (new HashSet<String>(Arrays
        .asList(new String[] {"a", "able", "about", "across", "after", "all",
        "almost", "also", "always", "am", "among", "an", "and", "another",
        "any", "are", "as", "at", "be", "because", "been", "before",
        "being", "but", "by", "can", "cannot", "could", "dear", "did",
        "do", "does", "either", "else", "ever", "every", "few", "for",
        "from", "get", "got", "had", "has", "have", "he", "her", "here",
        "hers", "him", "his", "how", "however", "i", "if", "in", "into",
        "is", "it", "its", "just", "least", "let", "like", "likely", "lrb",
        "many", "may", "me", "might", "mine", "more", "most", "much",
        "must", "my", "neither", "no", "none", "nor", "not", "nothing",
        "now", "nt", "of", "off", "often", "on", "only", "or", "other",
        "our", "ours", "own", "per", "rather", "rrb", "said", "say",
        "says", "she", "should", "since", "so", "some", "somehow", "still",
        "such", "than", "that", "the", "their", "theirs", "them", "then",
        "there", "these", "they", "this", "those", "though", "tis", "to",
        "too", "twas", "u", "us", "very", "want", "wants", "was", "we",
        "were", "what", "when", "where", "which", "while", "who", "whom",
        "why", "will", "with", "would", "www", "yet", "you", "your",
        "yours", "yourss", "'m", "'ll", "a", "about", "above", "after",
        "again", "against", "all", "am", "an", "and", "any", "are", "as",
        "at", "be", "because", "been", "before", "being", "below",
        "between", "both", "but", "by", "cannot", "could", "did", "do",
        "does", "dont", "doesnt", "cant",
        "doing", "down", "during", "each", "few", "for", "from",
        "further", "had", "has", "have", "having", "he", "her", "here",
        "hers", "herself", "him", "himself", "his", "how", "however", "i",
        "if", "in", "into", "is", "it", "its", "itself", "let", "lrb",
        "me", "more", "most", "must", "my", "myself", "no", "nor", "not",
        "of", "off", "on", "once", "only", "or", "other", "ought", "our",
        "ours", "ourselves", "out", "over", "own", "rrb", "same", "sha",
        "she", "should", "so", "some", "such", "than", "that", "the",
        "their", "theirs", "them", "themselves", "then", "there", "these",
        "they", "this", "those", "through", "to", "too", "under", "until",
        "up", "very", "was", "we", "were", "what", "when", "where",
        "which", "while", "who", "who", "whom", "why", "why", "with", "wo",
        "would", "would", "you", "you", "you", "you", "you", "your",
        "yours", "yourself", "yourselves"})));
/**
 * Returns the position of the head verb within a tokenised verb phrase.
 *
 * <p>Rules, applied in order:
 * <ol>
 * <li>If a preposition follows some non-empty text and is not the final
 * token, the token right after it is the head ("begin to peel" -> index 2).
 * A phrase-final preposition is ignored here ("heat up").</li>
 * <li>If the phrase has more than one token and starts with a copular verb,
 * modal verb or stop word, that token is skipped and the remainder is
 * analysed recursively ("had turned" -> index 1).</li>
 * <li>Otherwise the last token is the head when it ends in "ing"
 * ("continue peeling"), else the first token is ("take out").</li>
 * </ol>
 *
 * @param s tokens of the head-verb candidate; must not be empty
 * @return index of the head verb within {@code s}
 */
public static int headVerb(String[] s) {
    final int last = s.length - 1;

    // True once any non-empty text has been consumed (a null token counts,
    // matching how it would be rendered when appended to a buffer).
    boolean sawText = false;
    for (int pos = 0; pos <= last; pos++) {
        String token = s[pos];
        if (sawText && pos != last && prepositions.contains(token)) {
            return pos + 1;
        }
        sawText = sawText || token == null || !token.isEmpty();
    }

    if (s.length > 1) {
        String first = s[0];
        boolean skippable = copularVerbs.contains(first)
                || MODAL_VERBS.contains(first)
                || STOPWORDS.contains(first);
        if (skippable) {
            return 1 + headVerb(Arrays.copyOfRange(s, 1, s.length));
        }
    }

    // A final "-ing" form wins; otherwise take the first token.
    return s[last].endsWith("ing") ? last : 0;
}
/**
 * Singleton lookup table from irregular plural nouns to their singular
 * form (e.g. "women" -> "woman"), loaded once from
 * {@code resources/irregular-plurals.txt}, one tab-separated
 * {@code plural<TAB>singular} pair per line.
 */
private static enum IrregularPlurals {
    PLURAL();

    private final Map<String, String> pluralToSingularMap;

    private IrregularPlurals() {
        this.pluralToSingularMap = loadPlurals();
    }

    /**
     * Reads the plural/singular pairs from the resource file.
     *
     * <p>Lines without at least two tab-separated columns are skipped, so a
     * single malformed line no longer discards every entry after it. If the
     * file cannot be read, the failure is reported on standard error and the
     * entries loaded so far (possibly none) are kept.
     *
     * @return mapping from plural to singular; never null
     */
    private Map<String, String> loadPlurals() {
        Map<String, String> mapping = new HashMap<>();
        try {
            for (String line : new FileLines("resources/irregular-plurals.txt")) {
                String[] columns = line.split("\t");
                if (columns.length < 2) {
                    continue;
                }
                mapping.put(columns[0], columns[1]);
            }
        } catch (Exception e) {
            // Best effort: stemming still works without the table, but the
            // problem should be visible instead of silently ignored.
            System.err.println(
                    "Could not load resources/irregular-plurals.txt: " + e);
        }
        return mapping;
    }

    /**
     * Looks up the singular of an irregular plural.
     *
     * @param pluralNoun plural form to look up
     * @return the singular form, or null if the plural is not in the table
     */
    public String getSingular(String pluralNoun) {
        return pluralToSingularMap.get(pluralNoun);
    }
}
}