Skip to content
Permalink
3b99e145f5
Switch branches/tags

Name already in use

A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
Go to file
 
 
Cannot retrieve contributors at this time
405 lines (365 sloc) 13.4 KB
package utils;
import java.io.IOException;
import java.io.StringReader;
import java.sql.SQLException;
import java.util.Arrays;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;
import uk.ac.susx.informatics.Morpha;
import util.FileLines;
import util.Util;
import util.WordListEnum;
import util.IDHelper.WNWords;
import util.Pair;
public class NLPUtils {
/*
* public static String headWord(String s) { String[] words = s.split(" ");
* return words[headWord(words)]; }
*/
/**
 * Convenience overload: splits the phrase on single spaces and delegates to
 * {@link #headWordStr(String[])}.
 *
 * @param s
 *            phrase whose words are separated by single spaces
 * @return whatever {@code headWordStr} returns for the resulting tokens
 * @throws SQLException
 *             propagated from {@code headWordStr}
 * @throws IOException
 *             propagated from {@code headWordStr}
 */
public static String headWord(String s) throws SQLException, IOException {
    String[] tokens = s.split(" ");
    return headWordStr(tokens);
}
/**
 * Returns the position of the head word of a tokenized phrase.
 * <b>In any case returns exactly one position; it cannot return a phrase.</b>
 *
 * <p>
 * Tokens are scanned left to right. At the first preposition that follows
 * at least one non-empty token:
 * <ul>
 * <li>if {@code Util.rangeWords(0, i, s)} (the leading words, e.g.
 * "couple of") is listed in {@code WordListEnum.PLURAL_QUANTIFIER}, the
 * position right after the preposition is returned
 * ("couple of sketch pads" -> position of "sketch");</li>
 * <li>otherwise the position right before the preposition is returned
 * ("tower of hanoi" -> position of "tower").</li>
 * </ul>
 * When no such preposition exists the last position is returned.
 *
 * @param s
 *            tokens of the phrase
 * @return index of the head word; never larger than {@code s.length - 1},
 *         and {@code -1} only when {@code s} is empty
 */
public static int headWord(String[] s) {
    // True once a non-empty token has been passed; a preposition in the very
    // first position therefore never triggers the rules below.
    boolean seenText = false;
    for (int i = 0; i < s.length; i++) {
        String w = s[i];
        if (seenText && WordListEnum.PREPOSITIONS.contains(w)) {
            if (WordListEnum.PLURAL_QUANTIFIER.contains(Util.rangeWords(0, i, s))) {
                // couple of sketch pads => head is after the preposition.
                // Clamp so that a quantifier ending the phrase ("couple of")
                // still yields a valid index instead of s.length.
                return Math.min(i + 1, s.length - 1);
            }
            // tower of hanoi => head is the word preceding "of".
            return i - 1;
        }
        seenText = seenText || (w != null && !w.isEmpty());
    }
    // Fallback to the last position when no preposition rule fired.
    return s.length - 1;
}
/**
 * Builds the head-word candidate of a tokenized phrase, stems it with
 * {@code Morpha.any} and passes the stemmed tokens to
 * {@link #wnHeadWord(String[])}.
 *
 * <p>
 * Tokens are collected left to right until the first preposition that
 * follows at least one non-empty token ("tower of hanoi" -> "tower"). If
 * the words from the start up to that preposition are a plural quantifier
 * ("couple of sketch pads"), the collected words are dropped and the words
 * after the preposition ("sketch pads") become the candidate. Scanning
 * always stops at that preposition.
 *
 * @param s
 *            tokens of the phrase
 * @return result of {@code wnHeadWord} for the stemmed candidate
 * @throws SQLException
 *             propagated from {@code wnHeadWord}
 * @throws IOException
 *             propagated from {@code wnHeadWord}
 */
public static String headWordStr(String[] s) throws SQLException, IOException {
    StringBuilder candidate = new StringBuilder();
    for (int pos = 0; pos < s.length; pos++) {
        String token = s[pos];
        boolean stopHere = WordListEnum.PREPOSITIONS.contains(token)
                && candidate.length() > 0;
        if (!stopHere) {
            if (candidate.length() != 0) {
                candidate.append(" ");
            }
            candidate.append(token);
            continue;
        }
        // Check the words from 0 to pos (e.g. "couple of") against the
        // plural quantifiers.
        if (WordListEnum.PLURAL_QUANTIFIER.contains(Util.rangeWords(0, pos, s))) {
            // couple of sketch pads => "sketch pads" is the candidate.
            candidate.setLength(0);
            candidate.append(Util.rangeWords(pos + 1, s.length - 1, s));
        }
        // The candidate is complete either way.
        break;
    }
    // sketch pads => sketch pad. Verbs go through headVerb(), so stem with
    // Morpha.any here.
    String stemmed = stem(candidate.toString(), Morpha.any);
    return wnHeadWord(stemmed.split(" "));
}
/**
 * Picks the longest trailing part of the phrase that {@code WNWords.inWN}
 * recognises, e.g. san francisco botanical garden -> botanical garden.
 *
 * <p>
 * Trailing parts are tried from the whole phrase down to the last token
 * alone; the first one for which {@code WNWords.inWN} returns non-null
 * wins.
 *
 * @param s
 *            input phrase, one token per element
 * @return the first recognised trailing phrase (tokens joined by a single
 *         space), or the last token when none is recognised
 * @throws IOException
 *             propagated from {@code WNWords.inWN}
 * @throws SQLException
 *             propagated from {@code WNWords.inWN}
 */
private static String wnHeadWord(String[] s) throws SQLException, IOException {
    int start = 0;
    while (start < s.length) {
        StringBuilder suffix = new StringBuilder();
        for (int k = start; k < s.length; k++) {
            // A separator is only added once something has been collected.
            if (suffix.length() > 0) {
                suffix.append(" ");
            }
            suffix.append(s[k]);
        }
        String candidate = suffix.toString();
        if (WNWords.inWN(candidate) != null) {
            return candidate;
        }
        start++;
    }
    // Nothing was recognised: fall back to the last token.
    return s[s.length - 1];
}
/************************************************************************
 * Stems a space-separated phrase.
 * <ul>
 * <li>{@code Morpha.noun}: stems only the head noun, i.e. the token at the
 * position returned by {@link #headWord(String[])} (e.g. will stem
 * "schools boxes" to "schools box"); all other tokens are copied as-is.</li>
 * <li>any other code: stems every token.</li>
 * </ul>
 *
 * @param phrase
 *            words separated by single spaces
 * @param MorphaDotPOS
 *            morpha code : Morpha.noun , Morpha.verb , or Morpha.any
 * @return the phrase with the selected tokens replaced by
 *         {@link #stem(String, int)} output; {@code phrase} unchanged when
 *         it contains no tokens (only spaces)
 ************************************************************************/
public static String judiciousStemming(String phrase, int MorphaDotPOS) {
    String[] splitted = phrase.split(" ");
    if (splitted.length == 0) {
        // A phrase made only of spaces splits into nothing: nothing to stem.
        return phrase;
    }
    boolean headOnly = MorphaDotPOS == Morpha.noun;
    // Only needed for nouns; -1 never matches a token position.
    int headWordIndex = headOnly ? headWord(splitted) : -1;
    StringBuilder sb = new StringBuilder();
    for (int i = 0; i < splitted.length; i++) {
        if (i > 0) {
            sb.append(' ');
        }
        if (!headOnly || i == headWordIndex) {
            sb.append(stem(splitted[i], MorphaDotPOS));
        } else {
            // The blue darts => only "darts" is stemmed.
            sb.append(splitted[i]);
        }
    }
    return sb.toString();
}
/****
 * Stems a word or phrase for the given Morpha part-of-speech code.
 *
 * <p>
 * For {@code Morpha.noun} the irregular-plural table
 * ({@code IrregularPlurals.PLURAL}) is consulted first. When that yields
 * nothing (null or empty), or for any other code, the result of
 * {@code stemExceptWN} is returned.
 *
 * @param w
 *            text to stem
 * @param MorphaDotPOS
 *            morpha code : Morpha.noun , Morpha.verb , or Morpha.any
 * @return the singular from the irregular-plural table, otherwise the
 *         result of {@code stemExceptWN}
 */
public static String stem(String w, int MorphaDotPOS) {
    String irregular = (MorphaDotPOS == Morpha.noun)
            ? IrregularPlurals.PLURAL.getSingular(w)
            : null;
    boolean resolved = irregular != null && !irregular.isEmpty();
    return resolved ? irregular : stemExceptWN(w, MorphaDotPOS);
}
/**
 * Stems {@code w} with {@code stemMorpha} and keeps the stemmed form only
 * when {@code WNWords.inWN} knows it; otherwise {@code w} is returned
 * unchanged.
 *
 * <p>
 * The stemmed form is accepted when the lookup returns a non-empty
 * {@code second} set and either no tag is requested (code is neither
 * {@code Morpha.noun} nor {@code Morpha.verb}) or the set contains the
 * requested tag ("n" for noun, "v" for verb). The set presumably holds the
 * parts of speech of the entry -- confirm against {@code WNWords}.
 *
 * @param w
 *            text to stem; "_s" is returned untouched
 * @param MorphaDotPOS
 *            morpha code : Morpha.noun , Morpha.verb , or Morpha.any
 * @return the accepted stemmed form, or {@code w}
 */
private static String stemExceptWN(String w, int MorphaDotPOS) {
    // Exception for _s, which originally was 's.
    if (w.equals("_s")) {
        return w;
    }
    String candidate = stemMorpha(w, MorphaDotPOS);

    Pair<String, Set<String>> lookup = null;
    try {
        lookup = WNWords.inWN(candidate);
    } catch (Exception e) {
        e.printStackTrace();
    }
    if (lookup == null || lookup.second == null || lookup.second.isEmpty()) {
        return w;
    }

    final String posTag;
    if (MorphaDotPOS == Morpha.noun) {
        posTag = "n";
    } else if (MorphaDotPOS == Morpha.verb) {
        posTag = "v";
    } else {
        posTag = "";
    }
    boolean accepted = posTag.isEmpty() || lookup.second.contains(posTag);
    return accepted ? candidate : w;
}
/**
 * Counts how often a character occurs in a string.
 *
 * @param s
 *            text to scan
 * @param ec
 *            character to count
 * @param shouldTrim
 *            when true, leading and trailing whitespace is removed with
 *            {@code trim()} before counting
 * @return number of occurrences of {@code ec}
 */
private static int countChar(String s, char ec, boolean shouldTrim) {
    String text = shouldTrim ? s.trim() : s;
    int hits = 0;
    for (int idx = 0; idx < text.length(); idx++) {
        if (text.charAt(idx) == ec) {
            hits++;
        }
    }
    return hits;
}
/*************************************************************************
 * Stems the last word of {@code w} with the Morpha lexer and leaves all
 * preceding words untouched.
 *
 * <p>
 * Morpha only does noun plurals, pronoun case, and verb endings, and not
 * things like comparative adjectives or derived nominals. It is based on a
 * finite-state transducer implemented by John Carroll et al., written in
 * flex and publicly available. See:
 * http://www.informatics.susx.ac.uk/research/nlp/carroll/morph.html .
 *
 * <p>
 * The lexer instance is created on first use and kept in the static
 * {@code lexer} field; no synchronization is performed here. Anything the
 * lexer throws (Morpha sometimes throws an {@code Error}, e.g. "could not
 * match input") is swallowed and the input is returned unchanged.
 *
 * @param w
 *            e.g. fighter jets
 * @param morphaPOSNum
 *            (Use Morpha. static members) e.g. Morpha.noun, Morpha.any
 * @return fighter jet (note that fighters jets returns fighters jet);
 *         {@code w} itself when it is null, empty, or stemming failed
 ************************************************************************/
private static String stemMorpha(String w, int morphaPOSNum) {
    try {
        if (w == null || w.length() == 0) {
            return w;
        }
        boolean multiWord = countChar(w, ' ', false) > 0;
        String[] parts = multiWord ? w.split(" ") : null;
        if (lexer == null) {
            lexer = new Morpha(System.in);
        }
        // Only the last word is fed to the lexer.
        String target = multiWord ? parts[parts.length - 1] : w;
        lexer.yyreset(new StringReader(target));
        lexer.yybegin(morphaPOSNum);
        if (!multiWord) {
            return lexer.next();
        }
        StringBuilder rebuilt = new StringBuilder();
        for (int k = 0; k + 1 < parts.length; k++) {
            rebuilt.append(parts[k]).append(" ");
        }
        return rebuilt.append(lexer.next()).toString();
    } catch (Exception | Error ignored) {
        // Deliberately ignored: the unstemmed input is returned below.
        // Sometimes Morpha throws Error!
        // Exception in thread "main" java.lang.Error: Error: could not
        // match input
    }
    return w;
}
/** Morpha lexer shared by all calls of stemMorpha; created on first use. */
static Morpha lexer;
/**
 * Forms of "be" and "have"; headVerb() skips a leading token found here.
 */
public static Collection<String> copularVerbs = new HashSet<>(Arrays.asList(
        "be", "has", "have", "had", "is", "was", "are", "were"));
/** Articles and possessive determiners. */
public static Collection<String> articles = new HashSet<>(Arrays.asList(
        "a", "an", "the", "your", "my", "our", "his", "her"));
/**
 * Prepositions consulted by headVerb(). The literal list contains repeated
 * entries (harmless in a set) and some multi-word entries such as
 * "in front of".
 */
public static Collection<String> prepositions = new HashSet<>(Arrays.asList(
        "in", "on", "at", "with", "into", "across",
        "opposite", "toward", "towards", "through", "beyond", "aboard",
        "amid", "past", "by", "near", "nearby", "above", "below", "over",
        "under", "up", "down", "around", "through", "inside", "out",
        "outside", "outside of", "between", "beside", "besides", "beyond",
        "in front of", "in back of", "behind", "next to", "on top of",
        "within", "beneath", "underneath", "among", "along", "against",
        "aboard", "about", "above", "across", "after", "against", "along",
        "amid", "among", "anti", "around", "as", "at", "before", "behind",
        "below", "beneath", "beside", "besides", "between", "beyond",
        "but", "by", "concerning", "considering", "despite", "down",
        "during", "except", "excepting", "excluding", "following", "for",
        "from", "in", "inside", "into", "in front of", "like", "minus",
        "near", "of", "off", "on", "onto", "opposite", "outside", "over",
        "past", "per", "plus", "regarding", "round", "save", "since",
        "than", "through", "to", "toward", "towards", "under",
        "underneath", "unlike", "until", "up", "upon", "versus", "via",
        "with", "within", "without"));
/** Modal verbs; headVerb() skips a leading token found here. */
public static Collection<String> MODAL_VERBS = new HashSet<>(Arrays.asList(
        "can", "could", "may", "might", "will", "would",
        "must", "shall", "should", "ought to"));
/**
 * Stop words; headVerb() skips a leading token found here.
 *
 * <p>
 * The literal list is the concatenation of two stop-word lists, so many
 * entries repeat (harmless in a set). "ours" and "ourselves" are separate
 * entries: they were previously fused into the single string
 * "ours ourselves", which meant "ourselves" was never matched.
 */
public static Collection<String> STOPWORDS = new HashSet<String>(Arrays.asList(
        "a", "able", "about", "across", "after", "all",
        "almost", "also", "always", "am", "among", "an", "and", "another",
        "any", "are", "as", "at", "be", "because", "been", "before",
        "being", "but", "by", "can", "cannot", "could", "dear", "did",
        "do", "does", "either", "else", "ever", "every", "few", "for",
        "from", "get", "got", "had", "has", "have", "he", "her", "here",
        "hers", "him", "his", "how", "however", "i", "if", "in", "into",
        "is", "it", "its", "just", "least", "let", "like", "likely", "lrb",
        "many", "may", "me", "might", "mine", "more", "most", "much",
        "must", "my", "neither", "no", "none", "nor", "not", "nothing",
        "now", "nt", "of", "off", "often", "on", "only", "or", "other",
        "our", "ours", "own", "per", "rather", "rrb", "said", "say",
        "says", "she", "should", "since", "so", "some", "somehow", "still",
        "such", "than", "that", "the", "their", "theirs", "them", "then",
        "there", "these", "they", "this", "those", "though", "tis", "to",
        "too", "twas", "u", "us", "very", "want", "wants", "was", "we",
        "were", "what", "when", "where", "which", "while", "who", "whom",
        "why", "will", "with", "would", "www", "yet", "you", "your",
        "yours", "yourss", "'m", "'ll", "a", "about", "above", "after",
        "again", "against", "all", "am", "an", "and", "any", "are", "as",
        "at", "be", "because", "been", "before", "being", "below",
        "between", "both", "but", "by", "cannot", "could", "did", "do",
        "does", "doing", "down", "during", "each", "few", "for", "from",
        "further", "had", "has", "have", "having", "he", "her", "here",
        "hers", "herself", "him", "himself", "his", "how", "however", "i",
        "if", "in", "into", "is", "it", "its", "itself", "let", "lrb",
        "me", "more", "most", "must", "my", "myself", "no", "nor", "not",
        "of", "off", "on", "once", "only", "or", "other", "ought", "our",
        "ours", "ourselves", "out", "over", "own", "rrb", "same", "sha",
        "she", "should", "so", "some", "such", "than", "that", "the",
        "their", "theirs", "them", "themselves", "then", "there", "these",
        "they", "this", "those", "through", "to", "too", "under", "until",
        "up", "very", "was", "we", "were", "what", "when", "where",
        "which", "while", "who", "who", "whom", "why", "why", "with", "wo",
        "would", "would", "you", "you", "you", "you", "you", "your",
        "yours", "yourself", "yourselves"));
/**
 * Returns the position of the head verb within a head-verb candidate.
 *
 * <p>
 * Rules, applied in this order:
 * <ol>
 * <li>The first preposition that follows at least one non-empty token and
 * is not the last token: the position after it is returned
 * ("begin to peel" -> position of "peel").</li>
 * <li>More than one token and the first one is in {@code copularVerbs},
 * {@code MODAL_VERBS} or {@code STOPWORDS}: that token is dropped and the
 * rest is analysed recursively ("had turned" -> position of "turned").</li>
 * <li>The last token ends with "ing": the last position
 * ("continue peeling" -> position of "peeling").</li>
 * <li>Otherwise position 0 ("take out" -> position of "take").</li>
 * </ol>
 *
 * @param s
 *            tokens of the head-verb candidate
 * @return index of the head verb in {@code s}
 */
public static int headVerb(String[] s) {
    final int last = s.length - 1;
    // Mirrors "some text has been collected so far": becomes true once a
    // token with non-empty text has been passed.
    boolean seenText = false;
    for (int pos = 0; pos <= last; pos++) {
        String token = s[pos];
        // A trailing preposition (e.g. "heat up") does not move the head.
        if (prepositions.contains(token) && seenText && pos != last) {
            return pos + 1;
        }
        seenText = seenText || !String.valueOf(token).isEmpty();
    }

    // had turned => skip the leading auxiliary / function word.
    if (s.length > 1) {
        String first = s[0];
        boolean leadingFunctionWord = copularVerbs.contains(first)
                || MODAL_VERBS.contains(first)
                || STOPWORDS.contains(first);
        if (leadingFunctionWord) {
            String[] rest = Arrays.copyOfRange(s, 1, s.length);
            return headVerb(rest) + 1;
        }
    }

    // continue peeling => last position; take out => first position.
    return s[last].endsWith("ing") ? last : 0;
}
/**
 * Singleton lookup table from irregular plural nouns to their singular
 * (e.g. women -> woman), read once from
 * {@code ./data/irregular-plurals.txt} when the enum is initialised.
 *
 * <p>
 * Each line of the file is expected to be {@code plural<TAB>singular}.
 * Loading is best-effort: lines without a tab are skipped, and if the file
 * cannot be read the table stays (partially) empty and a warning is
 * written to standard error.
 */
private static enum IrregularPlurals {
    PLURAL();

    /** plural -> singular, filled once by loadPlurals(). */
    private final Map<String, String> pluralToSingularMap;

    private IrregularPlurals() {
        this.pluralToSingularMap = loadPlurals();
    }

    /**
     * Reads the plural/singular pairs from the data file.
     *
     * @return the loaded mapping; possibly empty, never null
     */
    private Map<String, String> loadPlurals() {
        Map<String, String> mapping = new HashMap<>();
        try {
            for (String line : new FileLines("./data/irregular-plurals.txt")) {
                // women<TAB>woman
                String[] columns = line.split("\t");
                if (columns.length < 2) {
                    // Blank or malformed line: skip it instead of aborting
                    // the whole load.
                    continue;
                }
                mapping.put(columns[0], columns[1]);
            }
        } catch (Exception e) {
            // Best-effort: keep what was loaded so far, but do not hide the
            // problem.
            System.err.println("IrregularPlurals: failed to read "
                    + "./data/irregular-plurals.txt (" + e + "); continuing with "
                    + mapping.size() + " entries");
        }
        return mapping;
    }

    /**
     * @param pluralNoun
     *            plural form to look up
     * @return the singular listed for {@code pluralNoun}, or null when the
     *         table has no entry for it
     */
    public String getSingular(String pluralNoun) {
        return PLURAL.pluralToSingularMap.get(pluralNoun);
    }
}
}