Permalink
Cannot retrieve contributors at this time
Name already in use
A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
HowToKBClustering/src/utils/NLPUtils.java
Go to fileThis commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
405 lines (365 sloc)
13.4 KB
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package utils; | |
import java.io.IOException; | |
import java.io.StringReader; | |
import java.sql.SQLException; | |
import java.util.Arrays; | |
import java.util.Collection; | |
import java.util.HashMap; | |
import java.util.HashSet; | |
import java.util.Map; | |
import java.util.Set; | |
import uk.ac.susx.informatics.Morpha; | |
import util.FileLines; | |
import util.Util; | |
import util.WordListEnum; | |
import util.IDHelper.WNWords; | |
import util.Pair; | |
public class NLPUtils { | |
/* | |
* public static String headWord(String s) { String[] words = s.split(" "); | |
* return words[headWord(words)]; } | |
*/ | |
public static String headWord(String s) throws SQLException, IOException { | |
return headWordStr(s.split(" ")); | |
} | |
/** returns position of headword. | |
* <b> In any case returns exactly one position! cannot return a phrase </b> | |
* @throws IOException | |
* @throws SQLException */ | |
public static int headWord(String[] s) { | |
StringBuilder sb = new StringBuilder(); | |
for (int i = 0; i < s.length; i++) { | |
String w = s[i]; | |
// tower of hanoi (here, tower = index 0, i.e. preceding "of") | |
if (WordListEnum.PREPOSITIONS.contains(w) && sb.length() > 0) | |
if (WordListEnum.PLURAL_QUANTIFIER.contains(Util.rangeWords(0, | |
i, s))) | |
// couple of sketch pads => sketch pads (i.e. after preposition!) | |
return i + 1; | |
else | |
return i - 1; | |
sb.append(sb.length() == 0 ? "" : " ").append(w); | |
} | |
// Fallback to last position case when no prepositions are present. | |
return s.length - 1; | |
} | |
public static String headWordStr(String[] s) throws SQLException, | |
IOException { | |
StringBuilder sb = new StringBuilder(); | |
for (int i = 0; i < s.length; i++) { | |
String w = s[i]; | |
// tower of hanoi (here, tower = index 0, i.e. preceding "of") | |
// couple of sketch pads => sketch pads (i.e. after preposition!) | |
if (WordListEnum.PREPOSITIONS.contains(w) && sb.length() > 0) { | |
// PLURAL_QUANTIFIER (e.g. couple of) | |
// couple of sketch pads => check from 0 to i (i.e. couple of) | |
if (WordListEnum.PLURAL_QUANTIFIER.contains(Util.rangeWords(0, | |
i, s))) { | |
// couple of sketch pads => sketch pads is a cand. wnhead | |
sb = new StringBuilder(); | |
sb.append(Util.rangeWords(i + 1, s.length - 1, s)); | |
} | |
// in any case break because we already filled the head cand. | |
break; | |
} | |
sb.append(sb.length() == 0 ? "" : " ").append(w); | |
} | |
// sketch pads => sketch pad. | |
// Verbs are handled separately via headVerb() func, | |
// hence stem using Morpha.any | |
return wnHeadWord(stem(sb.toString(), Morpha.any).split(" ")); | |
} | |
/** | |
* san francisco botanical garden -> botanical garden Loop over phrases in | |
* the sentence to find a valid WN noun phrase as the head words. | |
* | |
* @param s | |
* input phrase | |
* @return head words (valid WN noun phrases) | |
* @throws IOException | |
* @throws SQLException | |
*/ | |
private static String wnHeadWord(String[] s) throws SQLException, | |
IOException { | |
StringBuilder phrase; | |
for (int i = 0; i < s.length; i++) { | |
phrase = new StringBuilder(); | |
for (int j = i; j < s.length; j++) | |
phrase.append(phrase.length() > 0 ? " " : "").append(s[j]); | |
if (WNWords.inWN(phrase.toString()) != null) | |
return phrase.toString(); | |
} | |
// Fallback to last position case when no prepositions are present. | |
return s[s.length - 1]; | |
} | |
/************************************************************************ | |
* Stems only head noun (e.g. will stem "schools boxes" to "schools box") | |
* | |
* @param MorphaDotPOS | |
* morpha code : Morpha.noun , Morpha.verb , or Morpha.any | |
************************************************************************/ | |
public static String judiciousStemming(String phrase, int MorphaDotPOS) { | |
if (MorphaDotPOS == Morpha.noun) { return phrase; } | |
StringBuilder sb = new StringBuilder(); | |
String[] splitted = phrase.split(" "); | |
if (MorphaDotPOS == Morpha.noun) { | |
int headWordIndex = headWord(splitted); | |
// The blue darts to blue darts. | |
for (int i = 0; i < splitted.length; i++) { | |
// if (!Util.isStopWord(splitted[i])) | |
sb.append(i > 0 ? ' ' : ""); | |
if (i == headWordIndex) | |
sb.append(stem(splitted[i], MorphaDotPOS)); | |
else | |
sb.append(splitted[i]); | |
} | |
} else { | |
for (int i = 0; i < splitted.length; i++) { | |
sb.append(i > 0 ? ' ' : "").append( | |
stem(splitted[i], MorphaDotPOS)); | |
} | |
} | |
if (sb.length() == 0) | |
sb.append(sb.length() > 0 ? ' ' : "").append( | |
stem(splitted[splitted.length - 1], MorphaDotPOS)); | |
return sb.toString(); | |
// } | |
} | |
/**** | |
* Check if input phrase is eligible to be stemmed. (In short, we don't stem | |
* instances) | |
* | |
* @param phrase | |
* los_angeles -> false, boxes -> true | |
* @return | |
* @throws SQLException | |
* @throws IOException | |
*/ | |
public static String stem(String w, int MorphaDotPOS) { | |
String stemmed = null; | |
if (MorphaDotPOS == Morpha.noun) { | |
stemmed = IrregularPlurals.PLURAL.getSingular(w); | |
} | |
if (stemmed == null || stemmed.isEmpty()) | |
stemmed = stemExceptWN(w, MorphaDotPOS); | |
return stemmed; | |
} | |
private static String stemExceptWN(String w, int MorphaDotPOS) { | |
if (w.equals("_s")) | |
return w; // exception for _s which originally was 's | |
else { | |
String stemmed = stemMorpha(w, MorphaDotPOS); | |
String pos = ""; | |
switch (MorphaDotPOS) { | |
case Morpha.noun: | |
pos = "n"; | |
break; | |
case Morpha.verb: | |
pos = "v"; | |
break; | |
} | |
Pair<String, Set<String>> inWN = null; | |
try { | |
inWN = WNWords.inWN(stemmed); | |
} catch (Exception e) { | |
e.printStackTrace(); | |
} | |
if (inWN == null) | |
return w; | |
if (inWN.second == null || inWN.second.isEmpty()) | |
return w; | |
if (pos.isEmpty()) | |
return stemmed; | |
if (inWN.second.contains(pos)) | |
return stemmed; | |
else | |
return w; | |
} | |
} | |
private static int countChar(String s, char ec, boolean shouldTrim) { | |
int count = 0; | |
if (shouldTrim) | |
s = s.trim(); | |
for (char c : s.toCharArray()) { | |
if (c == ec) | |
count++; | |
} | |
return count; | |
} | |
/************************************************************************* | |
* That is, it only does noun plurals, pronoun case, and verb endings, and | |
* not things like comparative adjectives or derived nominals. It is based | |
* on a finite-state transducer implemented by John Carroll et al., written | |
* in flex and publicly available. See: | |
* http://www.informatics.susx.ac.uk/research/nlp/carroll/morph.html . | |
* | |
* @param w | |
* e.g. fighter jets | |
* @param morphaPOSNum | |
* (Use Morpha. static members) e.g. 2 (for noun), 3 (for any) | |
* @return fighter jet (note that fighters jets returns fighters jet | |
* @usage Util.stem("goes", Morpha.verb); | |
************************************************************************/ | |
private static String stemMorpha(String w, int morphaPOSNum) { | |
try { | |
if (w == null || w.length() == 0) | |
return w; | |
int numWords = countChar(w, ' ', false) + 1; | |
String[] ws = null; | |
if (numWords > 1) | |
ws = w.split(" "); | |
if (lexer == null) | |
lexer = new Morpha(System.in); | |
lexer.yyreset(new StringReader(numWords == 1 ? w | |
: ws[ws.length - 1])); | |
lexer.yybegin(morphaPOSNum); | |
if (numWords == 1) | |
return lexer.next(); | |
else { | |
StringBuilder sb = new StringBuilder(); | |
for (int i = 0; i < ws.length - 1; i++) | |
sb.append(ws[i]).append(" "); | |
sb.append(lexer.next()); | |
return sb.toString(); | |
} | |
} catch (Exception e) { | |
/* | |
* System.out.println("Exception in stemming (" + w + "): " + | |
* e.getMessage()); | |
*/ | |
// e.printStackTrace(); | |
} catch (Error e) { | |
// Sometimes Morpha throws Error! | |
// Exception in thread "main" java.lang.Error: Error: could not | |
// match input | |
/* | |
* System.out.println("Error in stemming (" + w + "): " + | |
* e.getMessage()); | |
*/ | |
// e.printStackTrace(); | |
} | |
return w; | |
} | |
static Morpha lexer; | |
public static Collection<String> copularVerbs = new HashSet<>(Arrays | |
.asList(new String[] {"be", "has", "have", "had", "is", "was", "are", | |
"were"})); | |
public static Collection<String> articles = new HashSet<String>(Arrays | |
.asList(new String[] {"a", "an", "the", "your", "my", "our", "his", | |
"her"})); | |
public static Collection<String> prepositions = new HashSet<String>(Arrays | |
.asList(new String[] {"in", "on", "at", "with", "into", "across", | |
"opposite", "toward", "towards", "through", "beyond", "aboard", | |
"amid", "past", "by", "near", "nearby", "above", "below", "over", | |
"under", "up", "down", "around", "through", "inside", "out", | |
"outside", "outside of", "between", "beside", "besides", "beyond", | |
"in front of", "in back of", "behind", "next to", "on top of", | |
"within", "beneath", "underneath", "among", "along", "against", | |
"aboard", "about", "above", "across", "after", "against", "along", | |
"amid", "among", "anti", "around", "as", "at", "before", "behind", | |
"below", "beneath", "beside", "besides", "between", "beyond", | |
"but", "by", "concerning", "considering", "despite", "down", | |
"during", "except", "excepting", "excluding", "following", "for", | |
"from", "in", "inside", "into", "in front of", "like", "minus", | |
"near", "of", "off", "on", "onto", "opposite", "outside", "over", | |
"past", "per", "plus", "regarding", "round", "save", "since", | |
"than", "through", "to", "toward", "towards", "under", | |
"underneath", "unlike", "until", "up", "upon", "versus", "via", | |
"with", "within", "without"})); | |
public static Collection<String> MODAL_VERBS = (new HashSet<String>(Arrays | |
.asList(new String[] {"can", "could", "may", "might", "will", "would", | |
"must", "shall", "should", "ought to"}))); | |
public static Collection<String> STOPWORDS = (new HashSet<String>(Arrays | |
.asList(new String[] {"a", "able", "about", "across", "after", "all", | |
"almost", "also", "always", "am", "among", "an", "and", "another", | |
"any", "are", "as", "at", "be", "because", "been", "before", | |
"being", "but", "by", "can", "cannot", "could", "dear", "did", | |
"do", "does", "either", "else", "ever", "every", "few", "for", | |
"from", "get", "got", "had", "has", "have", "he", "her", "here", | |
"hers", "him", "his", "how", "however", "i", "if", "in", "into", | |
"is", "it", "its", "just", "least", "let", "like", "likely", "lrb", | |
"many", "may", "me", "might", "mine", "more", "most", "much", | |
"must", "my", "neither", "no", "none", "nor", "not", "nothing", | |
"now", "nt", "of", "off", "often", "on", "only", "or", "other", | |
"our", "ours", "own", "per", "rather", "rrb", "said", "say", | |
"says", "she", "should", "since", "so", "some", "somehow", "still", | |
"such", "than", "that", "the", "their", "theirs", "them", "then", | |
"there", "these", "they", "this", "those", "though", "tis", "to", | |
"too", "twas", "u", "us", "very", "want", "wants", "was", "we", | |
"were", "what", "when", "where", "which", "while", "who", "whom", | |
"why", "will", "with", "would", "www", "yet", "you", "your", | |
"yours", "yourss", "'m", "'ll", "a", "about", "above", "after", | |
"again", "against", "all", "am", "an", "and", "any", "are", "as", | |
"at", "be", "because", "been", "before", "being", "below", | |
"between", "both", "but", "by", "cannot", "could", "did", "do", | |
"does", "doing", "down", "during", "each", "few", "for", "from", | |
"further", "had", "has", "have", "having", "he", "her", "here", | |
"hers", "herself", "him", "himself", "his", "how", "however", "i", | |
"if", "in", "into", "is", "it", "its", "itself", "let", "lrb", | |
"me", "more", "most", "must", "my", "myself", "no", "nor", "not", | |
"of", "off", "on", "once", "only", "or", "other", "ought", "our", | |
"ours ourselves", "out", "over", "own", "rrb", "same", "sha", | |
"she", "should", "so", "some", "such", "than", "that", "the", | |
"their", "theirs", "them", "themselves", "then", "there", "these", | |
"they", "this", "those", "through", "to", "too", "under", "until", | |
"up", "very", "was", "we", "were", "what", "when", "where", | |
"which", "while", "who", "who", "whom", "why", "why", "with", "wo", | |
"would", "would", "you", "you", "you", "you", "you", "your", | |
"yours", "yourself", "yourselves"}))); | |
/** returns position of headverb from headverbCandidate s. */ | |
public static int headVerb(String[] s) { | |
StringBuilder sb = new StringBuilder(); | |
for (int i = 0; i < s.length; i++) { | |
String w = s[i]; | |
// begin to peel (here, peel = index 1, i.e. succeeding "to") | |
if (prepositions.contains(w) && sb.length() > 0 | |
) { | |
// prep. not last word e.g. heat up | |
if (i != s.length - 1) | |
return i + 1; | |
/* | |
* else if return 0; | |
*/ | |
} | |
sb.append(sb.length() == 0 ? "" : " ").append(w); | |
} | |
// had turned | |
if (s.length > 1 | |
&& (copularVerbs.contains(s[0]) || MODAL_VERBS.contains(s[0]) || STOPWORDS | |
.contains(s[0]))) { | |
return 1 + headVerb(Arrays.copyOfRange(s, 1, s.length)); } | |
// Fallback to last position case when no prepositions are present. | |
// continue peeling | |
if (s[s.length - 1].endsWith("ing")) // present tense. | |
return s.length - 1; | |
else | |
// take out | |
/* | |
* return verbsInCorpus != null && verbsInCorpus.contains(s[0]) ? 0 | |
* : s.length - 1; | |
*/ | |
return 0; | |
} | |
private static enum IrregularPlurals { | |
PLURAL(); | |
private final Map<String, String> pluralToSingularMap; | |
private IrregularPlurals() { | |
this.pluralToSingularMap = loadPlurals(); | |
} | |
private Map<String, String> loadPlurals() { | |
Map<String, String> mapping = new HashMap<>(); | |
// women woman | |
try { | |
String[] splitted; | |
for (String line : new FileLines("./data/irregular-plurals.txt")) { | |
splitted = line.split("\t"); | |
mapping.put(splitted[0], splitted[1]); | |
} | |
} catch (Exception e) {} | |
return mapping; | |
} | |
public String getSingular(String pluralNoun) { | |
return PLURAL.pluralToSingularMap.get(pluralNoun); | |
} | |
} | |
} |