Permalink
Cannot retrieve contributors at this time
Name already in use
A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
HowToKB/src/kb/howtokb/utils/IDHelper.java
Go to fileThis commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
225 lines (192 sloc)
5.39 KB
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package kb.howtokb.utils; | |
import java.io.IOException;
import java.sql.SQLException;
import java.util.HashSet;
import java.util.Locale;
import java.util.Set;

import edu.stanford.nlp.util.Pair;
/** | |
* | |
* @author ntandon | |
* | |
*/ | |
public class IDHelper { | |
public static String cacheFile(String tbName) { | |
try { | |
String dbText = | |
"/var/tmp/cxchu/data-needed/db-text/"; | |
String localFile = dbText + tbName; | |
return localFile ; | |
} catch (Exception e) { | |
return ""; | |
} | |
} | |
public static class WNWords { | |
private static AutoMap<String, Set<String>> wnWordsNo_; | |
/** | |
* If the word is in wordnet returns list of pos it occurs with, else | |
* null | |
* | |
* @param w | |
* e.g. potato | |
* @return a: adjective, v: verb, n: noun, r: adverb | |
* @throws SQLException | |
* during initialization of wordnet (words, pos) map | |
* @throws IOException | |
*/ | |
public static Pair<String, Set<String>> inWN(String w) | |
throws SQLException, IOException { | |
if (wnWordsNo_ == null) { | |
wnWordsNo_ = new AutoMap<>(); | |
loadWnWordsNo_(); | |
} | |
if (w == null) | |
return null; | |
w = w.toLowerCase(); | |
// only check variations of two worded w e.g. horse shoe | |
// tolowercase Yunnan province | |
if (!wnWordsNo_.containsKey(w)) { | |
// horse-shoe | |
if (!w.contains(" ")) | |
return null; | |
// w= horse shoe; wn has horseshoe or wn has horse-shoe | |
if (wnWordsNo_.containsKey(w.replace(" ", ""))) { | |
return new Pair<String, Set<String>>(w.replace(" ", ""), | |
wnWordsNo_.get(w.replace(" ", ""))); | |
} else if (wnWordsNo_.containsKey(w.replace(' ', '-'))) { | |
return new Pair<String, Set<String>>(w.replace(" ", ""), | |
wnWordsNo_.get(w.replace(' ', '-'))); | |
} else | |
// give up | |
return null; | |
} else | |
return new Pair<String, Set<String>>(w, wnWordsNo_.get(w)); | |
} | |
private static void loadWnWordsNo_() throws IOException, SQLException { | |
String tbName = "wordnet.wn_synsets"; | |
String localFile = cacheFile(tbName); | |
if (!localFile.isEmpty()) { | |
String[] fields; | |
for (String line : new FileLines(localFile)) { | |
fields = line.split("\t"); | |
String pos = fields[3].equals("s") ? "a" : fields[3]; | |
wnWordsNo_.addSetValue(fields[2].toLowerCase(), | |
pos); | |
} | |
} | |
} | |
private static AutoMap<String, Set<Character>> wnWordTypes; | |
public static Set<Character> getWNWordTypes(String w) | |
throws IOException, SQLException { | |
if (wnWordTypes == null) { | |
wnWordTypes = new AutoMap<>(); | |
loadWNTypes(); | |
} | |
if (w == null) | |
return null; | |
w = w.toLowerCase(); | |
if (!wnWordTypes.containsKey(w)) | |
return null; | |
return wnWordTypes.get(w); | |
} | |
private static void loadWNTypes() throws SQLException, IOException { | |
String tbName = "ngram.wntypes_tree"; | |
String localFile = cacheFile(tbName); | |
if (!localFile.isEmpty()) { | |
String[] fields; | |
for (String line : new FileLines(localFile)) { | |
fields = line.split("\t"); | |
wnWordTypes.addSetValue(fields[0].replace('_', | |
' '), fields[4].charAt(0)); | |
} | |
} | |
} | |
private static AutoMap<String, Character> wnWpsTypes; | |
/** | |
* Get WordNet noun type {p=physical, a=abstract, i=instance} | |
* | |
* @param wps | |
* @return | |
* @throws IOException | |
* @throws SQLException | |
*/ | |
public static Character getWNWpsTypes(String wps) throws IOException, | |
SQLException { | |
if (wnWpsTypes == null) { | |
wnWpsTypes = new AutoMap<>(); | |
loadWNWpsTypes(); | |
} | |
if (wps == null) | |
return null; | |
wps = wps.toLowerCase(); | |
if (wnWpsTypes.containsKey(wps)) | |
return wnWpsTypes.get(wps); | |
else { | |
wps = wps.replace('_', ' '); | |
if (wnWpsTypes.containsKey(wps)) | |
return wnWpsTypes.get(wps); | |
} | |
return null; | |
} | |
private static void loadWNWpsTypes() throws IOException, SQLException { | |
String tbName = "ngram.wntypes_tree"; | |
String localFile = cacheFile(tbName); | |
if (!localFile.isEmpty()) { | |
for (String line : new FileLines(localFile)) { | |
String[] fields = line.split("\t"); | |
wnWpsTypes.put(fields[2].replace('_', ' '), fields[4] | |
.charAt(0)); | |
} | |
} | |
} | |
private static Set<String> wnFreqPhysical; | |
public static boolean isFreqPhysical(String w) throws IOException { | |
if (wnFreqPhysical == null) { | |
wnFreqPhysical = new HashSet<>(); | |
String localFile = cacheFile("freq-phy-nouns"); | |
if (!localFile.isEmpty()) { | |
for (String line : new FileLines(localFile)) { | |
if (line.startsWith("#")) | |
continue; | |
wnFreqPhysical.add(line); | |
} | |
} | |
} | |
if (w == null) | |
return false; | |
w = w.toLowerCase(); | |
return wnFreqPhysical.contains(w); | |
} | |
/** | |
* Check if w is instance in WN for example Texas, Los Angeles | |
* | |
* @param w | |
* @return boolean if w is an instance in WN | |
*/ | |
public static boolean isAnInstance(String w) { | |
try { | |
Set<Character> types = IDHelper.WNWords.getWNWordTypes(w); | |
if (types == null) | |
return false; | |
else if (types.size() == 1 && types.contains('i')) | |
return true; | |
} catch (IOException | SQLException e) {} | |
return false; | |
} | |
/** | |
* Check if w is instance in WN for example Texas, Los Angeles | |
* | |
* @param w | |
* @return boolean if w is an instance in WN | |
*/ | |
public static boolean isPhyAndNotAnInstance(String w) { | |
try { | |
Set<Character> types = IDHelper.WNWords.getWNWordTypes(w); | |
if (types == null) | |
return false; | |
else if (!types.contains('i') && types.contains('p')) | |
return true; | |
} catch (IOException | SQLException e) {} | |
return false; | |
} | |
} | |
} |