Skip to content
Permalink
3b99e145f5
Switch branches/tags

Name already in use

A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
Go to file
 
 
Cannot retrieve contributors at this time
1923 lines (1764 sloc) 56 KB
package util;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import util.FileLines;
public class IDHelper {
// Lazily filled lookup tables; each stays null until the first accessor that
// needs it runs its loader.
// adjective word id -> wps, e.g. 29873 -> lazy#a#1
private static AutoMap<Integer, String> adjwordidToWps;
// noun word id -> wps, e.g. 29873 -> human#n#1
private static AutoMap<Integer, String> nounwordidToWps;
// synset id -> wps strings of the synset, e.g. 304778653 -> lazy#a#1, slow#a#5 ...
private static AutoMap<Integer, Set<String>> ssidToWps;
// noun -> its wps strings, e.g. car -> car#n#1, car#n#2 ...
private static AutoMap<String, Set<String>> nounwordToWps;
// adjective -> its wps strings, e.g. sweet -> sweet#a#1 ...
private static AutoMap<String, Set<String>> adjwordToWps;
// wps -> ids, e.g. lazy#a#1 -> 304778653, 29873
private static HashMap<wps, IDSubjObj> wpsToID;
// raw noun id -> raw noun, e.g. 901 -> apple
private static HashMap<Integer, String> rawNounIDTorawNoun;
// raw adjective id -> raw adjective, e.g. 901 -> deft
private static HashMap<Integer, String> rawAdjIDTorawAdj;
// Looked up by word id in getsensenum(); the stored value is returned as the
// sense number.
private static HashMap<Integer, Integer> nounidToSensenum;
private static HashMap<Integer, Integer> adjidToSensenum;
// Keyed by synset id; both are filled together by getwpsMostFrequentWord().
private static Map<Integer, wps> synsetToBestwps;
private static Map<Integer, Integer> synsetTagCount;
/**
 * Dumps the noun and adjective id backup tables to tab-separated text files,
 * appending the gloss of each row's synset as the last column.
 *
 * Output columns: wordid, synsetid, word, pos, sensenum, wordpossense, gloss.
 * Nouns go to ./data/wnnoun.id2s and adjectives to ./data/wnadj.id2s.
 *
 * @param args unused
 * @throws Exception on any database or file error
 */
public static void mainToDumpIDs(String[] args) throws Exception {
    HashMap<Integer, String> glossBySynset = new HashMap<Integer, String>(130000);
    ResultSet glossRows = DBConnector.q("select * from wordnet.wn_glosses");
    while (glossRows.next()) {
        // Tabs inside a gloss would break the tab-separated output.
        glossBySynset.put(glossRows.getInt(1),
                glossRows.getString(2).replaceAll("\t", " "));
    }
    System.out.println("Fetching glosses done. ");

    /*
     * Table layout: wordid | integer | synsetid | integer | word | character
     * varying | pos | character varying | sensenum | integer | wordpossense |
     * character varying |
     */
    dumpIdTableWithGlosses("select * from nounid_bk", 150000,
            "./data/wnnoun.id2s", "Writing nouns to ", glossBySynset);
    dumpIdTableWithGlosses("select * from adjid_bk", 50000,
            "./data/wnadj.id2s", "Writing adjectives to ", glossBySynset);
    // DBConnector.getDB().close();
}

/** Runs one id-table query and writes its rows, each extended with the gloss, to outFile. */
private static void dumpIdTableWithGlosses(String query, int capacity,
        String outFile, String announce, Map<Integer, String> glossBySynset)
        throws Exception {
    Set<String> lines = new HashSet<String>(capacity);
    ResultSet rows = DBConnector.q(query);
    while (rows.next()) {
        int wordID = rows.getInt(1);
        int synsetID = rows.getInt(2);
        String word = rows.getString(3);
        String pos = rows.getString(4);
        int senseNum = rows.getInt(5);
        String wordpossense = rows.getString(6);
        String gloss = glossBySynset.get(synsetID);
        lines.add(wordID + "\t" + synsetID + "\t" + word + "\t" + pos + "\t"
                + senseNum + "\t" + wordpossense + "\t" + gloss);
    }
    System.out.println(announce + outFile);
    Util.writeFile(outFile, lines, false);
}
/**
 * Returns the tag count recorded for a synset, or 0 when none is recorded.
 *
 * The table is filled as a side effect of getwpsMostFrequentWord(), which is
 * triggered here on first use.
 *
 * @param synsetid synset id to look up
 * @throws SQLException if the initial load fails
 * @throws IOException if the initial load fails
 */
public static int getTagcount(int synsetid) throws SQLException, IOException {
    if (synsetTagCount == null) {
        getwpsMostFrequentWord(synsetid);
    }
    if (!synsetTagCount.containsKey(synsetid)) {
        return 0;
    }
    return synsetTagCount.get(synsetid);
}
/**
 * Returns the wps stored for a synset, or null when none is stored.
 *
 * On first use every row of wordnet.wn_syn is read and both synsetToBestwps
 * and synsetTagCount are filled.
 *
 * NOTE(review): rows are applied in query order with no comparison of
 * tag_count, so the last row seen for a synset wins in both maps. Confirm
 * this matches the "most frequent" intent of the name.
 *
 * @param synsetid synset id to look up
 * @throws SQLException if the query fails
 * @throws IOException if resolving a wps needs a cache file that cannot be read
 */
public static wps getwpsMostFrequentWord(int synsetid) throws SQLException,
        IOException {
    if (synsetToBestwps == null) {
        ResultSet rows =
                DBConnector.q("select word,synset_id,tag_count from wordnet.wn_syn");
        synsetToBestwps = new HashMap<>();
        synsetTagCount = new HashMap<>();
        while (rows.next()) {
            // Words are matched in their underscored form.
            wps resolved =
                    getwps(rows.getInt(2), rows.getString(1).replace(' ', '_'));
            if (resolved != null) {
                synsetToBestwps.put(rows.getInt(2), resolved);
            }
            synsetTagCount.put(rows.getInt(2), rows.getInt(3));
        }
    }
    return synsetToBestwps.get(synsetid);
}
/**
 * Maps a raw word id to its word, e.g. 901 -> apple.
 *
 * The noun and adjective tables are loaded separately, each on the first
 * request for its kind.
 *
 * @param rawID raw word id
 * @param isNoun true to look in rawnounid, false to look in rawadjid
 * @return the word, or null when the id is unknown
 * @throws SQLException if the table cannot be read
 */
public static String getRawWord(int rawID, boolean isNoun)
        throws SQLException {
    if (isNoun) {
        if (rawNounIDTorawNoun == null) {
            rawNounIDTorawNoun = new HashMap<Integer, String>();
            ResultSet rows = DBConnector.q("select wordid, word from rawnounid");
            while (rows.next()) {
                rawNounIDTorawNoun.put(rows.getInt(1), rows.getString(2));
            }
        }
        return rawNounIDTorawNoun.get(rawID);
    }
    if (rawAdjIDTorawAdj == null) {
        rawAdjIDTorawAdj = new HashMap<Integer, String>();
        ResultSet rows = DBConnector.q("select wordid, word from rawadjid");
        while (rows.next()) {
            rawAdjIDTorawAdj.put(rows.getInt(1), rows.getString(2));
        }
    }
    return rawAdjIDTorawAdj.get(rawID);
}
/**
 * Returns all senses of a synset, e.g. 304778653 -> lazy#a#1, slow#a#5 ...
 *
 * The synset table is loaded on first use.
 *
 * @param synsetid synset id to look up
 * @return the parsed senses; empty when the synset is unknown
 * @throws SQLException if the table load fails
 * @throws IOException if the table load fails
 */
public static Set<wps> getwps(int synsetid) throws SQLException,
        IOException {
    Set<wps> senses = new HashSet<IDHelper.wps>();
    if (ssidToWps == null) {
        ssidToWps = new AutoMap<>();
        loadAdjNounVerbIDs();
    }
    for (String encoded : Util.nullableIter(ssidToWps.get(synsetid))) {
        senses.add(new wps(encoded));
    }
    return senses;
}
/**
 * Fills ssidToWps for all four parts of speech: adjectives, nouns, verbs and
 * adverbs, in that order.
 *
 * @throws IOException if a cache file cannot be read
 * @throws SQLException if a database query fails
 */
private static void loadAdjNounVerbIDs() throws IOException, SQLException {
    loadAdjID();
    loadNounID();
    loadVerbID();
    loadAdverbID();
}
/**
 * Adds the adjective senses of every synset to ssidToWps, reading the
 * "adjid" cache file when cacheFile() names one and the adjid table
 * otherwise.
 *
 * Each cache line is split on tabs; column 1 is used as the synset id and
 * column 5 as the wps string.
 *
 * @throws IOException if the cache file cannot be read
 * @throws SQLException if the database query fails
 */
private static void loadAdjID() throws IOException, SQLException {
    String cached = cacheFile("adjid");
    if (cached.isEmpty()) {
        ResultSet rows = DBConnector.q("select wordpossense, synsetid from adjid");
        while (rows.next()) {
            ssidToWps.addSetValue(rows.getInt(2), rows.getString(1));
        }
        return;
    }
    for (String line : new FileLines(cached)) {
        String[] cols = line.split("\t");
        ssidToWps.addSetValue(Integer.parseInt(cols[1]), cols[5]);
    }
}
/**
 * Adds the noun senses of every synset to ssidToWps, reading the "nounid"
 * cache file when cacheFile() names one and the nounid table otherwise.
 *
 * Each cache line is split on tabs; column 1 is used as the synset id and
 * column 5 as the wps string.
 *
 * @throws IOException if the cache file cannot be read
 * @throws SQLException if the database query fails
 */
private static void loadNounID() throws IOException, SQLException {
    String cached = cacheFile("nounid");
    if (cached.isEmpty()) {
        ResultSet rows = DBConnector.q("select wordpossense, synsetid from nounid");
        while (rows.next()) {
            ssidToWps.addSetValue(rows.getInt(2), rows.getString(1));
        }
        return;
    }
    for (String line : new FileLines(cached)) {
        String[] cols = line.split("\t");
        ssidToWps.addSetValue(Integer.parseInt(cols[1]), cols[5]);
    }
}
/**
 * Adds the verb senses of every synset to ssidToWps, using the "verbid"
 * cache name and the part-of-speech tag 'v'.
 *
 * @throws IOException if the cache file cannot be read
 * @throws SQLException if the database query fails
 */
private static void loadVerbID() throws IOException, SQLException {
    loadPOSID("verbid", 'v');
}
/**
 * Adds the adverb senses of every synset to ssidToWps, using the "adverbid"
 * cache name and the part-of-speech tag 'r'.
 *
 * NOTE(review): loadAdvWpsToID() reads its adverb cache under the name
 * "advid"; confirm which of the two names the cache directory really uses.
 *
 * @throws IOException if the cache file cannot be read
 * @throws SQLException if the database query fails
 */
private static void loadAdverbID() throws IOException, SQLException {
    loadPOSID("adverbid", 'r');
}
/**
 * Adds the senses of one part of speech to ssidToWps.
 *
 * From a cache file, each tab-separated line supplies the synset id
 * (column 0), the word (column 2) and the sense number (column 3). Without
 * a cache file the same values come from wordnet.wn_synsets joined with
 * wordnet.wn_glosses, filtered on ss_type.
 *
 * @param tbName cache file name passed to cacheFile()
 * @param pos part-of-speech tag, used both for the wps and the ss_type filter
 * @throws NumberFormatException if a cached id or sense number is not an integer
 * @throws FileNotFoundException if the cache file cannot be opened
 * @throws SQLException if the database query fails
 */
private static void loadPOSID(String tbName, char pos)
        throws NumberFormatException, FileNotFoundException, SQLException {
    String cached = cacheFile(tbName);
    if (cached.isEmpty()) {
        String query =
                "select t1.synset_id, gloss,lower(word),sense_number "
                        + "from wordnet.wn_synsets t1, wordnet.wn_glosses t2 "
                        + "where ss_type='" + pos
                        + "' and t1.synset_id=t2.synset_id";
        ResultSet rows = DBConnector.q(query);
        while (rows.next()) {
            int synsetId = rows.getInt(1);
            wps sense = new wps(rows.getString(3), pos, rows.getInt(4));
            ssidToWps.addSetValue(synsetId, sense.toString());
        }
        return;
    }
    for (String line : new FileLines(cached)) {
        String[] cols = line.split("\t");
        int synsetId = Integer.parseInt(cols[0]);
        wps sense = new wps(cols[2], pos, Integer.parseInt(cols[3]));
        ssidToWps.addSetValue(synsetId, sense.toString());
    }
}
/**
 * Finds the sense of a word inside a synset, e.g. 304778653, slow -> slow#a#5.
 *
 * Comparison ignores case and treats '_' as a space on both sides; a
 * backtick in rawword is also accepted in place of an apostrophe.
 *
 * @param synsetid synset to search
 * @param rawword word to match
 * @return the first matching sense, or null when the synset has none
 * @throws SQLException if the synset table load fails
 * @throws IOException if the synset table load fails
 */
public static wps getwps(int synsetid, String rawword) throws SQLException,
        IOException {
    for (wps candidate : getwps(synsetid)) {
        String spaced = candidate.w.replace('_', ' ');
        boolean matches =
                spaced.equalsIgnoreCase(rawword.replace("`", "'"))
                        || spaced.equalsIgnoreCase(rawword.replace('_', ' '))
                        || spaced.equalsIgnoreCase(
                                rawword.replace('_', ' ').replace("`", "'"));
        if (matches) {
            return candidate;
        }
    }
    return null;
}
/**
 * Returns all senses of a word, e.g. car -> car#n#1, car#n#2 ... or
 * sweet -> sweet#a#1 ...
 *
 * A word can be both a noun and an adjective (e.g. red), so the caller picks
 * the table with isNoun. Nouns are looked up with spaces replaced by
 * underscores; adjectives are looked up as given.
 *
 * @param word word to look up
 * @param isNoun true for the noun table, false for the adjective table
 * @return the parsed senses; empty when the word is absent
 * @throws SQLException if the table load fails
 * @throws IOException if the table load fails
 */
public static Set<wps> getwps(String word, boolean isNoun)
        throws SQLException, IOException {
    Set<wps> senses = new HashSet<IDHelper.wps>();
    if (nounwordToWps == null || adjwordToWps == null) {
        Timer timer = new Timer();
        System.out.print("\nInitializing word To Wps ...");
        loadAdjWordToWps();
        loadNounWordToWps();
        timer.time();
    }
    AutoMap<String, Set<String>> table = isNoun ? nounwordToWps : adjwordToWps;
    String key = isNoun ? word.replaceAll(" ", "_") : word;
    if (!table.containsKey(key)) {
        // Word absent.
        return senses;
    }
    for (String encoded : table.get(key)) {
        senses.add(new wps(encoded));
    }
    return senses;
}
/**
 * Rebuilds nounwordToWps (noun -> wps strings) from the "nounid" cache file
 * when cacheFile() names one, otherwise from the nounid table.
 *
 * Each cache line is split on tabs; column 2 is the word and column 5 the
 * wps string.
 *
 * @throws IOException if the cache file cannot be read
 * @throws SQLException if the database query fails
 */
private static void loadNounWordToWps() throws IOException, SQLException {
    nounwordToWps = new AutoMap<String, Set<String>>();
    String cached = cacheFile("nounid");
    if (cached.isEmpty()) {
        ResultSet rows = DBConnector.q("select word, wordpossense from nounid");
        while (rows.next()) {
            nounwordToWps.addSetValue(rows.getString(1), rows.getString(2));
        }
        return;
    }
    for (String line : new FileLines(cached)) {
        String[] cols = line.split("\t");
        nounwordToWps.addSetValue(cols[2], cols[5]);
    }
}
/**
 * Resolves the local text dump of a table.
 *
 * Every loader in this class treats an empty result as "no cache, query the
 * database instead". The path is therefore returned only when it names a
 * readable regular file; previously the path was returned unconditionally,
 * which made the database fallback unreachable.
 *
 * @param tbName table name, used as the file name inside the dump directory
 * @return the absolute path of the dump, or "" when tbName is null or empty
 *         or no readable file exists for it
 */
public static String cacheFile(String tbName) {
    if (tbName == null || tbName.isEmpty()) {
        return "";
    }
    String dbText = "/var/tmp/cxchu/data-needed/db-text/";
    String localFile = dbText + tbName;
    // Fully qualified so the file-level import block needs no change.
    java.io.File dump = new java.io.File(localFile);
    return dump.isFile() && dump.canRead() ? localFile : "";
}
/**
 * Rebuilds adjwordToWps (adjective -> wps strings) from the "adjid" cache
 * file when cacheFile() names one, otherwise from the adjid table.
 *
 * Each cache line is split on tabs; column 2 is the word and column 5 the
 * wps string.
 *
 * @throws IOException if the cache file cannot be read
 * @throws SQLException if the database query fails
 */
private static void loadAdjWordToWps() throws IOException, SQLException {
    adjwordToWps = new AutoMap<String, Set<String>>();
    String cached = cacheFile("adjid");
    if (cached.isEmpty()) {
        ResultSet rows = DBConnector.q("select word,wordpossense from adjid");
        while (rows.next()) {
            adjwordToWps.addSetValue(rows.getString(1), rows.getString(2));
        }
        return;
    }
    for (String line : new FileLines(cached)) {
        String[] cols = line.split("\t");
        adjwordToWps.addSetValue(cols[2], cols[5]);
    }
}
/**
 * Maps a word id to its sense, e.g. 29873 -> lazy#a#1.
 *
 * @param wordid word id to look up
 * @param iswordIDDummy unused; only distinguishes this overload
 * @param isNoun true for the noun table, false for the adjective table
 * @return the parsed sense, or null when the id is unknown or the stored
 *         value contains no '#'
 * @throws SQLException if the table load fails
 * @throws IOException if the table load fails
 */
public static wps getwps(int wordid, boolean iswordIDDummy, boolean isNoun)
        throws SQLException, IOException {
    if (nounwordidToWps == null || adjwordidToWps == null) {
        System.out.println("\nInitializing wordid To Wps ...");
        adjwordidToWps = new AutoMap<Integer, String>();
        loadAdjWordIDToWps();
        nounwordidToWps = new AutoMap<Integer, String>();
        loadNounWordIDToWps();
    }
    AutoMap<Integer, String> table = isNoun ? nounwordidToWps : adjwordidToWps;
    String encoded = table.get(wordid);
    if (encoded != null && encoded.contains("#")) {
        return new wps(encoded);
    }
    return null;
}
/**
 * Fills adjwordidToWps (adjective word id -> wps string) from the "adjid"
 * cache file when cacheFile() names one, otherwise from the adjid table.
 *
 * Each cache line is split on tabs; column 0 is the word id and column 5 the
 * wps string.
 *
 * @throws IOException if the cache file cannot be read
 * @throws SQLException if the database query fails
 */
private static void loadAdjWordIDToWps() throws IOException, SQLException {
    String cached = cacheFile("adjid");
    if (cached.isEmpty()) {
        ResultSet rows = DBConnector.q("select wordpossense, wordid from adjid");
        while (rows.next()) {
            adjwordidToWps.put(rows.getInt(2), rows.getString(1));
        }
        return;
    }
    for (String line : new FileLines(cached)) {
        String[] cols = line.split("\t");
        adjwordidToWps.put(Integer.parseInt(cols[0]), cols[5]);
    }
}
/**
 * Fills nounwordidToWps (noun word id -> wps string) from the "nounid" cache
 * file when cacheFile() names one, otherwise from the nounid table.
 *
 * Each cache line is split on tabs; column 0 is the word id and column 5 the
 * wps string.
 *
 * @throws IOException if the cache file cannot be read
 * @throws SQLException if the database query fails
 */
private static void loadNounWordIDToWps() throws IOException, SQLException {
    String cached = cacheFile("nounid");
    if (cached.isEmpty()) {
        ResultSet rows = DBConnector.q("select wordpossense, wordid from nounid");
        while (rows.next()) {
            nounwordidToWps.put(rows.getInt(2), rows.getString(1));
        }
        return;
    }
    for (String line : new FileLines(cached)) {
        String[] cols = line.split("\t");
        nounwordidToWps.put(Integer.parseInt(cols[0]), cols[5]);
    }
}
/**
 * Returns the sense number stored for a word id.
 *
 * Both the noun and the adjective table are loaded on first use, even when
 * the requested id is negative.
 *
 * @param wordid word id to look up
 * @param isNoun true for the noun table, false for the adjective table
 * @return the sense number, or 0 when wordid is negative or unknown
 * @throws SQLException if the table load fails
 * @throws IOException if the table load fails
 */
public static int getsensenum(int wordid, boolean isNoun)
        throws SQLException, IOException {
    if (nounidToSensenum == null || adjidToSensenum == null) {
        System.out.println("\nInitializing wordid To sensenum ...");
        adjidToSensenum = new AutoMap<Integer, Integer>();
        nounidToSensenum = new AutoMap<Integer, Integer>();
        loadAdjIDToSensenum();
        loadNounIDToSensenum();
    }
    if (wordid < 0) {
        return 0;
    }
    HashMap<Integer, Integer> table = isNoun ? nounidToSensenum : adjidToSensenum;
    if (table.containsKey(wordid)) {
        return table.get(wordid);
    }
    return 0;
}
/**
 * Fills adjidToSensenum (adjective word id -> sense number) from the "adjid"
 * cache file when cacheFile() names one, otherwise from the adjid table.
 *
 * Cache lines are tab-separated in the layout written by mainToDumpIDs():
 * wordid, synsetid, word, pos, sensenum, wordpossense, gloss. The file branch
 * used to store sensenum -> wordid, the reverse of the database branch and of
 * what getsensenum() looks up; it now stores wordid -> sensenum.
 *
 * @throws IOException if the cache file cannot be read
 * @throws SQLException if the database query fails
 */
private static void loadAdjIDToSensenum() throws IOException, SQLException {
    String tbName = "adjid";
    String localFile = cacheFile(tbName);
    if (!localFile.isEmpty()) {
        for (String line : new FileLines(localFile)) {
            String[] fields = line.split("\t");
            int wordId = Integer.parseInt(fields[0]);
            int senseNum = Integer.parseInt(fields[4]);
            adjidToSensenum.put(wordId, senseNum);
        }
    } else {
        String adjIDQ = "select sensenum,wordid from adjid";
        ResultSet adjRS = DBConnector.q(adjIDQ);
        while (adjRS.next()) {
            adjidToSensenum.put(adjRS.getInt(2), adjRS.getInt(1));
        }
    }
}
/**
 * Fills nounidToSensenum (noun word id -> sense number) from the "nounid"
 * cache file when cacheFile() names one, otherwise from the nounid table.
 *
 * Cache lines are tab-separated in the layout written by mainToDumpIDs():
 * wordid, synsetid, word, pos, sensenum, wordpossense, gloss. The file branch
 * used to store sensenum -> wordid, the reverse of the database branch and of
 * what getsensenum() looks up; it now stores wordid -> sensenum.
 *
 * @throws IOException if the cache file cannot be read
 * @throws SQLException if the database query fails
 */
private static void loadNounIDToSensenum() throws IOException, SQLException {
    String tbName = "nounid";
    String localFile = cacheFile(tbName);
    if (!localFile.isEmpty()) {
        for (String line : new FileLines(localFile)) {
            String[] fields = line.split("\t");
            int wordId = Integer.parseInt(fields[0]);
            int senseNum = Integer.parseInt(fields[4]);
            nounidToSensenum.put(wordId, senseNum);
        }
    } else {
        String nnIDQ = "select sensenum, wordid from nounid";
        ResultSet nnRS = DBConnector.q(nnIDQ);
        while (nnRS.next()) {
            nounidToSensenum.put(nnRS.getInt(2), nnRS.getInt(1));
        }
    }
}
// word -> number of senses, one table per part of speech; loaded on first use.
private static Map<String, Integer> numnounsenses;
private static Map<String, Integer> numadjsenses;

/**
 * Returns how many noun or adjective senses a word has.
 *
 * Both tables are loaded on the first call regardless of isNoun.
 *
 * @param wordNo_ word used as the lookup key
 * @param isNoun true for the noun count, false for the adjective count
 * @return the stored count, or 0 when the word is absent
 * @throws SQLException if a table load fails
 * @throws IOException if a table load fails
 */
public static int countSenses(String wordNo_, boolean isNoun)
        throws SQLException, IOException {
    if (numnounsenses == null) {
        numnounsenses = new HashMap<>();
        loadNumNounSenses();
    }
    if (numadjsenses == null) {
        numadjsenses = new HashMap<>();
        loadNumAdjSenses();
    }
    Map<String, Integer> counts = isNoun ? numnounsenses : numadjsenses;
    if (!counts.containsKey(wordNo_)) {
        return 0;
    }
    return counts.get(wordNo_);
}
/**
 * Fills numnounsenses (word -> sense count) from the "__numsensenoun" cache
 * file when cacheFile() names one, otherwise from the table of that name.
 *
 * Cache lines are tab-separated "word, count", e.g. "shrewishness 1".
 *
 * @throws SQLException if the database query fails
 * @throws IOException if the cache file cannot be read
 */
private static void loadNumNounSenses() throws SQLException, IOException {
    String cached = cacheFile("__numsensenoun");
    if (cached.isEmpty()) {
        ResultSet rows = DBConnector.q("select word,n from __numsensenoun");
        while (rows.next()) {
            numnounsenses.put(rows.getString(1), rows.getInt(2));
        }
        return;
    }
    for (String line : new FileLines(cached)) {
        String[] cols = line.split("\t");
        numnounsenses.put(cols[0], Integer.parseInt(cols[1]));
    }
}
/**
 * Fills numadjsenses (word -> sense count) from the "__numsenseadj" cache
 * file when cacheFile() names one, otherwise from the table of that name.
 *
 * Cache lines are tab-separated "word, count".
 *
 * @throws SQLException if the database query fails
 * @throws IOException if the cache file cannot be read
 */
private static void loadNumAdjSenses() throws SQLException, IOException {
    String cached = cacheFile("__numsenseadj");
    if (cached.isEmpty()) {
        ResultSet rows = DBConnector.q("select word,n from __numsenseadj");
        while (rows.next()) {
            numadjsenses.put(rows.getString(1), rows.getInt(2));
        }
        return;
    }
    for (String line : new FileLines(cached)) {
        String[] cols = line.split("\t");
        numadjsenses.put(cols[0], Integer.parseInt(cols[1]));
    }
}
// "word.tag" -> number of senses, e.g. "cutting board.n"; loaded on first use.
private static Map<String, Integer> numwordsenses;

/**
 * Returns how many senses a word has for a given part of speech.
 *
 * @param wordNo_ word with spaces rather than underscores
 * @param pos part of speech; its tag forms the second half of the key
 * @return the stored count, or 0 when the word/tag pair is absent
 * @throws SQLException if the table load fails
 * @throws IOException declared for callers; not raised by the visible code
 */
public static int countSenses(String wordNo_, POS pos) throws SQLException,
        IOException {
    if (numwordsenses == null) {
        numwordsenses = new HashMap<>();
        loadNumWordSenses();
    }
    String key = wordNo_ + "." + pos.getTag(); // cutting board.n
    if (!numwordsenses.containsKey(key)) {
        return 0;
    }
    return numwordsenses.get(key);
}
/**
 * Fills numwordsenses with one entry per (word, part of speech) pair counted
 * from wordnet.wn_synsets. The query folds ss_type 's' into 'a', and keys
 * are stored as "word.tag" with underscores in the word turned into spaces.
 *
 * @throws SQLException if the query fails
 */
private static void loadNumWordSenses() throws SQLException {
    String query =
            "select word,ss_type,count(*) from (select word,case ss_type when 's' then 'a' else ss_type end,sense_number from wordnet.wn_synsets)a group by word, ss_type;";
    ResultSet rows = DBConnector.q(query);
    while (rows.next()) {
        String key = rows.getString(1).replace('_', ' ') + "." + rows.getString(2);
        numwordsenses.put(key, rows.getInt(3));
    }
}
// word -> (part-of-speech tag -> count), read from the "vnCount" cache file.
private static AutoMap<String, AutoMap<String, Double>> nvCount;

/**
 * Decides whether a word is more often a verb or a noun, e.g. for "run":
 * whichever of 'v' and 'n' has the higher count wins.
 *
 * Counts come from the "vnCount" cache file, one tab-separated record per
 * line (word, tag, count), e.g. "cripple n 1". The table is reloaded on each
 * call while it is still empty. A tag with no recorded count is treated as
 * 0; previously a word with a count for only one of the two tags could fail
 * with a NullPointerException while unboxing the missing one.
 *
 * @param w word to look up
 * @return 'v' or 'n' for the more frequent tag, or ' ' when the word is
 *         unknown or both counts are equal
 * @throws NumberFormatException if a count in the cache file is not numeric
 * @throws IOException if the cache file cannot be read
 */
public static char mfsPOSFromWN(String w) throws NumberFormatException,
        IOException {
    if (nvCount == null || nvCount.isEmpty()) {
        nvCount = new AutoMap<>();
        String localFile = cacheFile("vnCount");
        if (!localFile.isEmpty()) {
            for (String line : new FileLines(localFile)) {
                String[] fields = line.split("\t");
                AutoMap.addKeyKeyNumericValue(fields[0], nvCount,
                        fields[1], Double.parseDouble(fields[2]));
            }
        }
    }
    if (!nvCount.containsKey(w)) {
        return ' ';
    }
    AutoMap<String, Double> nAndV = nvCount.get(w);
    Double nouns = nAndV.get("n");
    Double verbs = nAndV.get("v");
    double nCount = nouns == null ? 0.0 : nouns;
    double vCount = verbs == null ? 0.0 : verbs;
    if (vCount > nCount) {
        return 'v';
    }
    if (nCount > vCount) {
        return 'n';
    }
    return ' ';
}
/**
 * Returns the sense number that the wps sense-key constructor resolves for
 * a WordNet sense key (no "::" suffix is appended).
 *
 * @param key sense key, e.g. geum_triflorum%1:20:00::
 * @return the resolved sense number; negative when the key is unknown
 * @throws SQLException if the sense-key table load fails
 * @throws IOException if the sense-key table load fails
 */
public static int getSenseFromKey(String key) throws SQLException,
        IOException {
    wps parsed = new wps(key, false);
    return parsed.s;
}
/**
* car -> car#n#1, car#n#2 ... , sweet -> sweet#a#1 ... Since a word can be
* both adj and noun (e.g. red) thus the parameter isNoun is used
*/
/*
* public static List<wps> getwps(String word,boolean isNoun) throws
* SQLException{ List<wps> wpsList = new ArrayList<IDHelper.wps>();
* if(nounwordToWps == null || adjwordToWps == null){
* System.out.println("Initializing word To Wps ..."); adjwordToWps = new
* AutoMap<String, List<String>>(); String adjIDQ =
* "select word,wordpossense from adjid"; ResultSet adjRS =
* DBConnector.q(adjIDQ); while (adjRS.next()){
* adjwordToWps.addSetValue(adjRS.getString(1), adjRS.getString(2)); }
* String nnIDQ = "select word, wordpossense from nounid"; nounwordToWps =
* new AutoMap<String, List<String>>(); ResultSet nnRS =
* DBConnector.q(nnIDQ); while (nnRS.next()){
* nounwordToWps.addSetValue(nnRS.getString(1), nnRS.getString(2)); } }
* if(isNoun) word = word.replaceAll(" ", "_"); if(isNoun &&
* !nounwordToWps.containsKey(word)){ return wpsList; } // if word absent.
* if(!isNoun && !adjwordToWps.containsKey(word)){ return wpsList; } // if
* word absent. // ex. of raw: "sports car#n#1" and not "sports_car#n#1": //
* Note nounid contains no spaced words, so replace space with _
* List<String> rawwps = isNoun ? nounwordToWps.get(word) :
* adjwordToWps.get(word);
*
* for(String raw: Util.nullableIter(rawwps)) wpsList.add(new wps(raw));
* return wpsList; }
*//**
* 29873 -> lazy#a#1 ...
*
* @throws IOException
*/
/*
* public static wps getwps(int wordid,boolean iswordIDDummy,boolean isNoun)
* throws SQLException{ if(nounwordidToWps == null || adjwordidToWps ==
* null){ System.out.println("Initializing wordid To Wps ...");
* adjwordidToWps = new AutoMap<Integer, String>(); String adjIDQ =
* "select wordpossense, wordid from adjid"; ResultSet adjRS =
* DBConnector.q(adjIDQ); while (adjRS.next()){
* adjwordidToWps.put(adjRS.getInt(2), adjRS.getString(1)); }
* nounwordidToWps = new AutoMap<Integer, String>(); String nnIDQ =
* "select wordpossense, wordid from nounid"; ResultSet nnRS =
* DBConnector.q(nnIDQ); while (nnRS.next()){
* nounwordidToWps.put(nnRS.getInt(2), nnRS.getString(1)); } }
* if(nounwordidToWps == null || adjwordidToWps == null) return new wps("",
* 'a', 0); String rawwps = isNoun ? nounwordidToWps.get(wordid) :
* adjwordidToWps.get(wordid); if(rawwps == null || rawwps.length() == 0)
* return new wps("", 'a', 0); return new wps(rawwps); }
*/
/**
 * Manual smoke test: parses one sense key and prints the resulting wps.
 * Other sample keys: "kill%2:35:00", "tumble-dryer%1:06:00::".
 *
 * @param args unused
 * @throws IOException if the sense-key cache file cannot be read
 */
public static void main(String[] args) throws IOException {
    try {
        System.out.println(new wps("tumble-dryer%1:06:00::", false));
        /*
         * System.out.println("adj <red,3>: " + getwps("red", false));
         * System.out.println("noun <red,4>: " + getwps("red", true));
         * System.out.println("adj <savory#a#3>: " + getwps(21911, true,
         * false)); System.out.println("noun <sylvilagus_aquaticus#n#1>: " +
         * getwps(21911, true, true));
         */
    } catch (SQLException dbFailure) {
        dbFailure.printStackTrace();
    }
}
/**
 * Maps a sense to its ids, e.g. lazy#a#1 -> 304778653, 29873.
 *
 * The table is built on first use from the adjective, noun, verb and adverb
 * sources, in that order.
 *
 * @param wps sense to look up
 * @return the stored ids; an IDSubjObj(-1, -1) when the sense is unknown or
 *         the lookup itself throws
 * @throws SQLException if the table load fails
 * @throws IOException if the table load fails
 */
public static IDSubjObj idMeta(wps wps) throws SQLException, IOException {
    if (wpsToID == null) {
        Timer timer = new Timer();
        System.out.print("\nInitializing wps To ID ...");
        wpsToID = new HashMap<>();
        loadAdjWpsToID();
        loadNounWpsToID();
        loadVerbWpsToID();
        loadAdvWpsToID();
        timer.time();
    }
    try {
        return wpsToID.containsKey(wps) ? wpsToID.get(wps)
                : new IDSubjObj(-1, -1);
    } catch (Exception ignored) {
        // Lookup failed: answer exactly as for an unknown sense.
        return new IDSubjObj(-1, -1);
    }
}
/**
 * Adds noun senses to wpsToID from the "nounid" cache file when cacheFile()
 * names one, otherwise from the nounid table.
 *
 * Cache columns used: 5 = wps string, 1 = synset id, 0 = word id, 6 = gloss.
 *
 * @throws SQLException if the database query fails
 * @throws IOException if the cache file cannot be read
 */
private static void loadNounWpsToID() throws SQLException, IOException {
    String cached = cacheFile("nounid");
    if (cached.isEmpty()) {
        ResultSet rows =
                DBConnector.q("select wordpossense,synsetid, wordid, gloss from nounid");
        while (rows.next()) {
            wps sense = new wps(rows.getString(1));
            wpsToID.put(sense, new IDSubjObj(rows.getInt(2), rows.getInt(3),
                    rows.getString(4)));
        }
        return;
    }
    for (String line : new FileLines(cached)) {
        String[] cols = line.split("\t");
        wps sense = new wps(cols[5]);
        wpsToID.put(sense, new IDSubjObj(Integer.parseInt(cols[1]),
                Integer.parseInt(cols[0]), cols[6]));
    }
}
/**
 * Adds adjective senses to wpsToID from the "adjid" cache file when
 * cacheFile() names one, otherwise from the adjid table.
 *
 * Cache columns used: 5 = wps string, 1 = synset id, 0 = word id, 6 = gloss.
 *
 * @throws SQLException if the database query fails
 * @throws IOException if the cache file cannot be read
 */
private static void loadAdjWpsToID() throws SQLException, IOException {
    String cached = cacheFile("adjid");
    if (cached.isEmpty()) {
        ResultSet rows =
                DBConnector.q("select wordpossense,synsetid, wordid, gloss from adjid");
        while (rows.next()) {
            wps sense = new wps(rows.getString(1));
            wpsToID.put(sense, new IDSubjObj(rows.getInt(2), rows.getInt(3),
                    rows.getString(4)));
        }
        return;
    }
    for (String line : new FileLines(cached)) {
        String[] cols = line.split("\t");
        wps sense = new wps(cols[5]);
        wpsToID.put(sense, new IDSubjObj(Integer.parseInt(cols[1]),
                Integer.parseInt(cols[0]), cols[6]));
    }
}
/**
 * Adds adverb senses (tag 'r') to wpsToID from the "advid" cache file when
 * cacheFile() names one, otherwise from wordnet.wn_synsets joined with
 * wordnet.wn_glosses. The word id is always stored as -1.
 *
 * Cache columns used: 2 = word, 3 = sense number, 0 = synset id, 1 = gloss.
 *
 * @throws SQLException if the database query fails
 * @throws IOException if the cache file cannot be read
 */
private static void loadAdvWpsToID() throws SQLException, IOException {
    String cached = cacheFile("advid");
    if (cached.isEmpty()) {
        String query =
                "select t1.synset_id, gloss,lower(word),sense_number "
                        + "from wordnet.wn_synsets t1, wordnet.wn_glosses t2 "
                        + "where ss_type='r' and t1.synset_id=t2.synset_id";
        ResultSet rows = DBConnector.q(query);
        while (rows.next()) {
            wps sense = new wps(rows.getString(3), 'r', rows.getInt(4));
            wpsToID.put(sense,
                    new IDSubjObj(rows.getInt(1), -1, rows.getString(2)));
        }
        return;
    }
    for (String line : new FileLines(cached)) {
        String[] cols = line.split("\t");
        wps sense = new wps(cols[2], 'r', Integer.parseInt(cols[3]));
        wpsToID.put(sense, new IDSubjObj(Integer.parseInt(cols[0]), -1, cols[1]));
    }
}
/**
 * Adds verb senses (tag 'v') to wpsToID from the "verbid" cache file when
 * cacheFile() names one, otherwise from wordnet.wn_synsets joined with
 * wordnet.wn_glosses. The word id is always stored as -1.
 *
 * Cache columns used: 2 = word, 3 = sense number, 0 = synset id, 1 = gloss.
 *
 * @throws SQLException if the database query fails
 * @throws IOException if the cache file cannot be read
 */
private static void loadVerbWpsToID() throws SQLException, IOException {
    String cached = cacheFile("verbid");
    if (cached.isEmpty()) {
        String query =
                "select t1.synset_id, gloss,lower(word),sense_number "
                        + "from wordnet.wn_synsets t1, wordnet.wn_glosses t2 "
                        + "where ss_type='v' and t1.synset_id=t2.synset_id";
        ResultSet rows = DBConnector.q(query);
        while (rows.next()) {
            wps sense = new wps(rows.getString(3), 'v', rows.getInt(4));
            wpsToID.put(sense,
                    new IDSubjObj(rows.getInt(1), -1, rows.getString(2)));
        }
        return;
    }
    for (String line : new FileLines(cached)) {
        String[] cols = line.split("\t");
        wps sense = new wps(cols[2], 'v', Integer.parseInt(cols[3]));
        wpsToID.put(sense, new IDSubjObj(Integer.parseInt(cols[0]), -1, cols[1]));
    }
}
/**
 * Fetches word, wordpossense and gloss for one word id with a direct query
 * (one database round trip per call, no caching).
 *
 * The table name is concatenated into the SQL text as is, so callers must
 * pass a trusted name.
 *
 * @param wordID word id to look up
 * @param tb table to query
 * @return the text metadata of the first matching row, or null when none
 * @throws SQLException if the query fails
 */
public static IDSubjObj getTextMetaSlowly(int wordID, String tb)
        throws SQLException {
    ResultSet row = DBConnector.q(
            "select word,wordpossense,gloss from " + tb + " where wordid="
                    + wordID);
    if (!row.next()) {
        return null;
    }
    return new IDSubjObj(row.getString("wordpossense"),
            row.getString("word"), row.getString("gloss"));
}
/**
 * Fetches word and gloss for one wordpossense value with a direct query
 * (one database round trip per call, no caching).
 *
 * For tables whose name starts with "noun", spaces in the key are replaced
 * by underscores first. Single quotes in the key are doubled for the SQL
 * literal; the table name is concatenated as is, so callers must pass a
 * trusted name.
 *
 * @param nwordpossense sense string to look up, e.g. sports car#n#1
 * @param tb table to query
 * @return the text metadata (carrying the normalized key) of the first
 *         matching row, or null when none
 * @throws SQLException if the query fails
 */
public static IDSubjObj getTextMetaSlowly(String nwordpossense, String tb)
        throws SQLException {
    boolean needsUnderscores =
            tb.startsWith("noun") && nwordpossense.contains(" ");
    String key = needsUnderscores ? nwordpossense.replaceAll(" ", "_")
            : nwordpossense;
    ResultSet row = DBConnector.q(
            "select word,gloss from " + tb + " where wordpossense='"
                    + key.replace("'", "''") + "'");
    if (!row.next()) {
        return null;
    }
    return new IDSubjObj(key, row.getString("word"), row.getString("gloss"));
}
/**
 * A word sense in "word#pos#sensenum" form, e.g. bird#n#2.
 *
 * Instances are built either from such a string, from its three parts, or
 * from a WordNet sense key. A failed parse leaves fields unset (w and/or p
 * null) instead of throwing; isInitialized() tells the two cases apart.
 * equals() and hashCode() tolerate unset fields so that such instances can
 * still be stored in hash-based collections.
 */
public static class wps {
    static final String spaceStr = " ";
    static final String _Str = "_";
    public static final String sharpStr = "#";
    // Sense key to wps: lemma%ss_type:lex_filenum:lex_id, then anything.
    // Stricter former pattern, kept for reference:
    // "^(\\p{ASCII}+)%(\\d):(\\d\\d):(\\d\\d)(:\\p{ASCII}+:(\\d\\d))?"
    private static final Pattern regex = Pattern
            .compile("(\\p{ASCII}+)%(\\d):(\\d\\d):(\\d\\d).*?");
    // sense key -> sense number; loaded on first use.
    private static Map<String, Integer> senseKeyWNSenseNum_2_1;

    public String w;
    public POS p;
    public int s;

    /**
     * Builds a wps from a WordNet sense key.
     *
     * @param senseKey
     *            : lemma %ss_type:lex_filenum:lex_id:head_word:head_id <BR>
     *            lemma: underscored (_ ) word. <BR>
     *            ss_type: 1 NOUN, 2 VERB, 3 ADJECTIVE, 4 ADVERB, 5
     *            ADJECTIVE SATELLITE <BR>
     *            lex_filenum: e.g. 20 means noun.plant <BR>
     *            lex_id: two digit integer to uniquely identify sense num
     *            (!=sensenum) <BR>
     *            head_word: Usually blank. Only present for an adj
     *            satellite synset. It is the lemma of the first word of the
     *            satellite's head synset. head_id: Usually blank. Used to
     *            identify the satellite word.
     * @param appendSatelliteColons
     *            mostly false because input is like
     *            geum_triflorum%1:20:00::
     * @throws SQLException if the sense-key table cannot be loaded
     * @throws IOException if the sense-key cache file cannot be read
     */
    public wps(String senseKey, boolean appendSatelliteColons)
            throws SQLException, IOException {
        senseKey += appendSatelliteColons ? "::" : "";
        Matcher m = regex.matcher(senseKey);
        int pid = 0;
        if (m.matches()) {
            // noah''s_flood -> noah's_flood
            w = m.group(1).replace("''", "'");
            pid = Integer.parseInt(m.group(2));
            p = getPOSFromSenseKeyNum(pid);
        }
        s = getSenseNum2_1__FromSenseKey(senseKey);
        // general%5:00:00:imprecise:00 = general%3:00:00:imprecise:00
        // Adjective and adjective-satellite keys are interchangeable: when
        // the key is unknown, retry with 3 and 5 swapped.
        if (s < 0) {
            if (pid == 5) {
                senseKey = senseKey.replace("%5:", "%3:");
            } else if (pid == 3) {
                senseKey = senseKey.replace("%3:", "%5:");
            }
            s = getSenseNum2_1__FromSenseKey(senseKey);
        }
    }

    /** Builds a wps from its parts; see constructwps for the normalization. */
    public wps(String w, char p, int s) {
        constructwps(w, p, s);
    }

    /**
     * Parses a "word#pos#sensenum" string. On any parse failure the instance
     * is left with w == null rather than throwing.
     *
     * @param wps
     *            bird#n#2
     */
    public wps(String wps) {
        try {
            String[] splitted = wps.split(sharpStr);
            String w = splitted[0].toLowerCase();
            char p = splitted[1].charAt(0);
            // Fold an ASCII upper-case tag to lower case.
            if (p >= 'A' && p <= 'Z') {
                p = (char) (p + 32);
            }
            int s = Integer.parseInt(splitted[2]);
            // construct if valid
            if (w != null && p >= 'a' && p <= 'z' && s > 0) {
                constructwps(w, p, s);
            }
        } catch (Exception e) {
            this.w = null;
        }
    }

    /** True when word, part of speech and a positive sense number are all set. */
    public boolean isInitialized() {
        return w != null && !w.isEmpty() && s > 0 && p != null;
    }

    /**
     * Extracts the tag character from a wps string: good#a#1 returns 'a'.
     *
     * @return the character after the first '#', or '_' when s is null, has
     *         no '#', or ends too soon after it
     */
    public static char getTagConst(String s) {
        if (s == null) {
            return '_';
        }
        int firstSharpIdx = s.indexOf('#');
        if (firstSharpIdx < 0) {
            return '_';
        }
        return firstSharpIdx + 2 < s.length() ? s.charAt(firstSharpIdx + 1)
                : '_';
    }

    /**
     * Two wps are equal when word, part of speech and sense number match.
     * Unset (null) fields are compared as plain values instead of throwing.
     */
    @Override
    public boolean equals(Object obj) {
        if (this == obj) {
            return true;
        }
        if (!(obj instanceof wps)) {
            return false;
        }
        final wps other = (wps) obj;
        return s == other.s && java.util.Objects.equals(w, other.w)
                && java.util.Objects.equals(p, other.p);
    }

    /**
     * Hash over word and part of speech (the sense number is left out, as
     * before). Unset fields contribute 0 instead of throwing; fully set
     * instances hash exactly as they did previously.
     */
    @Override
    public int hashCode() {
        final int PRIME = 31;
        int result = 1;
        result = PRIME * result + (w == null ? 0 : w.hashCode());
        result = PRIME * result + (p == null ? 0 : p.hashCode());
        return result;
    }

    /**
     * Renders word#tag#sensenum. Throws NullPointerException while p is
     * unset; constructwps() depends on that to reject unknown tag characters.
     */
    @Override
    public String toString() {
        return new StringBuilder(40).append(w).append('#').append(
                p.getTag()).append('#').append(s).toString();
    }

    /** Maps the ss_type digit of a sense key to a POS; unknown digits count as nouns. */
    private POS getPOSFromSenseKeyNum(int pid) {
        switch (pid) {
        case 1:
            return POS.NN;
        case 2:
            return POS.VB;
        case 3:
        case 5:
            return POS.JJ;
        case 4:
            return POS.RB;
        default:
            // Most unknowns are noun phrases.
            return POS.NN;
        }
    }

    /** Returns the sense number stored for a sense key, or -1 when the key is unknown. */
    private int getSenseNum2_1__FromSenseKey(String senseKey)
            throws SQLException, IOException {
        if (senseKeyWNSenseNum_2_1 == null) {
            loadSenseKeyWNSenseNum_2_1();
        }
        if (!senseKeyWNSenseNum_2_1.containsKey(senseKey)) {
            return -1;
        }
        return senseKeyWNSenseNum_2_1.get(senseKey);
    }

    /**
     * Loads sense key -> sense number from the "wordnet.wn_sensekeys" cache
     * file when cacheFile() names one, otherwise from the table.
     *
     * NOTE(review): the file branch reads the number from column 2 while the
     * query selects it as the second column; confirm the cache file layout.
     */
    private void loadSenseKeyWNSenseNum_2_1() throws IOException,
            SQLException {
        senseKeyWNSenseNum_2_1 = new HashMap<>();
        String tbName = "wordnet.wn_sensekeys";
        String localFile = cacheFile(tbName);
        if (!localFile.isEmpty()) {
            for (String line : new FileLines(localFile)) {
                String[] fields = line.split("\t");
                senseKeyWNSenseNum_2_1.put(fields[0], Integer
                        .parseInt(fields[2]));
            }
        } else {
            String sql =
                    "select sensekey, sensenum from wordnet.wn_sensekeys";
            ResultSet rs = DBConnector.q(sql);
            while (rs.next()) {
                senseKeyWNSenseNum_2_1.put(rs.getString(1), rs.getInt(2));
            }
        }
    }

    /**
     * Sets all three fields and normalizes the word: nouns use underscores
     * between words, every other part of speech (adjectives, verbs such as
     * "set out") uses spaces.
     */
    private void constructwps(String w, char p, int s) {
        this.w = w.toLowerCase();
        this.s = s;
        setTag(p);
        // toString() throws here when setTag() did not recognize the tag.
        char tag = wps.getTagConst(this.toString());
        if (tag == 'n' && w.contains(spaceStr)) {
            this.w = this.w.replace(' ', '_');
        } else if (tag != 'n') {
            this.w = this.w.replace('_', ' ');
        }
    }

    /** Sets p from a tag character; 's' counts as adjective, unknown characters leave p unchanged. */
    private void setTag(char pos) {
        switch (pos) {
        case 'a':
        case 's':
            p = POS.JJ;
            break;
        case 'v':
            p = POS.VB;
            break;
        case 'r':
            p = POS.RB;
            break;
        case 'n':
            p = POS.NN;
            break;
        default:
            break;
        }
    }
}
/**
 * Plain holder for what is known about one word sense: its text form
 * (wordpossense, word, gloss) and its numeric ids (synset, wordID).
 * Text parts that are not supplied are "", ids that are not supplied are -1.
 */
public static class IDSubjObj {
    public String wordpossense;
    public String word;
    public String gloss;
    public int synset;
    public int wordID;

    /** Sets every field explicitly. */
    public IDSubjObj(String wordpossense, String word, String gloss,
            int synset, int wordid) {
        this.synset = synset;
        this.wordID = wordid;
        this.wordpossense = wordpossense;
        this.word = word;
        this.gloss = gloss;
    }

    /** Ids only; all text fields are "". */
    public IDSubjObj(int synset, int wordid) {
        this(synset, wordid, "");
    }

    /** Ids plus gloss; wordpossense and word are "". */
    public IDSubjObj(int synset, int wordid, String gloss) {
        this("", "", gloss, synset, wordid);
    }

    /** Text only; both ids are -1. */
    public IDSubjObj(String wordpossense, String word, String gloss) {
        this(wordpossense, word, gloss, -1, -1);
    }
}
public static class WNWords {
// lower-cased word -> part-of-speech tags it occurs with; loaded on first use.
private static AutoMap<String, Set<String>> wnWordsNo_;

/**
 * Looks a word up in WordNet and returns the form that matched together
 * with the parts of speech it occurs with.
 *
 * The lower-cased word is tried first. Only for multi-word input (one that
 * contains a space) two variants are tried next, in this order: the words
 * joined without a separator (horse shoe -> horseshoe) and joined with
 * hyphens (horse shoe -> horse-shoe). The first element of the result is
 * always the form that was found; the hyphen case used to report the
 * separator-less form, which is known to be absent at that point.
 *
 * @param w
 *            e.g. potato
 * @return the matched form and its tags (a: adjective, v: verb, n: noun,
 *         r: adverb), or null when w is null or no form is in WordNet
 * @throws SQLException
 *             during initialization of wordnet (words, pos) map
 * @throws IOException
 *             during initialization of wordnet (words, pos) map
 */
public static Pair<String, Set<String>> inWN(String w)
        throws SQLException, IOException {
    if (wnWordsNo_ == null) {
        wnWordsNo_ = new AutoMap<>();
        loadWnWordsNo_();
    }
    if (w == null) {
        return null;
    }
    w = w.toLowerCase();
    if (wnWordsNo_.containsKey(w)) {
        return new Pair<String, Set<String>>(w, wnWordsNo_.get(w));
    }
    // Variants only make sense for multi-word input.
    if (!w.contains(" ")) {
        return null;
    }
    String joined = w.replace(" ", "");
    if (wnWordsNo_.containsKey(joined)) {
        return new Pair<String, Set<String>>(joined, wnWordsNo_.get(joined));
    }
    String hyphenated = w.replace(' ', '-');
    if (wnWordsNo_.containsKey(hyphenated)) {
        return new Pair<String, Set<String>>(hyphenated,
                wnWordsNo_.get(hyphenated));
    }
    // give up
    return null;
}
/**
 * Fills {@code wnWordsNo_} with (lower-cased word, pos) entries from
 * wordnet.wn_synsets. Reads the local cache file when {@code cacheFile}
 * returns a non-empty path (tab-separated, word in column 2, ss_type in
 * column 3, zero-based), otherwise queries the database. The ss_type "s"
 * is stored as "a".
 */
private static void loadWnWordsNo_() throws IOException, SQLException {
    String cachePath = cacheFile("wordnet.wn_synsets");
    if (cachePath.isEmpty()) {
        ResultSet rows = DBConnector
                .q("select lower(word), ss_type from wordnet.wn_synsets ");
        while (rows.next()) {
            String type = rows.getString(2);
            if (type.equals("s"))
                type = "a";
            wnWordsNo_.addSetValue(rows.getString(1), type);
        }
        return;
    }
    for (String row : new FileLines(cachePath)) {
        String[] cols = row.split("\t");
        String type = cols[3];
        if (type.equals("s"))
            type = "a";
        wnWordsNo_.addSetValue(cols[2].toLowerCase(), type);
    }
}
/** Lazily filled map: "word-pos" key to the synset ids of that word. */
private static AutoMap<String, Set<Integer>> wnWordSynsets;

/**
 * Returns the synset ids recorded for a word with a given part of speech.
 * The lookup key is the lower-cased word with spaces turned into
 * underscores, followed by "-" and {@code pos.getTag()}.
 *
 * @param w
 *            word to look up; null yields null
 * @param pos
 *            part of speech whose tag is appended to the key
 * @return the stored set, or a new empty set when the key is unknown
 * @throws SQLException
 *             while loading the map on first use
 * @throws IOException
 *             while loading the map on first use
 */
public static Set<Integer> inWN(String w, POS pos)
        throws SQLException, IOException {
    if (wnWordSynsets == null) {
        wnWordSynsets = new AutoMap<>();
        loadWnWordSynsets();
    }
    if (w == null)
        return null;
    String key = w.toLowerCase().replace(' ', '_') + "-" + pos.getTag();
    if (!wnWordSynsets.containsKey(key))
        return new HashSet<>();
    return wnWordSynsets.get(key);
}
/**
 * Fills {@code wnWordSynsets} from wordnet.wn_synsets, using the cache file
 * when {@code cacheFile} returns a non-empty path and the database
 * otherwise. Keys are "word-pos" with the word lower-cased and spaces
 * replaced by underscores; ss_type "s" is stored as "a".
 */
private static void loadWnWordSynsets() throws SQLException,
        IOException {
    String cachePath = cacheFile("wordnet.wn_synsets");
    if (cachePath.isEmpty()) {
        ResultSet rows = DBConnector
                .q("select lower(word), ss_type, synset_id from wordnet.wn_synsets ");
        while (rows.next()) {
            String type = rows.getString(2);
            if (type.equals("s"))
                type = "a";
            String key = rows.getString(1).replace(' ', '_').toLowerCase()
                    + "-" + type;
            wnWordSynsets.addSetValue(key, rows.getInt(3));
        }
        return;
    }
    for (String row : new FileLines(cachePath)) {
        // 100001740 1 entity n 1 11
        String[] cols = row.split("\t");
        String type = cols[3];
        if (type.equals("s"))
            type = "a";
        String key = cols[2].toLowerCase().replace(' ', '_') + "-" + type;
        wnWordSynsets.addSetValue(key, Integer.parseInt(cols[0]));
    }
}
/**
 * Returns the (synset id, rank score) list stored for a word.
 *
 * @param word
 *            word to look up
 * @param perPOS
 *            true to use the per-part-of-speech ranking, false to use the
 *            overall ranking
 * @return whatever the chosen ranking map holds for {@code word}
 * @throws IOException
 *             while loading the ranking on first use
 * @throws SQLException
 *             while loading the ranking on first use
 */
public static List<Pair<Integer, Double>> inWN(String word,
        boolean perPOS) throws IOException, SQLException {
    return perPOS ? inWNFreqPerPos(word) : inWNFreqOverall(word);
}
/** Lazily filled map: word to its (synset id, rank score) list, overall ranking. */
private static AutoMap<String, List<Pair<Integer, Double>>> wnFreqOverall;

/**
 * Looks up the overall ranking of a word, loading the map on first use.
 * The word is passed to the map exactly as given (no case folding here).
 */
private static List<Pair<Integer, Double>> inWNFreqOverall(String word)
        throws SQLException, IOException {
    boolean firstUse = (wnFreqOverall == null);
    if (firstUse) {
        wnFreqOverall = new AutoMap<>();
        loadWnFreqOverall();
    }
    return wnFreqOverall.get(word);
}
/**
 * Loads the overall ranking: from the database when {@code cacheFile}
 * returns an empty path for "wordfreq_overall", from that file otherwise.
 */
private static void loadWnFreqOverall() throws NumberFormatException,
        IOException, SQLException {
    String cachePath = cacheFile("wordfreq_overall");
    if (cachePath.isEmpty()) {
        loadWnFreqOverallFromDB();
    } else {
        loadWnFreqOverallFromFile(cachePath);
    }
}
/**
 * Builds the overall ranking from the database. Rows come back ordered by
 * word, so consecutive rows with the same word form one group; within a
 * group the rows are numbered 1, 2, 3 ... and each synset id is stored
 * with {@code computeRankScore} of its number. A finished group is written
 * to {@code wnFreqOverall} when the next word starts, and the last group
 * after the loop.
 */
private static void loadWnFreqOverallFromDB() throws SQLException {
    List<Pair<Integer, Double>> scores = new ArrayList<>();
    String groupWord = "";
    int position = 1;
    ResultSet rows = DBConnector
            .q("select synset_id,lower(word),ss_type,sense_number,tag_count from wordnet.wn_synsets order by lower(word), tag_count desc, ss_type, sense_number");
    // 100945401 search n 1 14
    while (rows.next()) {
        String word = rows.getString(2).toLowerCase();
        if (groupWord.isEmpty())
            groupWord = word;
        if (!word.equals(groupWord)) {
            // a new word starts: store the finished group, begin a fresh one
            if (!scores.isEmpty())
                wnFreqOverall.put(groupWord, scores);
            scores = new ArrayList<>();
            groupWord = word;
            position = 1;
        }
        scores.add(new Pair<Integer, Double>(rows.getInt(1),
                computeRankScore(position)));
        position++;
    }
    wnFreqOverall.put(groupWord, scores);
}
/**
 * Builds the overall ranking from a tab-separated cache file (synset id in
 * column 0, word in column 1). Consecutive lines with the same word form
 * one group; within a group the lines are numbered 1, 2, 3 ... and each
 * synset id is stored with {@code computeRankScore} of its number.
 * <p>
 * The word is lower-cased before it is used as a key, so this path yields
 * the same keys as the database path (which selects {@code lower(word)})
 * and as the per-POS file loader.
 *
 * @param localFile
 *            path of the cache file
 * @throws NumberFormatException
 *             if column 0 of a line is not an integer
 * @throws IOException
 *             declared for reading the file
 */
private static void loadWnFreqOverallFromFile(String localFile)
        throws NumberFormatException, IOException {
    int rank = 1;
    String currentWnWord = "";
    String wnWord;
    List<Pair<Integer, Double>> rankingScore = new ArrayList<>();
    String[] fields;
    for (String line : new FileLines(localFile)) {
        fields = line.split("\t");
        // fold case so mixed-case file content cannot split or hide a group
        wnWord = fields[1].toLowerCase();
        if (currentWnWord.isEmpty())
            currentWnWord = wnWord;
        if (!wnWord.equals(currentWnWord)) {
            // a new word starts: store the finished group, begin a fresh one
            rank = 1;
            if (!rankingScore.isEmpty())
                wnFreqOverall.put(currentWnWord, rankingScore);
            rankingScore = new ArrayList<>();
            currentWnWord = wnWord;
        }
        rankingScore.add(new Pair<Integer, Double>(Integer
                .parseInt(fields[0]), computeRankScore(rank)));
        rank++;
    }
    wnFreqOverall.put(currentWnWord, rankingScore);
}
/** Lazily filled map: word to its (synset id, rank score) list, ranked per part of speech. */
private static AutoMap<String, List<Pair<Integer, Double>>> wnFreqPerPos;

/**
 * Looks up the per-part-of-speech ranking of a word, loading the map on
 * first use. The word is passed to the map exactly as given.
 */
private static List<Pair<Integer, Double>> inWNFreqPerPos(String word)
        throws IOException, SQLException {
    boolean firstUse = (wnFreqPerPos == null);
    if (firstUse) {
        wnFreqPerPos = new AutoMap<>();
        loadWNFreqPerPos();
    }
    return wnFreqPerPos.get(word);
}
/**
 * Loads the per-part-of-speech ranking: from the database when
 * {@code cacheFile} returns an empty path for "wordfreq_perpos", from that
 * file otherwise.
 */
private static void loadWNFreqPerPos() throws IOException, SQLException {
    String cachePath = cacheFile("wordfreq_perpos");
    if (cachePath.isEmpty()) {
        loadWNFreqPerPosFromDB();
    } else {
        loadWNFreqPerPosFromFile(cachePath);
    }
}
/**
 * Builds the per-part-of-speech ranking from the database. Rows arrive
 * ordered by word and then by ss_type. All rows of one word go into one
 * list, but the running number restarts at 1 whenever the word or the
 * ss_type changes, so every part of speech is ranked on its own. Each
 * synset id is stored with {@code computeRankScore} of its number.
 */
private static void loadWNFreqPerPosFromDB() throws SQLException {
    List<Pair<Integer, Double>> scores = new ArrayList<>();
    String groupWord = "";
    String groupPos = "";
    int position = 1;
    ResultSet rows = DBConnector
            .q("select synset_id,lower(word),ss_type,sense_number,tag_count from wordnet.wn_synsets order by lower(word), ss_type, tag_count desc, sense_number");
    // 0 n 1 20
    while (rows.next()) {
        int synsetId = rows.getInt(1);
        String word = rows.getString(2).toLowerCase();
        String posTag = rows.getString(3);
        if (groupWord.isEmpty())
            groupWord = word;
        if (groupPos.isEmpty())
            groupPos = posTag;
        boolean wordChanged = !word.equals(groupWord);
        boolean posChanged = !posTag.equals(groupPos);
        if (wordChanged || posChanged) {
            position = 1;
            if (wordChanged && !scores.isEmpty()) {
                // the word is complete: store its list and start the next
                wnFreqPerPos.put(groupWord, scores);
                scores = new ArrayList<>();
                groupWord = word;
            }
            groupPos = posTag;
        }
        scores.add(new Pair<Integer, Double>(synsetId,
                computeRankScore(position)));
        position++;
    }
    wnFreqPerPos.put(groupWord, scores);
}
/**
 * Builds the per-part-of-speech ranking from a tab-separated cache file
 * laid out as {@code synset_id, word, ss_type, ...} (e.g.
 * "100945401 search n 1 14"). All lines of one word go into one list, but
 * the running number restarts at 1 whenever the word or the ss_type
 * changes, so every part of speech is ranked on its own. Each synset id is
 * stored with {@code computeRankScore} of its number.
 * <p>
 * The part of speech is taken from column 2. Reading it from column 1 (the
 * word) would make the tag change only together with the word, so the
 * number would never restart between the parts of speech of one word.
 *
 * @param localFile
 *            path of the cache file
 * @throws NumberFormatException
 *             if column 0 of a line is not an integer
 * @throws IOException
 *             declared for reading the file
 */
private static void loadWNFreqPerPosFromFile(String localFile)
        throws NumberFormatException, IOException {
    int rank = 1;
    String currentWnWord = "";
    String currentPosTag = "";
    String wnWord, wnPosTag;
    int synset;
    List<Pair<Integer, Double>> rankingScore = new ArrayList<>();
    String[] fields;
    // 100945401 search n 1 14
    for (String line : new FileLines(localFile)) {
        fields = line.split("\t");
        synset = Integer.parseInt(fields[0]);
        wnWord = fields[1].toLowerCase();
        // ss_type column, matching column 3 of the database query
        wnPosTag = fields[2];
        if (currentWnWord.isEmpty())
            currentWnWord = wnWord;
        if (currentPosTag.isEmpty())
            currentPosTag = wnPosTag;
        if (!wnWord.equals(currentWnWord)
                || !wnPosTag.equals(currentPosTag)) {
            rank = 1;
            if (!rankingScore.isEmpty()
                    && !wnWord.equals(currentWnWord)) {
                // the word is complete: store its list and start the next
                wnFreqPerPos.put(currentWnWord, rankingScore);
                rankingScore = new ArrayList<>();
                currentWnWord = wnWord;
            }
            if (!wnPosTag.equals(currentPosTag))
                currentPosTag = wnPosTag;
        }
        rankingScore.add(new Pair<Integer, Double>(synset,
                computeRankScore(rank)));
        rank++;
    }
    wnFreqPerPos.put(currentWnWord, rankingScore);
}
/**
 * Converts a rank into a score of {@code 1 / (rank + 2)}, so rank 1 gives
 * one third and larger ranks give smaller scores.
 */
private static Double computeRankScore(int rank) {
    double shifted = rank + 2.0;
    return 1.0 / shifted;
}
/** Lazily filled map: word to the set of type letters recorded for it. */
private static AutoMap<String, Set<Character>> wnWordTypes;

/**
 * Returns the type letters stored for a word, loading the map on first
 * use. The word is lower-cased before the lookup.
 *
 * @param w
 *            word to look up; null yields null
 * @return the stored set, or null when the word has no entry
 * @throws IOException
 *             while loading the map on first use
 * @throws SQLException
 *             while loading the map on first use
 */
public static Set<Character> getWNWordTypes(String w)
        throws IOException, SQLException {
    if (wnWordTypes == null) {
        wnWordTypes = new AutoMap<>();
        loadWNTypes();
    }
    if (w == null)
        return null;
    String key = w.toLowerCase();
    return wnWordTypes.containsKey(key) ? wnWordTypes.get(key) : null;
}
/**
 * Fills {@code wnWordTypes} from ngram.wntypes_tree, using the cache file
 * when {@code cacheFile} returns a non-empty path and the database
 * otherwise. The key is the word (column w) with underscores turned into
 * spaces; the value added is the first character of the type column.
 * NOTE(review): keys are stored without case folding while the lookup
 * lower-cases its argument - confirm the source data is lower case.
 */
private static void loadWNTypes() throws SQLException, IOException {
    String cachePath = cacheFile("ngram.wntypes_tree");
    if (cachePath.isEmpty()) {
        ResultSet rows = DBConnector
                .q("select w,s,wps,synsetid,type from ngram.wntypes_tree");
        while (rows.next()) {
            String word = rows.getString(1).replace('_', ' ');
            wnWordTypes.addSetValue(word, rows.getString(5).charAt(0));
        }
        return;
    }
    for (String row : new FileLines(cachePath)) {
        String[] cols = row.split("\t");
        String word = cols[0].replace('_', ' ');
        wnWordTypes.addSetValue(word, cols[4].charAt(0));
    }
}
/** Lazily filled map: word-pos-sense string to its type letter. */
private static AutoMap<String, Character> wnWpsTypes;

/**
 * Get WordNet noun type {p=physical, a=abstract, i=instance} of a
 * word-pos-sense string. The argument is lower-cased; if that form is not
 * in the map, the form with underscores replaced by spaces is tried.
 *
 * @param wps
 *            word-pos-sense string; null yields null
 * @return the stored type letter, or null when neither form is known
 * @throws IOException
 *             while loading the map on first use
 * @throws SQLException
 *             while loading the map on first use
 */
public static Character getWNWpsTypes(String wps) throws IOException,
        SQLException {
    if (wnWpsTypes == null) {
        wnWpsTypes = new AutoMap<>();
        loadWNWpsTypes();
    }
    if (wps == null)
        return null;
    String key = wps.toLowerCase();
    if (!wnWpsTypes.containsKey(key)) {
        key = key.replace('_', ' ');
        if (!wnWpsTypes.containsKey(key))
            return null;
    }
    return wnWpsTypes.get(key);
}
/**
 * Fills {@code wnWpsTypes} from ngram.wntypes_tree, using the cache file
 * when {@code cacheFile} returns a non-empty path and the database
 * otherwise. The key is the wps column with underscores turned into
 * spaces; the value is the first character of the type column.
 */
private static void loadWNWpsTypes() throws IOException, SQLException {
    String cachePath = cacheFile("ngram.wntypes_tree");
    if (cachePath.isEmpty()) {
        ResultSet rows = DBConnector
                .q("select w,s,wps,synsetid,type from ngram.wntypes_tree");
        while (rows.next()) {
            String key = rows.getString(3).replace('_', ' ');
            wnWpsTypes.put(key, rows.getString(5).charAt(0));
        }
        return;
    }
    for (String row : new FileLines(cachePath)) {
        String[] cols = row.split("\t");
        String key = cols[2].replace('_', ' ');
        wnWpsTypes.put(key, cols[4].charAt(0));
    }
}
/** Lazily filled set of the entries read from the "freq-phy-nouns" cache file. */
private static Set<String> wnFreqPhysical;

/**
 * Tells whether the lower-cased word is listed in the "freq-phy-nouns"
 * cache file. The file is read once, one entry per line; lines starting
 * with '#' are skipped. When {@code cacheFile} returns an empty path the
 * set stays empty.
 *
 * @param w
 *            word to test; null yields false
 * @return true if the lower-cased word is in the set
 * @throws IOException
 *             while reading the file on first use
 */
public static boolean isFreqPhysical(String w) throws IOException {
    if (wnFreqPhysical == null) {
        wnFreqPhysical = new HashSet<>();
        String cachePath = cacheFile("freq-phy-nouns");
        if (!cachePath.isEmpty()) {
            for (String entry : new FileLines(cachePath)) {
                if (!entry.startsWith("#"))
                    wnFreqPhysical.add(entry);
            }
        }
    }
    return w != null && wnFreqPhysical.contains(w.toLowerCase());
}
/** Lazily filled map: synset id to a word-pos-sense string. */
private static Map<Integer, String> wnCommonWps;

/**
 * Returns the word-pos-sense string stored for a synset id, loading the
 * map on first use.
 *
 * @param synset
 *            synset id
 * @return the stored string, or "" when the id has no entry
 * @throws IOException
 *             while loading the map on first use
 * @throws SQLException
 *             while loading the map on first use
 */
public static String getCommonWpsOfSynset(int synset)
        throws IOException, SQLException {
    if (wnCommonWps == null) {
        wnCommonWps = new HashMap<>();
        loadCommonWpsOfSynset();
    }
    if (!wnCommonWps.containsKey(synset))
        return "";
    return wnCommonWps.get(synset);
}
/**
 * Fills {@code wnCommonWps} from wordnet.wn_syn, using the cache file when
 * {@code cacheFile} returns a non-empty path and the database otherwise.
 * For every row the synset id and the word (spaces turned into
 * underscores) are resolved through {@code getwps}; when that yields a
 * result, its string form is stored under the synset id. A later row for
 * the same id replaces the earlier entry.
 * <p>
 * Both branches now behave alike: the file branch no longer prints a
 * hard-coded debug line to standard output for one particular synset id.
 *
 * @throws SQLException
 *             from the database query
 * @throws IOException
 *             declared for reading the cache file
 * @throws NumberFormatException
 *             if column 0 of a cache line is not an integer
 */
private static void loadCommonWpsOfSynset() throws SQLException,
        IOException {
    String tbName = "wordnet.wn_syn";
    String localFile = cacheFile(tbName);
    if (!localFile.isEmpty()) {
        for (String line : new FileLines(localFile)) {
            String[] columns = line.split("\t");
            int synsetId = Integer.parseInt(columns[0]);
            wps common = getwps(synsetId, columns[1].replace(' ', '_'));
            if (common != null)
                wnCommonWps.put(synsetId, common.toString());
        }
    } else {
        ResultSet rs =
                DBConnector
                        .q("select synset_id,word,tag_count from wordnet.wn_syn");
        while (rs.next()) {
            int synsetId = rs.getInt(1);
            wps common =
                    getwps(synsetId, rs.getString(2).replace(' ', '_'));
            if (common != null)
                wnCommonWps.put(synsetId, common.toString());
        }
    }
}
/**
 * Check if w is an instance in WN, for example Texas, Los Angeles. That is
 * the case when the type set of the word holds exactly one letter and that
 * letter is 'i'.
 *
 * @param w
 *            word to test
 * @return true if w is an instance in WN; false otherwise, also when the
 *         word is unknown or the lookup fails with an IOException or
 *         SQLException
 */
public static boolean isAnInstance(String w) {
    try {
        Set<Character> types = IDHelper.WNWords.getWNWordTypes(w);
        return types != null && types.size() == 1 && types.contains('i');
    } catch (IOException | SQLException ignored) {
        // best effort: a failed lookup counts as "not an instance"
        return false;
    }
}
/**
 * Check if w is physical and not an instance in WN: its type set must
 * contain 'p' and must not contain 'i'.
 *
 * @param w
 *            word to test
 * @return true if the type set has 'p' but no 'i'; false otherwise, also
 *         when the word is unknown or the lookup fails with an IOException
 *         or SQLException
 */
public static boolean isPhyAndNotAnInstance(String w) {
    try {
        Set<Character> types = IDHelper.WNWords.getWNWordTypes(w);
        return types != null && !types.contains('i') && types.contains('p');
    } catch (IOException | SQLException ignored) {
        // best effort: a failed lookup counts as "no"
        return false;
    }
}
/** Lazily filled map: word to the domain labels collected for it. */
private static AutoMap<String, Set<String>> wnDomain;

/**
 * Obtain the WordNet domain labels of a particular word. The word is
 * lower-cased before the lookup.
 *
 * @param w
 *            e.g. cut; null yields null
 * @return what the domain map holds for the word, e.g. n_artifact
 * @throws SQLException
 *             while loading the map on first use
 * @throws IOException
 *             while loading the map on first use
 */
public static Set<String> getWNWordDomain(String w)
        throws SQLException, IOException {
    if (wnDomain == null) {
        wnDomain = new AutoMap<>();
        loadWNWordDomain();
    }
    return (w == null) ? null : wnDomain.get(w.toLowerCase());
}
/**
 * Fills {@code wnDomain} from wordnet.wn_domain, using the cache file when
 * {@code cacheFile} returns a non-empty path and the database otherwise.
 * For each row a label is built with {@code constructTops} from the part
 * of speech of the synset id, the word column and the tops column; that
 * label is then added for every member returned by {@code getwps} for the
 * synset, keyed by the member's word with underscores turned into spaces.
 * <p>
 * In the file branch a missing tops column is read as "".
 * {@code String.split} drops trailing empty fields, so a line whose last
 * column is empty has fewer than seven fields.
 *
 * @throws SQLException
 *             from the database queries
 * @throws IOException
 *             declared for reading the cache file
 */
private static void loadWNWordDomain() throws SQLException, IOException {
    String tbName = "wordnet.wn_domain";
    String localFile = cacheFile(tbName);
    if (!localFile.isEmpty()) {
        initAllTops(localFile);
        // 100055793 absence without leave military 0.000123673
        // military t act
        for (String line : new FileLines(localFile)) {
            String[] fields = line.split("\t");
            char pos = getPosFromSynset(fields[0]);
            // tops is the last column and may be absent after the split
            String rawTops = fields.length > 6 ? fields[6] : "";
            String tops =
                    constructTops(pos, fields[1].trim(), rawTops.trim());
            for (wps member : getwps(Integer.parseInt(fields[0])))
                wnDomain.addSetValue(member.w.replace('_', ' '), tops);
        }
    } else {
        initAllTops("");
        ResultSet rs =
                DBConnector
                        .q("select synset_id,word,prob_domain,prob_score,orig_domain,orig_classtype,tops from wordnet.wn_domain ");
        while (rs.next()) {
            char pos = getPosFromSynset(rs.getString(1));
            String tops =
                    constructTops(pos, rs.getString(2), rs.getString(7));
            for (wps member : getwps(rs.getInt(1)))
                wnDomain.addSetValue(member.w.replace('_', ' '), tops);
        }
    }
}
/** Lazily filled map: synset id to the domain labels collected for it. */
private static AutoMap<Integer, Set<String>> wnDomainOfSynset;
/** Every non-empty value of the tops column; filled once by initAllTops. */
private static Set<String> allTops;

/**
 * Collects all non-empty tops values of wordnet.wn_domain into
 * {@code allTops}. Does nothing when the set already exists.
 * <p>
 * The values are gathered in a local set that is published only after the
 * whole source has been read, so a failure part-way through leaves
 * {@code allTops} unset and the next call starts over. In the file branch
 * a missing tops column ({@code String.split} drops trailing empty fields)
 * is treated as empty, and values are stored trimmed because the file
 * loaders look words up in this set after trimming them.
 *
 * @param path
 *            cache file to read, or "" to query the database
 * @throws SQLException
 *             from the database query
 * @throws FileNotFoundException
 *             declared for opening the cache file
 */
private static void initAllTops(String path) throws SQLException,
        FileNotFoundException {
    if (allTops != null)
        return;
    Set<String> collected = new HashSet<>();
    if (path.isEmpty()) {
        ResultSet rs =
                DBConnector
                        .q("select distinct(tops) from wordnet.wn_domain ");
        while (rs.next()) {
            String tops = rs.getString(1);
            if (!tops.isEmpty())
                collected.add(tops);
        }
    } else {
        for (String line : new FileLines(path)) {
            String[] fields = line.split("\t");
            String tops = fields.length > 6 ? fields[6].trim() : "";
            if (!tops.isEmpty())
                collected.add(tops);
        }
    }
    allTops = collected;
}
/**
 * Obtain the WordNet domain labels of a particular synset, loading the map
 * on first use.
 *
 * @param synsetId
 *            synset id to look up
 * @return what the synset-domain map holds for the id, e.g. n_artifact
 * @throws SQLException
 *             while loading the map on first use
 * @throws IOException
 *             while loading the map on first use
 */
public static Set<String> getWNSynsetDomain(int synsetId)
        throws SQLException, IOException {
    boolean firstUse = (wnDomainOfSynset == null);
    if (firstUse) {
        wnDomainOfSynset = new AutoMap<>();
        loadWNSynsetDomain();
    }
    return wnDomainOfSynset.get(synsetId);
}
/**
 * Fills {@code wnDomainOfSynset} from wordnet.wn_domain, using the cache
 * file when {@code cacheFile} returns a non-empty path and the database
 * otherwise. For each row a label is built with {@code constructTops} from
 * the part of speech of the synset id, the word column and the tops
 * column, and added under the synset id.
 * <p>
 * In the file branch a missing tops column is read as "".
 * {@code String.split} drops trailing empty fields, so a line whose last
 * column is empty has fewer than seven fields.
 *
 * @throws SQLException
 *             from the database queries
 * @throws IOException
 *             declared for reading the cache file
 */
private static void loadWNSynsetDomain() throws SQLException,
        IOException {
    String tbName = "wordnet.wn_domain";
    String localFile = cacheFile(tbName);
    if (!localFile.isEmpty()) {
        initAllTops(localFile);
        // 100055793 absence without leave military 0.000123673
        // military t act
        for (String line : new FileLines(localFile)) {
            String[] fields = line.split("\t");
            char pos = getPosFromSynset(fields[0]);
            // tops is the last column and may be absent after the split
            String rawTops = fields.length > 6 ? fields[6] : "";
            String tops =
                    constructTops(pos, fields[1].trim(), rawTops.trim());
            wnDomainOfSynset.addSetValue(Integer.parseInt(fields[0]),
                    tops);
        }
    } else {
        initAllTops("");
        ResultSet rs =
                DBConnector
                        .q("select synset_id,word,prob_domain,prob_score,orig_domain,orig_classtype,tops from wordnet.wn_domain ");
        while (rs.next()) {
            char pos = getPosFromSynset(rs.getString(1));
            String tops =
                    constructTops(pos, rs.getString(2), rs.getString(7));
            wnDomainOfSynset.addSetValue(rs.getInt(1), tops);
        }
    }
}
/** Lazily filled map: synset id to the id of its basic synset. */
private static AutoMap<Integer, Integer> wnBasicConcepts;

/**
 * Obtain basic synset of a given synset. For example 101488038 (whale
 * shark) to 101482330 (shark).
 *
 * @param synset_id
 *            synset id to map
 * @return the mapped basic synset id, or {@code synset_id} itself when the
 *         map has no entry for it
 * @throws SQLException
 *             while loading the map on first use
 * @throws IOException
 *             while loading the map on first use
 */
public static int getBasicSynset(int synset_id) throws SQLException,
        IOException {
    if (wnBasicConcepts == null) {
        wnBasicConcepts = new AutoMap<>();
        loadWNBasicConcepts();
    }
    if (!wnBasicConcepts.containsKey(synset_id))
        return synset_id;
    return wnBasicConcepts.get(synset_id);
}
/**
 * Fills {@code wnBasicConcepts} from temp.basicconcepts, using the cache
 * file when {@code cacheFile} returns a non-empty path and the database
 * otherwise. The file branch maps column 0 to column 2 (zero-based); the
 * database branch maps synset to basic_synset.
 * NOTE(review): the cache file presumably carries an extra column between
 * the two ids - confirm against the dump format.
 */
private static void loadWNBasicConcepts() throws IOException,
        SQLException {
    String cachePath = cacheFile("temp.basicconcepts");
    if (cachePath.isEmpty()) {
        ResultSet rows = DBConnector
                .q("select synset,basic_synset from temp.basicconcepts");
        while (rows.next()) {
            wnBasicConcepts.put(rows.getInt(1), rows.getInt(2));
        }
        return;
    }
    for (String row : new FileLines(cachePath)) {
        String[] cols = row.split("\t");
        wnBasicConcepts.put(Integer.parseInt(cols[0]),
                Integer.parseInt(cols[2]));
    }
}
/**
 * Resolves a word-pos-sense string to its synset via {@code idMeta}, maps
 * that synset to its basic synset and returns the word-pos-sense string
 * stored for the basic synset.
 *
 * @param wps
 *            word-pos-sense string
 * @return the string stored for the basic synset, or "" when the resolved
 *         synset id is not positive
 * @throws SQLException
 *             from the underlying lookups
 * @throws IOException
 *             from the underlying lookups
 */
public static String getBasicConcept(String wps) throws SQLException,
        IOException {
    int synset = idMeta(new wps(wps)).synset;
    if (synset <= 0)
        return "";
    return WNWords.getCommonWpsOfSynset(getBasicSynset(synset));
}
/** Lazily filled map: synset id to its full wn_domain record. */
private static AutoMap<Integer, WNDomains> wnDomains;

/**
 * Obtain the complete wn_domain record of a given synset, loading the map
 * on first use.
 *
 * @param synset_id
 *            synset id to look up
 * @return the stored record, or a new {@code WNDomains} built from the id
 *         alone when the map has no entry for it
 * @throws SQLException
 *             while loading the map on first use
 * @throws IOException
 *             while loading the map on first use
 */
public static WNDomains getCompleteWNDomains(int synset_id)
        throws SQLException, IOException {
    if (wnDomains == null) {
        wnDomains = new AutoMap<>();
        loadCompleteWNDomains();
    }
    if (!wnDomains.containsKey(synset_id))
        return new WNDomains(synset_id);
    return wnDomains.get(synset_id);
}
/**
 * Fills {@code wnDomains} with one {@code WNDomains} record per row of
 * wordnet.wn_domain, using the cache file when {@code cacheFile} returns a
 * non-empty path and the database otherwise. Columns, in order: synset_id,
 * word, prob_domain, prob_score, orig_domain, orig_classtype, tops.
 * <p>
 * In the file branch the last three columns are read as "" when absent.
 * {@code String.split} drops trailing empty fields, so a line ending in
 * empty columns has fewer than seven fields.
 *
 * @throws IOException
 *             declared for reading the cache file
 * @throws SQLException
 *             from the database query
 * @throws NumberFormatException
 *             if the id or the score of a cache line cannot be parsed
 */
private static void loadCompleteWNDomains() throws IOException,
        SQLException {
    String tbName = "wordnet.wn_domain";
    String localFile = cacheFile(tbName);
    if (!localFile.isEmpty()) {
        for (String line : new FileLines(localFile)) {
            String[] fields = line.split("\t");
            int synset = Integer.parseInt(fields[0]);
            // trailing columns may have been dropped by the split
            String origDomain = fields.length > 4 ? fields[4] : "";
            String origClassType = fields.length > 5 ? fields[5] : "";
            String tops = fields.length > 6 ? fields[6] : "";
            wnDomains.put(synset, new WNDomains(synset, fields[1],
                    fields[2], Double.parseDouble(fields[3]), origDomain,
                    origClassType, tops));
        }
    } else {
        ResultSet rs =
                DBConnector
                        .q("select synset_id,word,prob_domain,prob_score,orig_domain,orig_classtype,tops from wordnet.wn_domain");
        while (rs.next()) {
            int synset = rs.getInt("synset_id");
            wnDomains.put(synset, new WNDomains(synset,
                    rs.getString("word"), rs.getString("prob_domain"),
                    rs.getDouble("prob_score"),
                    rs.getString("orig_domain"),
                    rs.getString("orig_classtype"), rs.getString("tops")));
        }
    }
}
/**
 * Returns the complete wn_domain record for the synset that
 * {@code IDHelper.idMeta} reports for the given word-pos-sense.
 *
 * @param wps
 *            word-pos-sense to resolve
 * @return the record for the resolved synset id
 * @throws SQLException
 *             from the underlying lookups
 * @throws IOException
 *             from the underlying lookups
 */
public static WNDomains getWNDomains(wps wps) throws SQLException,
        IOException {
    int synsetId = IDHelper.idMeta(wps).synset;
    return getCompleteWNDomains(synsetId);
}
/**
 * Looks up the gloss of every word-pos-sense in a delimited list and joins
 * the glosses in the same order.
 * <p>
 * Exactly one output delimiter is written between consecutive glosses,
 * including after empty ones, so the i-th piece of the result always
 * belongs to the i-th input entry. Deciding by "is the output still
 * empty" instead would drop the delimiter after leading empty glosses.
 *
 * @param wpsList
 *            word-pos-sense strings separated by {@code inputDelimiter}
 * @param inputDelimiter
 *            separator of the input; passed to {@code String.split}, hence
 *            interpreted as a regular expression
 * @param outputDelimiter
 *            separator written between the glosses
 * @return the joined glosses
 */
public static String getGlosses(String wpsList, String inputDelimiter,
        String outputDelimiter) {
    StringBuilder output = new StringBuilder();
    boolean first = true;
    for (String wps : wpsList.split(inputDelimiter)) {
        if (!first)
            output.append(outputDelimiter);
        output.append(getGloss(wps));
        first = false;
    }
    return output.toString();
}
/**
 * Returns the gloss that {@code idMeta} reports for a word-pos-sense
 * string.
 *
 * @param wps
 *            word-pos-sense string
 * @return the gloss, or "" when the lookup throws any exception
 */
public static String getGloss(String wps) {
    String gloss;
    try {
        gloss = idMeta(new wps(wps)).gloss;
    } catch (Exception e) {
        // any failure is reported as "no gloss"
        gloss = "";
    }
    return gloss;
}
/**
 * Derives the part-of-speech letter from the first digit of a synset id
 * given as text: '1' is n, '2' is v, '3' is a, '4' is r.
 *
 * @param synset
 *            synset id as a string, e.g. "100001740"
 * @return the part-of-speech letter, or a blank (' ') when the id is null,
 *         empty or starts with any other character
 */
public static char getPosFromSynset(String synset) {
    // no first character to inspect: same answer as for an unknown digit
    if (synset == null || synset.isEmpty())
        return ' ';
    switch (synset.charAt(0)) {
    case '1':
        return 'n';
    case '2':
        return 'v';
    case '3':
        return 'a';
    case '4':
        return 'r';
    default:
        return ' ';
    }
}
/**
 * Builds the domain label of a row.
 * <ul>
 * <li>tops blank, pos 'n' and the word itself listed in {@code allTops}:
 * {@code pos + "_" + word}</li>
 * <li>tops not blank and pos 'n' or 'v': {@code pos + "_" + tops}</li>
 * <li>otherwise: the pos letter alone, trimmed (so a blank pos gives
 * "")</li>
 * </ul>
 *
 * @param pos
 *            part-of-speech letter
 * @param word
 *            word of the row
 * @param tops
 *            tops value of the row, possibly blank
 * @return the label as described above
 */
public static String constructTops(char pos, String word, String tops) {
    boolean blankTops = tops.trim().isEmpty();
    if (blankTops) {
        if (pos == 'n' && allTops.contains(word))
            return pos + "_" + word;
    } else if (pos == 'n' || pos == 'v') {
        return pos + "_" + tops;
    }
    return String.valueOf(pos).trim();
}
}
/**
 * Queries wordnet.wn_synsets for every word together with the
 * comma-separated list of its synset ids. Usage:
 *
 * <pre>
 * ResultSet rs = IDHelper.loadWordSynsetIDs();
 * while (rs.next()) {
 *     System.out.println(rs.getString(1) + " -&gt; " + rs.getString(2));
 * }
 * </pre>
 *
 * @return the result set handed back by {@code DBConnector.q}
 * @throws SQLException
 *             from the query
 */
public static ResultSet loadWordSynsetIDs() throws SQLException {
    return DBConnector
            .q("select word,string_agg(''||synset_id,',') from wordnet.wn_synsets group by word");
}
}