Permalink
Cannot retrieve contributors at this time
Name already in use
A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
HowToKB/src/kb/howtokb/clustering/sim/StringSimilarity.java
Go to fileThis commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
224 lines (200 sloc)
6.15 KB
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package kb.howtokb.clustering.sim; | |
import java.io.IOException; | |
import java.sql.SQLException; | |
import java.util.ArrayList; | |
import java.util.List; | |
import kb.howtokb.clustering.sim.w2v.Word2VecRunner; | |
import kb.howtokb.tools.NormalizationText; | |
public class StringSimilarity { | |
// private static ILexicalDatabase db = new NictWordNet(); | |
// | |
// //Similarity using wordnet: WUP measure | |
// public static double simOfWord( String word1, String word2 ) { | |
// WS4JConfiguration.getInstance().setMFS(true); | |
// | |
// double s = new WuPalmer(db).calcRelatednessOfWords(word1, word2); | |
// if (s >=1) return 1; | |
// return s; | |
// } | |
//Similarity using database/word2vec | |
public static double simOfVerb( String w1, String w2 ) throws IOException, SQLException { | |
//Lookup db | |
// if (w1.equals(w2)) return 1.0; | |
// ResultSet rs = | |
// DBConnector.q("select sim from sim.v2v where (w1='" + w1 + "' and w2='" + w2 + "') or "+ | |
// "(w1='"+w2 + "' and w2='"+w1 + "') limit 1"); | |
// if (rs.next()){ | |
// double sim = rs.getDouble(1); | |
// return sim; | |
// } | |
try { | |
return Word2VecRunner.simVerbs(w1,w2); | |
}catch(Exception e){ | |
return 0.0; | |
} | |
} | |
//Similarity using database/word2vec | |
public static double simOfNoun( String w1, String w2 ) throws IOException, SQLException { | |
// //Lookup db | |
// if (w1.equals(w2)) return 1.0; | |
// ResultSet rs = | |
// DBConnector.q("select p_sim from sim.n2n where (w1='" + w1 + "' and w2='" + w2 + "') or "+ | |
// "(w1='"+w2 + "' and w2='"+w1 + "') limit 1"); | |
// if (rs.next()){ | |
// double sim = rs.getDouble(1); | |
// return sim; | |
// } | |
// return 0.0; | |
try { | |
return Word2VecRunner.simNouns(w1,w2); | |
}catch(Exception e){ | |
return 0.0; | |
} | |
} | |
//Similar using word2vec without knowing POS | |
public static double simOfWord(String w1, String w2){ | |
try { | |
return Word2VecRunner.getSim().simWithoutPOS(w1, w2); | |
}catch(Exception e){ | |
return 0.0; | |
} | |
} | |
//Similarity using between two activity surfaces | |
public static double simOfPairW2V( String w1, String w2 ) throws Exception { | |
return Word2VecRunner.simPair(w1, w2); | |
} | |
//Similarity of two list: weight jaccard | |
public static double simOfListWord(List<String> l1, List<String> l2) throws IOException, SQLException{ | |
if (l1.size() == 0 || l2.size() == 0) return 0; | |
else{ | |
double sim = 0; | |
double total = 0; | |
for (int i=0; i<l1.size(); i++){ | |
for (int j=0; j<l2.size(); j++){ | |
sim += simOfWord(l1.get(i), l2.get(j)); | |
total++; | |
} | |
} | |
return sim/total; | |
} | |
} | |
//Similarity of two list of activity surfaces: weight jaccard | |
public static double simOfListActivity(List<String> l1, List<String> l2) throws Exception{ | |
if (l1.size() == 0 || l2.size() == 0) return 0; | |
else{ | |
double sim = 0; | |
double total = 0; | |
for (int i=0; i<l1.size(); i++){ | |
for (int j=0; j<l2.size(); j++){ | |
sim += simOfPairW2V(l1.get(i), l2.get(j)); | |
total++; | |
} | |
} | |
return sim/total; | |
} | |
} | |
//Jaccard | |
//Input: two arrays of noun | |
public static double simOfListWord(String[] s1, String[] s2) throws IOException, SQLException{ | |
List<String> l1 = new ArrayList<>(); | |
List<String> l2 = new ArrayList<>(); | |
for (int i=0; i<s1.length; i++){ | |
l1.add(s1[i]); | |
} | |
for (int i=0; i<s2.length; i++){ | |
l2.add(s2[i]); | |
} | |
return simOfListWord(l1, l2); | |
} | |
//Jaccard | |
//Input: two noun phrases | |
public static double simOfPhrase(String s1, String s2) throws SQLException, IOException{ | |
return simOfListWord(NormalizationText.removeStopwordInString(s1), | |
NormalizationText.removeStopwordInString(s2)); | |
} | |
/*//Jaccard | |
//Input: two lists of noun | |
public static double simOfListNoun(List<String> l1, List<String> l2, double threshold) throws IOException, SQLException{ | |
double total = l1.size() + l2.size(); | |
double inter = 0; | |
List<String> temp = new ArrayList<>(); | |
if (l1.size() == 0 && l2.size() == 0) return 0; | |
else if (l1.size() == 0) return 0/(l2.size()+1); | |
else if (l2.size() == 0) return 0/(l1.size()+1); | |
else{ | |
for (int i=0; i<l1.size(); i++){ | |
boolean check = false; | |
for (int j=0; j<temp.size(); j++){ | |
if (simOfNoun(l1.get(i), temp.get(j)) >= threshold){ | |
check = true; | |
} | |
} | |
for (int j=0; j<l2.size(); j++){ | |
if (simOfNoun(l1.get(i), l2.get(j)) >= threshold){ | |
inter++; | |
check = true; | |
temp.add(l2.get(j)); | |
l2.remove(j); | |
} | |
} | |
if (check == true) inter ++; | |
} | |
} | |
return inter/total; | |
}*/ | |
/*//Jaccard | |
//Input: two lists of verb | |
public static double simOfListVerb(List<String> l1, List<String> l2, double threshold) throws IOException, SQLException{ | |
double total = l1.size() + l2.size(); | |
double inter = 0; | |
List<String> temp = new ArrayList<>(); | |
if (l1.size() == 0 && l2.size() == 0) return 0; | |
else if (l1.size() == 0) return 0/(l2.size()+1); | |
else if (l2.size() == 0) return 0/(l1.size()+1); | |
else{ | |
for (int i=0; i<l1.size(); i++){ | |
boolean check = false; | |
for (int j=0; j<temp.size(); j++){ | |
if (simOfVerb(l1.get(i), temp.get(j)) >= threshold){ | |
check = true; | |
} | |
} | |
for (int j=0; j<l2.size(); j++){ | |
if (simOfVerb(l1.get(i), l2.get(j)) >= threshold){ | |
inter++; | |
check = true; | |
temp.add(l2.get(j)); | |
l2.remove(j); | |
} | |
} | |
if (check == true) inter ++; | |
} | |
} | |
return inter/total; | |
}*/ | |
// //Jaccard | |
// //Input: two arrays of noun | |
// public static double simOfListNoun(String[] s1, String[] s2, double threshold) throws IOException, SQLException{ | |
// List<String> l1 = new ArrayList<>(); | |
// List<String> l2 = new ArrayList<>(); | |
// for (int i=0; i<s1.length; i++){ | |
// l1.add(s1[i]); | |
// } | |
// for (int i=0; i<s2.length; i++){ | |
// l2.add(s2[i]); | |
// } | |
// return simOfListNoun(l1, l2, threshold); | |
// } | |
/*//Jaccard | |
//Input: two noun phrases | |
public static double simOfNounPhrase(String s1, String s2, double threshold) throws SQLException, IOException{ | |
return simOfListNoun(NormalizationText.removeStopwordInString(s1), | |
NormalizationText.removeStopwordInString(s2), threshold); | |
} | |
//Jaccard | |
//Input: two verb phrases | |
public static double simOfVerbPhrase(String s1, String s2, double threshold) throws SQLException, IOException{ | |
return simOfListVerb(NormalizationText.removeStopwordInString(s1), | |
NormalizationText.removeStopwordInString(s2), threshold); | |
}*/ | |
} |