Skip to content
Permalink
0d82ff1dc4
Switch branches/tags

Name already in use

A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
Go to file
 
 
Cannot retrieve contributors at this time
224 lines (200 sloc) 6.15 KB
package kb.howtokb.clustering.sim;
import java.io.IOException;
import java.sql.SQLException;
import java.util.ArrayList;
import java.util.List;
import kb.howtokb.clustering.sim.w2v.Word2VecRunner;
import kb.howtokb.tools.NormalizationText;
/**
 * Static helpers that score how similar two words, phrases, or lists of
 * strings are.
 *
 * <p>All word-level scores are delegated to {@link Word2VecRunner}. The
 * list-level helpers return the arithmetic mean of the similarity of every
 * cross pair (one element from each list); despite what earlier comments in
 * this file said, this is not a Jaccard coefficient.
 *
 * <p>Earlier revisions of this class also carried commented-out back ends
 * (WordNet WuPalmer relatedness, a database lookup against the
 * {@code sim.v2v} / {@code sim.n2n} tables) and threshold-based Jaccard
 * variants. They were dead code and have been dropped; recover them from
 * version control if needed.
 */
public class StringSimilarity {

    /**
     * Similarity of two verbs, as reported by {@link Word2VecRunner#simVerbs}.
     *
     * <p>Best-effort: any {@link Exception} raised by the lookup is mapped to
     * {@code 0.0} instead of being propagated.
     *
     * @param w1 first verb
     * @param w2 second verb
     * @return the score from {@code Word2VecRunner.simVerbs}, or {@code 0.0}
     *         if the lookup failed
     * @throws IOException  never thrown by the current body; kept in the
     *                      signature so existing callers keep compiling
     * @throws SQLException never thrown by the current body; kept in the
     *                      signature so existing callers keep compiling
     */
    public static double simOfVerb(String w1, String w2) throws IOException, SQLException {
        try {
            return Word2VecRunner.simVerbs(w1, w2);
        } catch (Exception ignored) {
            // Deliberate best-effort: a failed lookup counts as "no similarity".
            // NOTE(review): presumably this covers words missing from the
            // model - confirm against Word2VecRunner.
            return 0.0;
        }
    }

    /**
     * Similarity of two nouns, as reported by {@link Word2VecRunner#simNouns}.
     *
     * <p>Best-effort: any {@link Exception} raised by the lookup is mapped to
     * {@code 0.0} instead of being propagated.
     *
     * @param w1 first noun
     * @param w2 second noun
     * @return the score from {@code Word2VecRunner.simNouns}, or {@code 0.0}
     *         if the lookup failed
     * @throws IOException  never thrown by the current body; kept in the
     *                      signature so existing callers keep compiling
     * @throws SQLException never thrown by the current body; kept in the
     *                      signature so existing callers keep compiling
     */
    public static double simOfNoun(String w1, String w2) throws IOException, SQLException {
        try {
            return Word2VecRunner.simNouns(w1, w2);
        } catch (Exception ignored) {
            // Deliberate best-effort: a failed lookup counts as "no similarity".
            return 0.0;
        }
    }

    /**
     * Similarity of two words when their part of speech is unknown, using
     * {@code Word2VecRunner.getSim().simWithoutPOS(w1, w2)}.
     *
     * <p>Best-effort: any {@link Exception} raised while obtaining or
     * querying the model is mapped to {@code 0.0}.
     *
     * @param w1 first word
     * @param w2 second word
     * @return the model's score, or {@code 0.0} if the lookup failed
     */
    public static double simOfWord(String w1, String w2) {
        try {
            return Word2VecRunner.getSim().simWithoutPOS(w1, w2);
        } catch (Exception ignored) {
            // Deliberate best-effort: a failed lookup counts as "no similarity".
            return 0.0;
        }
    }

    /**
     * Similarity of two activity surface strings, as reported by
     * {@link Word2VecRunner#simPair}.
     *
     * <p>Unlike the word-level helpers above, failures are not swallowed.
     *
     * @param w1 first activity surface string
     * @param w2 second activity surface string
     * @return the score from {@code Word2VecRunner.simPair}
     * @throws Exception whatever {@code Word2VecRunner.simPair} throws
     */
    public static double simOfPairW2V(String w1, String w2) throws Exception {
        return Word2VecRunner.simPair(w1, w2);
    }

    /**
     * Mean pairwise similarity between two lists of words.
     *
     * <p>Every element of {@code l1} is compared with every element of
     * {@code l2} through {@link #simOfWord(String, String)}, and the scores
     * are averaged over all {@code l1.size() * l2.size()} pairs.
     *
     * @param l1 first list of words; {@code null} is treated as empty
     * @param l2 second list of words; {@code null} is treated as empty
     * @return the average pair score, or {@code 0} if either list is
     *         {@code null} or empty
     * @throws IOException  never thrown by the current body; kept in the
     *                      signature so existing callers keep compiling
     * @throws SQLException never thrown by the current body; kept in the
     *                      signature so existing callers keep compiling
     */
    public static double simOfListWord(List<String> l1, List<String> l2)
            throws IOException, SQLException {
        if (l1 == null || l2 == null || l1.isEmpty() || l2.isEmpty()) {
            return 0;
        }
        double sum = 0;
        // Counted as a double so very large lists cannot overflow an int product.
        double pairs = 0;
        // for-each keeps the original iteration order (so the floating-point
        // sum is unchanged) without positional get() calls, which are slow on
        // lists that are not random-access.
        for (String left : l1) {
            for (String right : l2) {
                sum += simOfWord(left, right);
                pairs++;
            }
        }
        return sum / pairs;
    }

    /**
     * Mean pairwise similarity between two lists of activity surface strings.
     *
     * <p>Every element of {@code l1} is compared with every element of
     * {@code l2} through {@link #simOfPairW2V(String, String)}, and the
     * scores are averaged over all pairs. The first failing pair aborts the
     * computation, because {@code simOfPairW2V} does not swallow exceptions.
     *
     * @param l1 first list of activity strings; {@code null} is treated as empty
     * @param l2 second list of activity strings; {@code null} is treated as empty
     * @return the average pair score, or {@code 0} if either list is
     *         {@code null} or empty
     * @throws Exception whatever {@code simOfPairW2V} throws for any pair
     */
    public static double simOfListActivity(List<String> l1, List<String> l2) throws Exception {
        if (l1 == null || l2 == null || l1.isEmpty() || l2.isEmpty()) {
            return 0;
        }
        double sum = 0;
        double pairs = 0;
        for (String left : l1) {
            for (String right : l2) {
                sum += simOfPairW2V(left, right);
                pairs++;
            }
        }
        return sum / pairs;
    }

    /**
     * Array convenience overload of {@link #simOfListWord(List, List)}.
     *
     * @param s1 first array of words; {@code null} is treated as empty
     * @param s2 second array of words; {@code null} is treated as empty
     * @return the mean pairwise similarity, or {@code 0} if either array is
     *         {@code null} or empty
     * @throws IOException  declared by the list overload; not thrown by the
     *                      current implementation
     * @throws SQLException declared by the list overload; not thrown by the
     *                      current implementation
     */
    public static double simOfListWord(String[] s1, String[] s2) throws IOException, SQLException {
        return simOfListWord(toList(s1), toList(s2));
    }

    /**
     * Similarity of two phrases: each phrase is passed through
     * {@code NormalizationText.removeStopwordInString} and the two results
     * are handed to the matching {@code simOfListWord} overload.
     *
     * @param s1 first phrase
     * @param s2 second phrase
     * @return the mean pairwise similarity of the remaining words
     * @throws SQLException declared by {@code simOfListWord}
     * @throws IOException  declared by {@code simOfListWord}
     */
    public static double simOfPhrase(String s1, String s2) throws SQLException, IOException {
        return simOfListWord(NormalizationText.removeStopwordInString(s1),
                NormalizationText.removeStopwordInString(s2));
    }

    /** Copies an array into a new presized list; a {@code null} array yields an empty list. */
    private static List<String> toList(String[] words) {
        if (words == null) {
            return new ArrayList<>();
        }
        List<String> copy = new ArrayList<>(words.length);
        for (String word : words) {
            copy.add(word);
        }
        return copy;
    }
}