Skip to content
Permalink
0d82ff1dc4
Switch branches/tags

Name already in use

A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
Go to file
 
 
Cannot retrieve contributors at this time
451 lines (407 sloc) 14 KB
package kb.howtokb.clustering.sim.w2v;
import java.io.BufferedReader;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import edu.stanford.nlp.util.Pair;
import kb.howtokb.clustering.sim.Coefficient;
import kb.howtokb.utils.SortedMultiMap;
// Input is either at word level, annotated word level (e.g. with POS),
// or annotated phrase level (e.g. activity, location).
// Our analysis is that the plain word level is not helpful: we cannot tell whether a word such as "run" is a verb or a noun.
// The annotated phrase level is good but does not define relatedness, e.g. prev/next activities are also related.
// The annotated word level would check: sim [ (v1 n1) (v2 n2) ] = sim (v1,v2) x sim (n1,n2)
/**
 * Static entry points for comparing "verb;noun" activities with a word2vec model.
 *
 * <p>Two kinds of comparison are offered:
 * <ul>
 *   <li>{@code simVerbs}, {@code simNouns} and the {@code simPair} overloads query the
 *       word2vec model directly, looking words up under the keys {@code "v_" + verb} and
 *       {@code "n_" + noun};</li>
 *   <li>{@code isSim}, {@code isSimVerb} and {@code isSimNoun} consult precomputed
 *       word-to-id maps and sets of similar id pairs.</li>
 * </ul>
 *
 * <p>NOTE(review): nothing in this class populates the precomputed tables
 * ({@code verbToID}, {@code simVerbPair}, {@code nounToID}, {@code simNounPair}); until a
 * loader is wired into {@link #prepareData()}, the {@code isSim*} methods fail with an
 * {@link IllegalStateException}.
 *
 * <p>The model is loaded lazily on first use; no synchronization is performed.
 *
 * @author cxchu
 */
public class Word2VecRunner {

    /** Model name handed to the {@link Word2VecSimilarity} constructor. */
    private static final String MODEL_FILE = "articles-word2vec-word-pos.model.txt";

    /** Key prefix under which verbs are looked up in the model. */
    private static final String VERB_PREFIX = "v_";

    /** Key prefix under which nouns are looked up in the model. */
    private static final String NOUN_PREFIX = "n_";

    /** Lazily loaded default model; {@code null} until {@link #prepareData()} has run. */
    private static Word2VecSimilarity sim;

    // Precomputed tables used by isSimVerb / isSimNoun. Each set holds pairs of ids that
    // are considered similar; a pair may be stored in either order.
    private static Set<Pair<Integer, Integer>> simVerbPair;
    private static Map<String, Integer> verbToID;
    private static Set<Pair<Integer, Integer>> simNounPair;
    private static Map<String, Integer> nounToID;

    /**
     * Loads the default word2vec model, replacing any model loaded earlier.
     *
     * @throws Exception if the model cannot be constructed
     */
    public static void prepareData() throws Exception {
        // NOTE(review): the meaning of the arguments 25 and false is defined by
        // Word2VecSimilarity and is not visible here -- confirm before changing them.
        sim = new Word2VecSimilarity(MODEL_FILE, 25, false);
    }

    /** Loads the default model on first use. */
    private static void ensurePrepared() throws Exception {
        if (sim == null) {
            prepareData();
        }
    }

    /**
     * Splits an activity string of the form {@code "verb;noun"} into a (verb, noun) pair.
     * Anything after a second {@code ';'} is ignored.
     *
     * @param activity activity string, e.g. {@code "paint;wall"}
     * @return pair whose first element is the verb and whose second is the noun
     * @throws IllegalArgumentException if {@code activity} is {@code null} or does not
     *         contain a non-empty part after the first {@code ';'}
     */
    private static Pair<String, String> activityToPair(String activity) {
        if (activity == null) {
            throw new IllegalArgumentException("activity must not be null");
        }
        String[] vn = activity.split(";");
        // String.split drops trailing empty strings, so "watch" and "watch;" both yield
        // fewer than two parts and would otherwise fail on vn[1].
        if (vn.length < 2) {
            throw new IllegalArgumentException(
                    "Expected an activity of the form \"verb;noun\" but got: \"" + activity + "\"");
        }
        return new Pair<String, String>(vn[0], vn[1]);
    }

    /**
     * Checks two activities against the precomputed similarity tables. The activities
     * are reported as similar only when their verbs are similar <em>and</em> their nouns
     * are similar.
     *
     * <p>NOTE(review): an earlier comment on this method described a looser pruning rule
     * (dissimilar only when both the verbs and the nouns are dissimilar). The code has
     * always required both parts to match; that behavior is kept here.
     *
     * @param a1 first activity, e.g. {@code "paint;wall"}
     * @param a2 second activity, e.g. {@code "color;ceiling"}
     * @return {@code true} if both the verb pair and the noun pair are precomputed as similar
     * @throws IllegalArgumentException if either activity is not of the form {@code "verb;noun"}
     * @throws IllegalStateException if the precomputed tables have not been loaded
     * @throws Exception if the default model cannot be loaded
     */
    public static boolean isSim(String a1, String a2) throws Exception {
        ensurePrepared();
        Pair<String, String> activity1 = activityToPair(a1);
        Pair<String, String> activity2 = activityToPair(a2);
        return isSimVerb(activity1.first, activity2.first)
                && isSimNoun(activity1.second, activity2.second);
    }

    /**
     * Checks whether two verbs are listed as similar in the precomputed verb table.
     *
     * @param v1 first verb
     * @param v2 second verb
     * @return {@code true} if the pair is stored in either order; {@code false} if it is
     *         not stored or if either verb has no id
     * @throws IllegalStateException if the precomputed verb tables have not been loaded
     * @throws Exception if the default model cannot be loaded
     */
    public static boolean isSimVerb(String v1, String v2) throws Exception {
        ensurePrepared();
        return isPrecomputedSimilar(v1, v2, verbToID, simVerbPair, "verb");
    }

    /**
     * Checks whether two nouns are listed as similar in the precomputed noun table.
     *
     * @param n1 first noun
     * @param n2 second noun
     * @return {@code true} if the pair is stored in either order; {@code false} if it is
     *         not stored or if either noun has no id
     * @throws IllegalStateException if the precomputed noun tables have not been loaded
     * @throws Exception if the default model cannot be loaded
     */
    public static boolean isSimNoun(String n1, String n2) throws Exception {
        ensurePrepared();
        return isPrecomputedSimilar(n1, n2, nounToID, simNounPair, "noun");
    }

    /**
     * Shared lookup behind {@link #isSimVerb} and {@link #isSimNoun}.
     *
     * @param w1    first word
     * @param w2    second word
     * @param ids   word-to-id map for this kind of word
     * @param pairs set of similar id pairs for this kind of word
     * @param kind  "verb" or "noun", used only in the error message
     * @return whether (id1, id2) or (id2, id1) is contained in {@code pairs}
     */
    private static boolean isPrecomputedSimilar(String w1, String w2,
            Map<String, Integer> ids, Set<Pair<Integer, Integer>> pairs, String kind) {
        if (ids == null || pairs == null) {
            throw new IllegalStateException(
                    "Precomputed " + kind + " similarity tables have not been loaded");
        }
        // Keep the ids boxed: Map.get returns null for an unknown word, and unboxing
        // that null into an int would throw a NullPointerException.
        Integer id1 = ids.get(w1);
        Integer id2 = ids.get(w2);
        if (id1 == null || id2 == null) {
            return false;
        }
        return pairs.contains(new Pair<Integer, Integer>(id1, id2))
                || pairs.contains(new Pair<Integer, Integer>(id2, id1));
    }

    /**
     * Returns the first space-separated word of a phrase, ignoring surrounding whitespace.
     * Used to reduce a verb phrase to the verb itself.
     */
    private static String firstToken(String phrase) {
        String trimmed = phrase.trim();
        int space = trimmed.indexOf(' ');
        return space < 0 ? trimmed : trimmed.substring(0, space);
    }

    /**
     * Returns the last space-separated word of a phrase, ignoring surrounding whitespace.
     * Used to reduce a noun phrase to its final word.
     */
    private static String lastToken(String phrase) {
        String trimmed = phrase.trim();
        // lastIndexOf returns -1 when there is no space, so this yields the whole string.
        return trimmed.substring(trimmed.lastIndexOf(' ') + 1);
    }

    /**
     * Similarity of two verbs according to the default model. For a multi-word phrase
     * only the first word is compared.
     *
     * @param v1 first verb or verb phrase
     * @param v2 second verb or verb phrase
     * @return the model's similarity for {@code "v_" + verb1} and {@code "v_" + verb2}
     * @throws Exception if the default model cannot be loaded or the lookup fails
     */
    public static double simVerbs(String v1, String v2) throws Exception {
        ensurePrepared();
        return sim.sim(VERB_PREFIX + firstToken(v1), VERB_PREFIX + firstToken(v2));
    }

    /**
     * Similarity of two nouns according to the default model. For a multi-word phrase
     * only the last word is compared.
     *
     * @param n1 first noun or noun phrase
     * @param n2 second noun or noun phrase
     * @return the model's similarity for {@code "n_" + noun1} and {@code "n_" + noun2}
     * @throws Exception if the default model cannot be loaded or the lookup fails
     */
    public static double simNouns(String n1, String n2) throws Exception {
        ensurePrepared();
        return sim.sim(NOUN_PREFIX + lastToken(n1), NOUN_PREFIX + lastToken(n2));
    }

    /**
     * Scores two activities with the given model: verb similarity times noun similarity,
     * short-circuiting to 0 when the verb similarity is exactly 0.
     */
    private static double scorePair(Pair<String, String> activity1,
            Pair<String, String> activity2, Word2VecSimilarity model) throws Exception {
        double vv = model.sim(VERB_PREFIX + firstToken(activity1.first),
                VERB_PREFIX + firstToken(activity2.first));
        if (vv == 0) {
            return 0.0;
        }
        double nn = model.sim(NOUN_PREFIX + lastToken(activity1.second),
                NOUN_PREFIX + lastToken(activity2.second));
        return combinePairScore(vv, nn);
    }

    /**
     * Similarity of two (verb, noun) activities using the supplied model.
     *
     * @param activity1 first activity
     * @param activity2 second activity
     * @param sim model to query; when {@code null} the lazily loaded default model is used
     * @return verb similarity multiplied by noun similarity, or {@code 0.0} if the verb
     *         similarity is 0 or any lookup fails
     */
    public static double simPair(Pair<String, String> activity1,
            Pair<String, String> activity2, Word2VecSimilarity sim) {
        try {
            // Honor the caller's model; previously this parameter was silently ignored
            // in favor of the static default.
            Word2VecSimilarity model = (sim != null) ? sim : getSim();
            return scorePair(activity1, activity2, model);
        } catch (Exception ignored) {
            // Best effort by design: any failure while scoring counts as "no similarity".
            return 0.0;
        }
    }

    /**
     * Similarity of two (verb, noun) activities using the default model.
     *
     * @param activity1 first activity
     * @param activity2 second activity
     * @return verb similarity multiplied by noun similarity, or {@code 0.0} if the verb
     *         similarity is 0 or a lookup fails
     * @throws Exception if the default model cannot be loaded
     */
    public static double simPair(Pair<String, String> activity1,
            Pair<String, String> activity2) throws Exception {
        // Loading happens outside the try block so that a missing model is reported
        // to the caller instead of being turned into a score of 0.
        Word2VecSimilarity model = getSim();
        try {
            return scorePair(activity1, activity2, model);
        } catch (Exception ignored) {
            // Best effort by design: a failed lookup counts as "no similarity".
            return 0.0;
        }
    }

    /**
     * Similarity of two {@code "verb;noun"} activity strings using the supplied model.
     *
     * @param a1 first activity, e.g. {@code "watch;movie"}
     * @param a2 second activity, e.g. {@code "watch;film"}
     * @param sim model to query; when {@code null} the default model is used
     * @return see {@link #simPair(Pair, Pair, Word2VecSimilarity)}
     * @throws IllegalArgumentException if either string is not of the form {@code "verb;noun"}
     */
    public static double simPair(String a1,
            String a2, Word2VecSimilarity sim) {
        Pair<String, String> activity1 = activityToPair(a1);
        Pair<String, String> activity2 = activityToPair(a2);
        return simPair(activity1, activity2, sim);
    }

    /**
     * Similarity of two {@code "verb;noun"} activity strings using the default model.
     *
     * @param a1 first activity, e.g. {@code "watch;movie"}
     * @param a2 second activity, e.g. {@code "watch;film"}
     * @return see {@link #simPair(Pair, Pair, Word2VecSimilarity)}
     * @throws IllegalArgumentException if either string is not of the form {@code "verb;noun"}
     * @throws Exception if the default model cannot be loaded
     */
    public static double simPair(String a1,
            String a2) throws Exception {
        ensurePrepared();
        Pair<String, String> activity1 = activityToPair(a1);
        Pair<String, String> activity2 = activityToPair(a2);
        return simPair(activity1, activity2, sim);
    }

    /**
     * Returns the default model, loading it first if necessary.
     *
     * @return the default word2vec similarity model
     * @throws Exception if the model cannot be loaded
     */
    public static Word2VecSimilarity getSim() throws Exception {
        ensurePrepared();
        return sim;
    }

    /** Combines a verb similarity and a noun similarity into one score by multiplication. */
    private static double combinePairScore(double cosine1, double cosine2) {
        return cosine1 * cosine2;
    }

    /**
     * Loads the default model. Example queries once it is loaded:
     * {@code simPair("watch;movie", "watch;film")},
     * {@code simPair("watch;movie", "eat;popcorn")}.
     *
     * @param args unused
     * @throws Exception if the model cannot be loaded
     */
    public static void main(String[] args) throws Exception {
        prepareData();
    }
}