diff --git a/.gitignore b/.gitignore index 6ac939f..c04cb90 100644 --- a/.gitignore +++ b/.gitignore @@ -25,6 +25,7 @@ hs_err_pid* *.iml +/resources/bigData ## Directory-based project format: .idea/ # if you remove the above rule, at least ignore the following: diff --git a/pom.xml b/pom.xml index c4101d5..c15b43d 100644 --- a/pom.xml +++ b/pom.xml @@ -10,6 +10,7 @@ + rules-generator @@ -64,6 +65,10 @@ de.mpii.frequentrulesminning.AssociationRulesMining association_rules + + de.mpii.predicatelifting.YagoFactsReducer + fact_reducer + diff --git a/src/resources/countries.tsv b/resources/countries.tsv similarity index 100% rename from src/resources/countries.tsv rename to resources/countries.tsv diff --git a/resources/yago_location_relations.tsv b/resources/yago_location_relations.tsv new file mode 100644 index 0000000..38fc08f --- /dev/null +++ b/resources/yago_location_relations.tsv @@ -0,0 +1,9 @@ + + + + + + + + + \ No newline at end of file diff --git a/scripts/download_large_data.sh b/scripts/download_large_data.sh index 39ab950..fbad4fa 100644 --- a/scripts/download_large_data.sh +++ b/scripts/download_large_data.sh @@ -16,10 +16,12 @@ done PRGDIR=`dirname "$PRG"` BASEDIR=`cd "$PRGDIR/.." >/dev/null; pwd` DATA_DIR=$BASEDIR/data +BIG_DATA_DIR=$BASEDIR/resources/bigData #make new directory for data mkdir -p $DATA_DIR +mkdir -p $BIG_DATA_DIR #Download @@ -38,10 +40,10 @@ for f in $DATA_DIR/*.7z; do 7z x $f -o$DATA_DIR done - +#TODO get bigData out of resources #Filter data for reduction -grep '' $DATA_DIR/yagoGeonamesOnlyData.tsv > $DATA_DIR/isLocatedInData.tsv -grep '' $DATA_DIR/yagoFacts.tsv >> $DATA_DIR/isLocatedInData.tsv +grep '' $DATA_DIR/yagoGeonamesOnlyData.tsv > $BIG_DATA_DIR/isLocatedInData.tsv +grep '' $DATA_DIR/yagoFacts.tsv >> $BIG_DATA_DIR/isLocatedInData.tsv #remove archives rm $DATA_DIR/*.7z diff --git a/src/main/java/de/mpii/predicatelifting/YagoFactsReducer.java b/src/main/java/de/mpii/predicatelifting/YagoFactsReducer.java index 67a6109..16df8ef 100644 --- a/src/main/java/de/mpii/predicatelifting/YagoFactsReducer.java +++ b/src/main/java/de/mpii/predicatelifting/YagoFactsReducer.java @@ -1,17 +1,14 @@ package de.mpii.predicatelifting; import com.google.common.collect.ImmutableSet; +import de.mpii.yagotools.YagoLocation; import de.mpii.yagotools.YagoSimpleTypes; import mpi.tools.basics3.Fact; import mpi.tools.basics3.FactSource; -import mpi.tools.javatools.filehandlers.UTF8Reader; import mpi.tools.javatools.util.FileUtils; -import java.io.File; -import java.io.IOException; -import java.net.MalformedURLException; -import java.nio.file.Files; -import java.util.Set; +import java.io.*; +import java.net.URL; /** * Created by gadelrab on 2/11/16. @@ -19,41 +16,62 @@ public class YagoFactsReducer { + public enum FactType{LOCATION,DATE,PERSON,ORGANIZATION,ARTIFACT} + + public static final String LOCATION_RELATIONS_FILE="resources/yago_location_relations.tsv"; + ImmutableSet locationRelations; YagoSimpleTypes yst; + YagoLocation yLoc; public YagoFactsReducer(){ - yst=YagoSimpleTypes.getInstance(); + //yst=YagoSimpleTypes.getInstance(); + yLoc=YagoLocation.getInstance(); + + + try { + locationRelations = ImmutableSet.copyOf(FileUtils.getFileContentasList(LOCATION_RELATIONS_FILE)); + } catch (IOException e) { + e.printStackTrace(); + } } - public void reduceToType(String factSourceFilePath, String []relations) { + public void reduceFacts(String factSourceFilePath, String outputFile, String []relations, FactType fType) { try { - reduceToType( FactSource.from(factSourceFilePath), relations); - } catch (MalformedURLException e) { + reduceFacts( FactSource.from(factSourceFilePath),FileUtils.getBufferedUTF8Writer(outputFile) ,relations,fType); + + } catch (IOException e) { e.printStackTrace(); } } - public void reduceToType(FactSource factSource, String []relations){ - - ImmutableSet relationsSet=ImmutableSet.copyOf(relations); + public void reduceFacts(FactSource factSource, BufferedWriter outputWriter, String []relations, FactType fType){ + ImmutableSet relationsSet=null; + if(relations!=null) + relationsSet=ImmutableSet.copyOf(relations); - for( Fact f:factSource) + for( Fact f:factSource) { + if (relationsSet==null||relationsSet.contains(f.getRelation())) { + Fact reducedfact = reduceFact(f, fType); + try { + outputWriter.write(reducedfact.toTsvLine()); + } catch (IOException e) { + e.printStackTrace(); + } - - if(relationsSet.contains(f.getRelation())){ - reduceFact(f); } + } + } @@ -62,13 +80,54 @@ public void reduceToType(FactSource factSource, String []relations){ - public Fact reduceFact(Fact orgFact) { + + public Fact reduceFact(Fact orgFact,FactType factType) { + if (factType==null){ + if(locationRelations.contains(orgFact)){ + return reduceLocationFact(orgFact); + } + } + else{ + switch (factType){ + case LOCATION: + return reduceLocationFact(orgFact); + + } + } return orgFact; } + private Fact reduceLocationFact(Fact orgFact) { + String entity=orgFact.getObject(); + String reduced=yLoc.getParentCountry(entity); + + return new Fact(orgFact.getSubject(),orgFact.getRelation(),reduced); + + } + + + public static void main(String [] args){ + + + if(args.length<2){ + System.out.println("Incorrect params: fact_reducer [Type]"); + System.exit(1); + } + + YagoFactsReducer fr=new YagoFactsReducer(); + + + + FactType type=null; + if (args.length>2) + type=FactType.valueOf(args[2]); + fr.reduceFacts(args[0],args[1],null,type); + + } + } diff --git a/src/main/java/de/mpii/yagotools/YagoLocation.java b/src/main/java/de/mpii/yagotools/YagoLocation.java index 6296dd9..07ed19d 100644 --- a/src/main/java/de/mpii/yagotools/YagoLocation.java +++ b/src/main/java/de/mpii/yagotools/YagoLocation.java @@ -7,11 +7,14 @@ import de.mpii.yagotools.utils.YagoRelations; import mpi.tools.javatools.util.FileUtils; +import java.lang.*; +import java.io.*; + import java.io.File; import java.io.IOException; import java.util.Collection; -import java.util.Collections; import java.util.Set; +import java.net.URL; /** @@ -20,8 +23,8 @@ public class YagoLocation { //private static final String SUB_CLASS_OF = "rdfs:subClassOf"; - String LOCATION_FILE_PATH ="data/isLocatedInData.tsv"; - final static String COUNTRIES_FILE="src/resources/countries.tsv"; + String LOCATION_FILE_PATH ="resources/bigData/isLocatedInData.tsv"; + final static String COUNTRIES_FILE="resources/countries.tsv"; private static YagoLocation instance; @@ -30,15 +33,18 @@ public class YagoLocation { private YagoLocation(){ - typesParents= YagoDataReader.loadSubject2ObjectMap(LOCATION_FILE_PATH,new String[]{YagoRelations.IS_LOCATED_IN}); loadCountries(); + try { + typesParents = YagoDataReader.loadDataInMap(LOCATION_FILE_PATH, new String[]{YagoRelations.IS_LOCATED_IN}, YagoDataReader.MapType.SUBJ_2_OBJ); + + }catch (Exception e) { + e.printStackTrace();} } private void loadCountries() { try { - String fileContect= FileUtils.getFileContent(new File(COUNTRIES_FILE)); - countriesSet= ImmutableSet.copyOf(fileContect.split("\n")); - } catch (IOException e) { + countriesSet= ImmutableSet.copyOf(FileUtils.getFileContentasList(COUNTRIES_FILE)); + } catch (Exception e) { e.printStackTrace(); } diff --git a/src/main/java/de/mpii/yagotools/YagoSimpleTypes.java b/src/main/java/de/mpii/yagotools/YagoSimpleTypes.java index 7fce6d3..4e6a245 100644 --- a/src/main/java/de/mpii/yagotools/YagoSimpleTypes.java +++ b/src/main/java/de/mpii/yagotools/YagoSimpleTypes.java @@ -20,7 +20,7 @@ public class YagoSimpleTypes { private YagoSimpleTypes(){ - entityTypes= YagoDataReader.loadSubject2ObjectMap(TAXONOMY_FILE_PATH,new String[]{YagoRelations.TYPE}); + entityTypes= YagoDataReader.loadDataInMap(TAXONOMY_FILE_PATH,new String[]{YagoRelations.TYPE}, YagoDataReader.MapType.SUBJ_2_OBJ); } diff --git a/src/main/java/de/mpii/yagotools/YagoTaxonomy.java b/src/main/java/de/mpii/yagotools/YagoTaxonomy.java index 8a793d9..92e905d 100644 --- a/src/main/java/de/mpii/yagotools/YagoTaxonomy.java +++ b/src/main/java/de/mpii/yagotools/YagoTaxonomy.java @@ -20,7 +20,7 @@ public class YagoTaxonomy { private YagoTaxonomy(){ - typesParents= YagoDataReader.loadSubject2ObjectMap(TAXONOMY_FILE_PATH,new String[]{YagoRelations.SUB_CLASS_OF}); + typesParents= YagoDataReader.loadDataInMap(TAXONOMY_FILE_PATH,new String[]{YagoRelations.SUB_CLASS_OF}, YagoDataReader.MapType.SUBJ_2_OBJ); } diff --git a/src/main/java/de/mpii/yagotools/utils/YagoDataReader.java b/src/main/java/de/mpii/yagotools/utils/YagoDataReader.java index adcd80c..6343a70 100644 --- a/src/main/java/de/mpii/yagotools/utils/YagoDataReader.java +++ b/src/main/java/de/mpii/yagotools/utils/YagoDataReader.java @@ -6,6 +6,7 @@ import mpi.tools.basics3.Fact; import mpi.tools.basics3.FactSource; import mpi.tools.javatools.filehandlers.UTF8Reader; +import java.net.URL; import java.io.File; @@ -14,13 +15,17 @@ import java.io.IOException; import java.net.MalformedURLException; +import java.lang.*; +import java.io.*; /** * Created by gadelrab on 2/11/16. */ public class YagoDataReader { + public enum MapType{SUBJ_2_OBJ,PRED_OBJ_2_SUBJ} - public static Multimap loadSubject2ObjectMap(String filePath, String [] relations) { + + public static Multimap loadDataInMap(String filePath, String [] relations, MapType type) { // add them to set for searching ImmutableSet relationsSet=null; if (relations!=null) @@ -37,7 +42,18 @@ public static Multimap loadSubject2ObjectMap(String filePath, Str for(Fact f: FactSource.from(filePath)) if(relationsSet==null||relationsSet.contains(f.getRelation())){ - subjectObjectMap.put(f.getSubject(),f.getObject());} + String key=null; + String value=null; + switch (type){ + case SUBJ_2_OBJ: + key=f.getSubject(); + value=f.getObject(); + break; + case PRED_OBJ_2_SUBJ: + key=f.getRelation()+f.getObject(); + value=f.getSubject(); + } + subjectObjectMap.put(key,value);} System.out.println( "Dictionary size: "+ subjectObjectMap.size()); @@ -52,45 +68,47 @@ public static Multimap loadSubject2ObjectMap(String filePath, Str } - public static Multimap loadPredicateObject2subjectMap(String filePath, String [] relations) { - // add them to set for searching - ImmutableSet relationsSet=null; - if (relations!=null) - relationsSet=ImmutableSet.copyOf(relations); - - Multimap subjectObjectMap= HashMultimap.create(); - - UTF8Reader fileReader; - try { - fileReader=new UTF8Reader(new File(filePath),"Loading Data"); - - String line; - while((line=fileReader.readLine())!=null){ - String[] lineParts=line.split("\t"); - int subIndex=0,predIndex=1,objIndex=2; - if (lineParts.length>3){ - subIndex++;predIndex++;objIndex++; - } - if(relationsSet==null||relationsSet.contains(lineParts[predIndex])){ - subjectObjectMap.put(lineParts[predIndex] + lineParts[objIndex],lineParts[subIndex]); - } - - } - System.out.println( "Dictionary size: "+ subjectObjectMap.size()); - - return subjectObjectMap; - - } catch (FileNotFoundException e) { - e.printStackTrace(); - } catch (IOException e1) { - e1.printStackTrace(); - } - - return null; - - } +// public static Multimap loadPredicateObject2subjectMap(String filePath, String [] relations) { +// // add them to set for searching +// ImmutableSet relationsSet=null; +// if (relations!=null) +// relationsSet=ImmutableSet.copyOf(relations); +// +// Multimap subjectObjectMap= HashMultimap.create(); +// +// UTF8Reader fileReader; +// try { +// fileReader=new UTF8Reader(new File(filePath),"Loading Data"); +// +// +// String line; +// +// while((line=fileReader.readLine())!=null){ +// String[] lineParts=line.split("\t"); +// int subIndex=0,predIndex=1,objIndex=2; +// if (lineParts.length>3){ +// subIndex++;predIndex++;objIndex++; +// } +// if(relationsSet==null||relationsSet.contains(lineParts[predIndex])){ +// subjectObjectMap.put(lineParts[predIndex] + lineParts[objIndex],lineParts[subIndex]); +// } +// +// } +// System.out.println( "Dictionary size: "+ subjectObjectMap.size()); +// +// return subjectObjectMap; +// +// } catch (FileNotFoundException e) { +// e.printStackTrace(); +// } catch (IOException e1) { +// e1.printStackTrace(); +// } +// +// return null; +// +// } } diff --git a/src/main/java/mpi/tools/javatools/datatypes/IntKeyMap.java b/src/main/java/mpi/tools/javatools/datatypes/IntKeyMap.java new file mode 100644 index 0000000..22edd93 --- /dev/null +++ b/src/main/java/mpi/tools/javatools/datatypes/IntKeyMap.java @@ -0,0 +1,256 @@ +package mpi.tools.javatools.datatypes; + +import java.util.Arrays; +import java.util.Iterator; +import java.util.List; + +import mpi.tools.javatools.administrative.D; + +/** + * + * This class is part of the Java Tools (see + * http://mpii.de/yago-naga/javatools). It is licensed under the Creative + * Commons Attribution License (see http://creativecommons.org/licenses/by/3.0) + * by the YAGO-NAGA team (see http://mpii.de/yago-naga). + * + * This class implements a HashMap with integer keys. + * + * @author Fabian M. Suchanek + * + * @param + */ +public class IntKeyMap { + + /** Holds the keys */ + protected int[] keys; + + /** Holds the values */ + protected K[] values; + + /** Holds size */ + protected int size; + + /** Constructor */ + public IntKeyMap() { + clear(); + } + + /** Creates an intHashMap from a list that contains keys and values in alternation*/ + public IntKeyMap(Object... keyValuePairs) { + this(Arrays.asList(keyValuePairs)); + } + + /** Creates an intHashMap from a list that contains keys and values in alternation*/ + @SuppressWarnings("unchecked") + public IntKeyMap(List keyValuePairs) { + this(); + for (int i = 0; i < keyValuePairs.size(); i += 2) { + Object key = keyValuePairs.get(i); + if (key instanceof Integer) + put((Integer) key, + (K) keyValuePairs.get(i + 1)); + else if (key instanceof Character) + put((int) ((Character) key).charValue(), + (K) keyValuePairs.get(i + 1)); + else if (key instanceof Byte) + put(((Byte) key).intValue(), + (K) keyValuePairs.get(i + 1)); + else if (key instanceof Short) + put(((Short) key).intValue(), + (K) keyValuePairs.get(i + 1)); + else + throw new RuntimeException("Keys have to be integers"); + } + } + + /** Returns an index where to store the object */ + protected int index(int key, int len) { + return (Math.abs(key) % len); + } + + /** Returns an index where to store the object */ + protected int index(int key) { + return (index(key, keys.length)); + } + + /** Retrieves a value */ + public K get(int key) { + return (get(key, null)); + } + + /** Finds a key, keys[find] will be Integer.MAX_VALUE if non-existent */ + protected int find(int key) { + int i = index(key); + while (true) { + if (keys[i] == Integer.MAX_VALUE) + return (i); + if (keys[i] == key) + return (i); + i++; + if (i == keys.length) + i = 0; + } + } + + /** Retrieves a value */ + public K get(int key, K defaultValue) { + int pos = find(key); + if (keys[pos] == Integer.MAX_VALUE) + return (defaultValue); + else + return ((K) values[pos]); + } + + /** True if value is there */ + public boolean containsKey(int key) { + return (keys[find(key)] != Integer.MAX_VALUE); + } + + /** Returns keys. Can be used only once. */ + public PeekIterator keys() { + final int[] e = keys; + return (new PeekIterator() { + + int pos = -1; + + @Override + protected Integer internalNext() throws Exception { + pos++; + for (; pos < keys.length; pos++) { + if (e[pos] != Integer.MAX_VALUE) { + return (e[pos]); + } + } + return (null); + } + + }); + } + + /** + * Adds a key, true for 'added the key as new', false for 'overwrote + * existing value' + */ + public boolean put(int key, K value) { + if (key == Integer.MAX_VALUE) + throw new RuntimeException( + "Integer.MAX_VALUE cannot be stored as key. Sorry..."); + if (put(keys, values, key, value)) { + size++; + if (size > keys.length * 3 / 4) + rehash(); + return (true); + } + return (false); + } + + /** + * Adds a key, true for 'added the key as new', false for 'overwrote + * existing value' + */ + protected boolean put(int[] keys, K[] values, int key, K value) { + int i = index(key, keys.length); + while (true) { + if (keys[i] == Integer.MAX_VALUE) { + keys[i] = key; + values[i] = value; + return (true); + } + if (keys[i] == key) { + values[i] = value; + return (false); + } + i++; + if (i == keys.length) + i = 0; + } + } + + /** Rehashes */ + protected void rehash() { + int[] newKeys = new int[keys.length * 2]; + Arrays.fill(newKeys, Integer.MAX_VALUE); + @SuppressWarnings("unchecked") + K[] newValues = (K[]) new Object[keys.length * 2]; + for (int i = 0; i < keys.length; i++) { + if (keys[i] != Integer.MAX_VALUE) + put(newKeys, newValues, keys[i], values[i]); + } + keys = newKeys; + values = newValues; + } + + public Iterator iterator() { + return keys().iterator(); + } + + public int size() { + return size; + } + + @SuppressWarnings("unchecked") + public void clear() { + size = 0; + keys = new int[10]; + Arrays.fill(keys, Integer.MAX_VALUE); + values = (K[]) new Object[10]; + } + + public boolean contains(int o) { + return containsKey(o); + } + + @Override + public String toString() { + if (isEmpty()) + return ("{}"); + StringBuilder b = new StringBuilder("{"); + int counter = 30; + for (int key : keys()) { + if (counter-- == 0) { + b.append("..., "); + break; + } + b.append(key).append('=').append(get(key)).append(", "); + } + b.setLength(b.length() - 2); + return (b.append("}").toString()); + } + + /** TRUE if there is no mapping*/ + public boolean isEmpty() { + return size == 0; + } + + @Override + public boolean equals(Object o) { + if (!(o instanceof IntKeyMap)) + return (false); + IntKeyMap other = (IntKeyMap) o; + if (other.size() != this.size()) + return (false); + for (int i = 0; i < keys.length; i++) { + if (keys[i] == Integer.MAX_VALUE && values[i] != other.get(keys[i])) + return (false); + } + return (true); + } + + @Override + public int hashCode() { + return Arrays.hashCode(values); + } + + /** Test */ + public static void main(String[] args) throws Exception { + IntKeyMap m = new IntKeyMap(); + for (int i = 1; i < 3000; i *= 2) { + m.put(i, "#" + i); + D.p("Added", i, m.size()); + } + D.p(m.keys); + m.put(8, "#0"); + for (int key : m.keys()) + D.p(key, m.get(key)); + } +} diff --git a/src/main/java/mpi/tools/javatools/util/FileUtils.java b/src/main/java/mpi/tools/javatools/util/FileUtils.java index b041cf1..4fe6d34 100644 --- a/src/main/java/mpi/tools/javatools/util/FileUtils.java +++ b/src/main/java/mpi/tools/javatools/util/FileUtils.java @@ -11,8 +11,12 @@ import java.io.InputStreamReader; import java.io.OutputStreamWriter; import java.nio.charset.Charset; +import java.util.ArrayList; import java.util.Collection; import java.util.LinkedList; +import java.util.List; +import java.lang.*; +import java.io.*; import gnu.trove.iterator.TObjectIntIterator; import gnu.trove.map.hash.TObjectIntHashMap; @@ -218,6 +222,59 @@ private static void getAllFilesRecursively( } } } + + /** + * Returns the content of the file as list. Linebreaks + * are encoded as unix newlines (\n). + *author: Gad + * @param fileName File to get String content from. + + * @return String content of file. + * + * @throws IOException + */ + public static List getFileContentasList(String fileName) throws IOException { + return getFileContentasList(new File(fileName)); + } + + public static List getFileContentasList(InputStream fileName) throws IOException { + ArrayList lines=new ArrayList<>(); + BufferedReader reader = getBufferedUTF8Reader(fileName); + for (String line = reader.readLine(); + line != null; + line = reader.readLine()){ + lines.add(line); + } + + + reader.close(); + return lines; + } + + /** + * Returns the content of the file as list. Linebreaks + * are encoded as unix newlines (\n). + *author: Gad + * @param file File to get String content from. + + * @return String content of file. + * + * @throws IOException + */ + public static List getFileContentasList(File file) throws IOException { + ArrayList lines=new ArrayList<>(); + BufferedReader reader = getBufferedUTF8Reader(file); + for (String line = reader.readLine(); + line != null; + line = reader.readLine()){ + lines.add(line); + } + + + reader.close(); + return lines; + } + public static void main(String[] args) throws IOException { verifyOrderedFile(new File(args[0]), false);