From b4fd61dba7e41f468595dbb305677ff02d4542a2 Mon Sep 17 00:00:00 2001 From: gadelrab Date: Tue, 16 Feb 2016 11:00:16 +0100 Subject: [PATCH] adding yago facts and locations --- scripts/download_large_data.sh | 14 +++++- .../predicatelifting/YagoFactsReducer.java | 15 ++---- .../java/de/mpii/yagotools/YagoLocation.java | 49 +++++++++++++++++-- .../mpii/yagotools/utils/YagoDataReader.java | 27 +++++----- .../mpii/yagotools/utils/YagoRelations.java | 1 + .../mpi/tools/javatools/datatypes/Pair.java | 2 +- 6 files changed, 75 insertions(+), 33 deletions(-) diff --git a/scripts/download_large_data.sh b/scripts/download_large_data.sh index 7c3b677..39ab950 100644 --- a/scripts/download_large_data.sh +++ b/scripts/download_large_data.sh @@ -26,10 +26,22 @@ mkdir -p $DATA_DIR #wget -P $DATA_DIR wget -P $DATA_DIR http://resources.mpi-inf.mpg.de/yago-naga/yago/download/yago/yagoTaxonomy.tsv.7z wget -P $DATA_DIR http://resources.mpi-inf.mpg.de/yago-naga/yago/download/yago/yagoSimpleTypes.tsv.7z +wget -P $DATA_DIR http://resources.mpi-inf.mpg.de/yago-naga/yago/download/yago/yagoGeonamesOnlyData.tsv.7z +wget -P $DATA_DIR http://resources.mpi-inf.mpg.de/yago-naga/yago/download/yago/yagoFacts.tsv.7z + #uncompress -7z x $DATA_DIR/*.7z -o$DATA_DIR + +for f in $DATA_DIR/*.7z; do + echo $f + 7z x $f -o$DATA_DIR +done + + +#Filter data for reduction +grep '' $DATA_DIR/yagoGeonamesOnlyData.tsv > $DATA_DIR/isLocatedInData.tsv +grep '' $DATA_DIR/yagoFacts.tsv >> $DATA_DIR/isLocatedInData.tsv #remove archives rm $DATA_DIR/*.7z diff --git a/src/main/java/de/mpii/predicatelifting/YagoFactsReducer.java b/src/main/java/de/mpii/predicatelifting/YagoFactsReducer.java index 477388a..67a6109 100644 --- a/src/main/java/de/mpii/predicatelifting/YagoFactsReducer.java +++ b/src/main/java/de/mpii/predicatelifting/YagoFactsReducer.java @@ -18,29 +18,20 @@ */ public class YagoFactsReducer { - final static String COUNTRIES_FILE="resources/countries.tsv"; + YagoSimpleTypes yst; - ImmutableSet countriesSet; + public YagoFactsReducer(){ yst=YagoSimpleTypes.getInstance(); - //location-based load countries - loadLocations(); - } - private void loadLocations() { - try { - String fileContect= FileUtils.getFileContent(new File(COUNTRIES_FILE)); - countriesSet=ImmutableSet.copyOf(fileContect.split("\n")); - } catch (IOException e) { - e.printStackTrace(); - } } + public void reduceToType(String factSourceFilePath, String []relations) { try { reduceToType( FactSource.from(factSourceFilePath), relations); diff --git a/src/main/java/de/mpii/yagotools/YagoLocation.java b/src/main/java/de/mpii/yagotools/YagoLocation.java index 60a8587..6296dd9 100644 --- a/src/main/java/de/mpii/yagotools/YagoLocation.java +++ b/src/main/java/de/mpii/yagotools/YagoLocation.java @@ -1,8 +1,17 @@ package de.mpii.yagotools; +import com.google.common.collect.ImmutableSet; import com.google.common.collect.Multimap; +import com.google.common.collect.Sets; import de.mpii.yagotools.utils.YagoDataReader; import de.mpii.yagotools.utils.YagoRelations; +import mpi.tools.javatools.util.FileUtils; + +import java.io.File; +import java.io.IOException; +import java.util.Collection; +import java.util.Collections; +import java.util.Set; /** @@ -11,19 +20,32 @@ public class YagoLocation { //private static final String SUB_CLASS_OF = "rdfs:subClassOf"; - String TAXONOMY_FILE_PATH="data/yagoTaxonomy.tsv"; + String LOCATION_FILE_PATH ="data/isLocatedInData.tsv"; + final static String COUNTRIES_FILE="src/resources/countries.tsv"; private static YagoLocation instance; private Multimap typesParents; + ImmutableSet countriesSet; private YagoLocation(){ - typesParents= YagoDataReader.loadSubject2ObjectMap(TAXONOMY_FILE_PATH,new String[]{YagoRelations.SUB_CLASS_OF}); + typesParents= YagoDataReader.loadSubject2ObjectMap(LOCATION_FILE_PATH,new String[]{YagoRelations.IS_LOCATED_IN}); + loadCountries(); + } + + private void loadCountries() { + try { + String fileContect= FileUtils.getFileContent(new File(COUNTRIES_FILE)); + countriesSet= ImmutableSet.copyOf(fileContect.split("\n")); + } catch (IOException e) { + e.printStackTrace(); + } } + public static YagoLocation getInstance(){ if (instance==null){ instance=new YagoLocation(); @@ -32,8 +54,29 @@ public static YagoLocation getInstance(){ } - public static void main (String [] args){ + public String getParentCountry(String entity){ + + String parentCountry; + if (countriesSet.contains(entity)) + return entity; + + Collection parents=typesParents.get(entity); + //System.out.println(parents); + Set countryParents=Sets.intersection(ImmutableSet.copyOf(parents), countriesSet); + //System.out.println(countryParents); + + if(countryParents==null||countryParents.isEmpty()) + return entity; + else + return (String) countryParents.toArray()[0]; + } + + + public static void main (String [] args) { YagoLocation yt= YagoLocation.getInstance(); + System.out.println(yt.getParentCountry("")); + System.out.println(yt.getParentCountry("")); + System.out.println(yt.getParentCountry("")); } diff --git a/src/main/java/de/mpii/yagotools/utils/YagoDataReader.java b/src/main/java/de/mpii/yagotools/utils/YagoDataReader.java index d0c350d..adcd80c 100644 --- a/src/main/java/de/mpii/yagotools/utils/YagoDataReader.java +++ b/src/main/java/de/mpii/yagotools/utils/YagoDataReader.java @@ -3,6 +3,8 @@ import com.google.common.collect.HashMultimap; import com.google.common.collect.ImmutableSet; import com.google.common.collect.Multimap; +import mpi.tools.basics3.Fact; +import mpi.tools.basics3.FactSource; import mpi.tools.javatools.filehandlers.UTF8Reader; @@ -10,6 +12,7 @@ import java.io.FileNotFoundException; import java.io.IOException; +import java.net.MalformedURLException; /** * Created by gadelrab on 2/11/16. @@ -27,30 +30,22 @@ public static Multimap loadSubject2ObjectMap(String filePath, Str UTF8Reader fileReader; try { - fileReader=new UTF8Reader(new File(filePath),"Loading Data"); - String line; - while((line=fileReader.readLine())!=null){ - String[] lineParts=line.split("\t"); - int subIndex=0,predIndex=1,objIndex=2; - if (lineParts.length>3){ - subIndex++;predIndex++;objIndex++; - } - if(relationsSet==null||relationsSet.contains(lineParts[predIndex])){ - subjectObjectMap.put(lineParts[subIndex],lineParts[objIndex]); - } + //String line; + + for(Fact f: FactSource.from(filePath)) + if(relationsSet==null||relationsSet.contains(f.getRelation())){ + subjectObjectMap.put(f.getSubject(),f.getObject());} - } System.out.println( "Dictionary size: "+ subjectObjectMap.size()); return subjectObjectMap; - } catch (FileNotFoundException e) { - e.printStackTrace(); - } catch (IOException e1) { - e1.printStackTrace(); + + } catch (MalformedURLException e2) { + e2.printStackTrace(); } return null; diff --git a/src/main/java/de/mpii/yagotools/utils/YagoRelations.java b/src/main/java/de/mpii/yagotools/utils/YagoRelations.java index 147393f..b5f3f7b 100644 --- a/src/main/java/de/mpii/yagotools/utils/YagoRelations.java +++ b/src/main/java/de/mpii/yagotools/utils/YagoRelations.java @@ -10,4 +10,5 @@ public class YagoRelations { public static final String SUB_CLASS_OF="rdfs:subClassOf"; + public static final String IS_LOCATED_IN=""; } diff --git a/src/main/java/mpi/tools/javatools/datatypes/Pair.java b/src/main/java/mpi/tools/javatools/datatypes/Pair.java index 4733439..ded99f2 100644 --- a/src/main/java/mpi/tools/javatools/datatypes/Pair.java +++ b/src/main/java/mpi/tools/javatools/datatypes/Pair.java @@ -1,6 +1,6 @@ package mpi.tools.javatools.datatypes; -import mpi.aida.data.PreparedInputChunk; + import java.io.Serializable;