Skip to content

Commit

Permalink
Browse files Browse the repository at this point in the history
…2018 into dev
  • Loading branch information
renewiegandt committed Dec 10, 2018
2 parents b45f676 + ffed012 commit 037b2bf
Show file tree
Hide file tree
Showing 10 changed files with 80 additions and 56 deletions.
15 changes: 8 additions & 7 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ export PATH=[meme-suite instalation path]/bin:$PATH

Download all files from the [GitHub repository](https://github.molgen.mpg.de/loosolab/masterJLU2018).
The Nextflow-script needs a conda environment to run. Nextflow can create the needed environment from the given yaml-file.
On some systems Nrxtflow exits the run with following error:
On some systems Nextflow exits the run with following error:
```
Caused by:
Failed to create Conda environment
Expand All @@ -42,19 +42,20 @@ When the enviroment is created, set the variable 'path_env' in the configuration
nextflow run pipeline.nf --input [BigWig-file] --bed [BED-file] --genome_fasta [FASTA-file] --jaspar_db [MEME-file]
```
## Parameters
For a detailed overview for all parameters follow this [link](https://github.molgen.mpg.de/loosolab/masterJLU2018/wiki/Configuration).
```
Required arguments:
--input Path to BigWig-file
--bed Path to BED-file
--input Path to BigWig-file with scores on the peaks of interest
--bed Path to BED-file with peaks of interest corresponding to the BigWig file
--genome_fasta Path to genome in FASTA-format
--jaspar_db Path to motif-database in MEME-format
Optional arguments:
Footprint extraction:
--window_length INT (Default: 200)
--step INT (Default: 100)
--percentage INT(Default: 0)
--window_length INT (Default: 200) a length of a window
--step INT (Default: 100) an interval to slide the window
--percentage INT (Default: 0) a percentage to be added to background while searching for footprints
Filter unknown motifs:
--min_size_fp INT (Default: 10)
Expand All @@ -81,7 +82,7 @@ Optional arguments:
--interation INT Number of iterations done by glam2. More iterations: better results, higher runtime. (Default: 10000)
--tomtom_treshold float Threshold for similarity score. (Default: 0.01)
Moitf clustering:
Motif clustering:
--edge_weight INT Minimum weight of edges in motif-cluster-graph (Default: 5)
--motif_similarity_thresh FLOAT threshold for motif similarity score (Default: 0.00001)
Expand Down
11 changes: 7 additions & 4 deletions bin/Modules/Ensembl/ActivityCategorizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@

class ActivityCategorizer:

def __init__(self, release, organism, wd):
def __init__(self, release, organism, wd, data_dir):

# List of all Folders with Activity Tables

Expand All @@ -18,7 +18,7 @@ def __init__(self, release, organism, wd):

self.activity = {}

self.get_activity_data(release, organism, wd)
self.get_activity_data(release, organism, wd, data_dir)

# Categorized Activity from Json-config
print("Categorization: This may take a while")
Expand All @@ -41,11 +41,14 @@ def read_config(self, organism, wd):

return c_dict

def get_activity_data(self, release, organism, wd):
def get_activity_data(self, release, organism, wd, data_dir):

for folder in self.folderlist:
# Generate path to binary File
file = os.path.join(wd + "/EnsemblData", release, organism, "activity", folder, "table.bin")
if data_dir:
file = os.path.join(data_dir + "/EnsemblData", release, organism, "activity", folder, "table.bin")
else:
file = os.path.join(wd + "/data/EnsemblData", release, organism, "activity", folder, "table.bin")
with open(file, "rb") as tables:
self.activity[folder] = bytearray(tables.read())

Expand Down
7 changes: 5 additions & 2 deletions bin/Modules/Ensembl/ActivityTable.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,11 @@ class ActivityTable:
4, "activity=NA"
"""

def __init__(self, organism, current_release, wd):
self.link = os.path.join(wd + "/EnsemblData/", current_release, organism, "activity")
def __init__(self, organism, current_release, wd, data_dir):
if data_dir:
self.link = os.path.join(data_dir + "/EnsemblData/", current_release, organism, "activity")
else:
self.link = os.path.join(wd + "/data/EnsemblData/", current_release, organism, "activity")
self.folders = next(os.walk(self.link))[1]
self.generator = ATGenerator(["activity=ACTIVE",
"activity=POISED",
Expand Down
10 changes: 5 additions & 5 deletions bin/Modules/Ensembl/Ensembl.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,15 +6,15 @@

class Ensembl:

def __init__(self, organism, wd):
def __init__(self, organism, wd, data_dir):
print("Starting Ensembl")
self.updater = FTPRetriever(organism, wd)
self.updater = FTPRetriever(organism, wd, data_dir)
self.release = self.updater.get_release()
self.acttable = ActivityTable(organism, self.release, wd)
self.acttable = ActivityTable(organism, self.release, wd, data_dir)
self.acttable.check_and_generate_activity_table()
self.categorizer = ActivityCategorizer(self.release, organism, wd)
self.categorizer = ActivityCategorizer(self.release, organism, wd, data_dir)
print("Generating GTF")
self.gtf_generator = GTFGen(organism, self.release, wd)
self.gtf_generator = GTFGen(organism, self.release, wd, data_dir)

print("Ensembl Finished !")

Expand Down
4 changes: 2 additions & 2 deletions bin/Modules/Ensembl/FTPHandling/URLRetrieve.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,8 @@ def change_dir(self, wd):
def get_all_entries(self):
return self.ftp.nlst()

def get_all_entries_from_dir(self, dir):
self.change_dir(dir)
def get_all_entries_from_dir(self, dire):
self.change_dir(dire)
return self.get_all_entries()

def get_all_entries_as_FTPEntry(self):
Expand Down
44 changes: 28 additions & 16 deletions bin/Modules/Ensembl/FTPHandling/VersionChecker.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,12 +9,12 @@ class EnsemblRegulationFTPRetriever:
And downloading newest version if necessary
"""

def __init__(self, organism, wd):
def __init__(self, organism, wd, data_dir):
self.site_ftp = FTPHandler("ftp.ensembl.org", "pub")
self.remoteversion = self.get_current_ftp_version()
self.localversion = self.get_current_local_version(wd)
if self.check_version_difference(organism, wd):
self.download_currentversion_version(self.remoteversion, organism, wd)
self.localversion = self.get_current_local_version(wd, data_dir)
if self.check_version_difference(organism, wd, data_dir):
self.download_currentversion_version(self.remoteversion, organism, wd, data_dir)
else:
print("Newest Version installed, no update needed.")

Expand All @@ -31,15 +31,25 @@ def get_current_ftp_version(self):
print("Current release is "+c_release)
return c_release

def check_organism(self, organism, release, wd):
if organism in next(os.walk(os.path.join(wd+"/EnsemblData/"+release+"/")))[1]:
return False
def check_organism(self, organism, release, wd, data_dir):
if data_dir:
if organism in next(os.walk(os.path.join(data_dir+"/EnsemblData/"+release+"/")))[1]:
return False
else:
print("No Local Version for "+organism+" installed. Installing...")
return True
else:
print("No Local Version for "+organism+" installed. Installing...")
return True
if organism in next(os.walk(os.path.join(wd+"/data/EnsemblData/"+release+"/")))[1]:
return False
else:
print("No Local Version for "+organism+" installed. Installing...")
return True

def get_current_local_version(self, wd):
directories = next(os.walk(os.path.join(wd+"/EnsemblData/")))[1]
def get_current_local_version(self, wd, data_dir):
if data_dir:
directories = next(os.walk(os.path.join(data_dir + "/EnsemblData/")))[1]
else:
directories = next(os.walk(os.path.join(wd+"/data/EnsemblData/")))[1]
for dir in directories:
if "release" in dir:
localversion = sorted(directories, reverse=True)[0]
Expand All @@ -51,7 +61,7 @@ def get_current_local_version(self, wd):
print("No Version installed !")
return None

def check_version_difference(self, organism, wd):
def check_version_difference(self, organism, wd, data_dir):

local_version = self.localversion
remote_version = self.remoteversion
Expand All @@ -64,16 +74,18 @@ def check_version_difference(self, organism, wd):
print("Outdated Version detected ! local: " + local_version + " remote: " + remote_version)
return True
else:
if self.check_organism(organism, local_version, wd):
if self.check_organism(organism, local_version, wd, data_dir):
return True
else:
return False

def download_currentversion_version(self, version, organism, wd):
def download_currentversion_version(self, version, organism, wd, data_dir):

# Download Base File

targetfolder = os.path.join(wd+"/EnsemblData/", version, organism)
if data_dir:
targetfolder = os.path.join(data_dir + "/EnsemblData/", version, organism)
else:
targetfolder = os.path.join(wd+"/data/EnsemblData/", version, organism)
os.makedirs(targetfolder)
folder_url = "/pub/"+version+"/regulation/"+organism+"/"
self.site_ftp.change_dir(folder_url)
Expand Down
12 changes: 7 additions & 5 deletions bin/Modules/Ensembl/GTFGen.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,14 +5,16 @@

class GTFGen:

def __init__(self, organism, release, wd):
def __init__(self, organism, release, wd, data_dir):

self.gff_lines = self.get_organism_as_gff(organism, release, wd)
self.gff_lines = self.get_organism_as_gff(organism, release, wd, data_dir)
self.value_map = {0: "ACTIVE", 1: "POISED", 2: "REPRESSED", 3: "INACTIVE", 4: "NA"}

def get_organism_as_gff(self, organism, release, wd):

directory = os.path.join(wd + "/EnsemblData/", release, organism)
def get_organism_as_gff(self, organism, release, wd, data_dir):
if data_dir:
directory = os.path.join(data_dir + "/EnsemblData/", release, organism)
else:
directory = os.path.join(wd + "/data/EnsemblData/", release, organism)
inputfile = ""
for file in os.listdir(directory):
if file.endswith("gff.gz"):
Expand Down
7 changes: 5 additions & 2 deletions bin/Modules/ucsc/ucsc.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,10 +7,13 @@

class UcscGtf:

def __init__(self, org, wd):
def __init__(self, org, wd, data_dir):
self.organism_id = self.get_organism_id(org)
self.link = "http://hgdownload.soe.ucsc.edu/gbdb/"+self.organism_id+"/ncbiRefSeq/refSeqFuncElems.bb"
self.output = os.path.join(wd + "/UCSCData/"+self.organism_id+".bed")
if data_dir:
self.output = os.path.join(data_dir + "/UCSCData" + self.organism_id+".bed")
else:
self.output = os.path.join(wd + "/data/UCSCData/" + self.organism_id+".bed")
self.path_to_bin = os.path.join(wd + "/Modules/ucsc/bigBedToBed")
print("Getting UCSC Data")
print("Path to Bin: " + self.path_to_bin)
Expand Down
24 changes: 12 additions & 12 deletions bin/RegGTFExtractor.py
100644 → 100755
Original file line number Diff line number Diff line change
Expand Up @@ -13,14 +13,14 @@ def check_for_local_folder(wd):

if not os.path.isdir(os.path.join(wd+"/EnsemblData")):

os.mkdir(os.path.join(wd+"/EnsemblData"))
os.mkdir(os.path.join(wd+"/data/EnsemblData"))

if not os.path.isdir(os.path.join(wd+"/UCSCData" )):
os.mkdir(os.path.join(wd+"/UCSCData" ))
if not os.path.isdir(os.path.join(wd+"/data/UCSCData")):
os.mkdir(os.path.join(wd+"/data/UCSCData"))


def check_filter(tissue_cmd, org, wd):
path_to_config = os.path.join(wd + "/config/celltypes_" + org + ".json" )
path_to_config = os.path.join(wd + "/config/celltypes_" + org + ".json")
tissues_config = []
if not tissue_cmd:
return False
Expand All @@ -36,18 +36,18 @@ def check_filter(tissue_cmd, org, wd):
return False


def main_script(org, wd, tissuetype=None):

check_for_local_folder(wd)
def main_script(org, wd, data_dir, tissuetype=None):
if not data_dir:
check_for_local_folder(wd)
if check_filter(tissuetype, org, wd):
tissues = tissuetype
print("Filter detected !")
else:
tissues = None
print("Filter not detected !")

ucsc = UcscGtf(org, wd)
ense = Ensembl(org, wd)
ucsc = UcscGtf(org, wd, data_dir)
ense = Ensembl(org, wd, data_dir)
print("Getting Unique Results")
unique_filter = UniqueFilter(ense.get_gtf(), ucsc.get_gtf(), tissues)
ResultSaver(unique_filter.get_results(), org, tissues)
Expand All @@ -57,12 +57,12 @@ def main_script(org, wd, tissuetype=None):
parser = argparse.ArgumentParser(description='GTF-Generator from UCSC Table Browser and Ensembl Regulatory Build' )
parser.add_argument('organism', help='Source organism [ homo_sapiens or mus_musculus ]', action='store', nargs='?', type=str)
parser.add_argument('--tissue', help='Tissue- or Celltype(s)', action='store', nargs='*', type=str)
parser.add_argument('--wd', help='Working directory. default: "."', action='store', default='.', type=str)
parser.add_argument('--wd', help='Working directory. default: "."', action='store', default=os.getcwd(), type=str)
parser.add_argument('--dir', help='Data directory. default: "working_directory"', action='store', default="", type=str)
args = vars(parser.parse_args())
print("Working Dir: " + args["wd"])
if args["organism"]:
print("Working Dir: " + args["wd"])
main_script(args["organism"], args["wd"], args["tissue"])
main_script(args["organism"], args["wd"], args["dir"], args["tissue"])
else:
print("No Arguments found -> See python3 ./RegGTFExtractor.py -h for help.")

2 changes: 1 addition & 1 deletion pipeline.nf
Original file line number Diff line number Diff line change
Expand Up @@ -117,7 +117,7 @@ All arguments can be set in the configuration files.

bigwig_input.combine(bed_input).set{footprint_in}
/*
this process uses the discontinuous score from a bigWig file to estimate footprints within peaks of interest
*/
process footprint_extraction {
conda "${path_env}"
Expand Down

0 comments on commit 037b2bf

Please sign in to comment.