Skip to content
Permalink
b7c80c8559
Switch branches/tags

Name already in use

A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
Go to file
 
 
Cannot retrieve contributors at this time
147 lines (105 sloc) 4.79 KB
"""
RegGTFExtractor.py extracts regulatory data from the Ensembl and UCSC databases
and converts the output to a GTF-formatted file.
@author: Sebastian Beyvers
@contact: sebastian.beyvers@med.uni-giessen.de
"""
import argparse
from Modules.Ensembl.Ensembl import Ensembl
from Modules.ucsc.ucsc import UcscGtf
from Modules.Uniquifier import UniqueFilter
from Modules.SaveResults import ResultSaver
from Modules.CrossMapper import CrossMapper
import os
import json
def check_for_local_folder(wd):
    """Ensure the default data folder layout exists below the working directory.

    Used when no explicit data directory is specified. Creates ``<wd>/data``
    together with its ``EnsemblData``, ``UCSCData`` and ``temp`` subfolders
    when they are missing; folders that already exist are left untouched.

    Args:
        wd: Path of the working directory.

    Raises:
        OSError: If a folder cannot be created, e.g. because a regular file
            already occupies one of the paths.
    """
    data_root = os.path.join(wd, "data")
    # makedirs(exist_ok=True) also creates missing parent folders and does not
    # fail on existing ones, so no separate isdir() check (and no
    # check-then-create race) is needed.
    os.makedirs(data_root, exist_ok=True)
    for subfolder in ("EnsemblData", "UCSCData", "temp"):
        os.makedirs(os.path.join(data_root, subfolder), exist_ok=True)
def check_for_data_dir(data_dir):
    """Ensure the folder layout exists inside an explicitly given data directory.

    Used when a data directory is specified as a parameter. Creates
    ``data_dir`` itself and its ``EnsemblData``, ``UCSCData`` and ``temp``
    subfolders when they are missing; existing folders are left untouched.

    Args:
        data_dir: Path of the data directory.

    Raises:
        OSError: If a folder cannot be created, e.g. because ``data_dir`` is
            empty or a regular file already occupies one of the paths.
    """
    # makedirs(exist_ok=True) also creates missing parent folders (so nested,
    # not-yet-existing data directories work) and tolerates existing ones.
    os.makedirs(data_dir, exist_ok=True)
    for subfolder in ("EnsemblData", "UCSCData", "temp"):
        os.makedirs(os.path.join(data_dir, subfolder), exist_ok=True)
def check_filter(tissue_cmd, org, wd):
    """Tell whether any requested tissue/cell type is listed for the organism.

    The known types are the ``"type"`` values of the entries found in
    ``<wd>/config/celltypes_<org>.json``.

    Args:
        tissue_cmd: Requested tissue or cell type names; may be None or empty.
        org: Organism name used in the config file name.
        wd: Working directory that contains the ``config`` folder.

    Returns:
        True if at least one requested name appears in the config file,
        False otherwise (including when nothing was requested).
    """
    config_file = os.path.join(wd + "/config/celltypes_" + org + ".json")

    # Nothing requested: the config file is not read at all.
    if not tissue_cmd:
        return False

    with open(config_file) as handle:
        entries = json.load(handle)
        known_types = [entry["type"] for entry in entries]

    # A single match is enough to accept the filter.
    for requested in tissue_cmd:
        if requested in known_types:
            return True
    return False
def check_organism(org):
    """Translate an assembly identifier into a species name and a cross-mapping flag.

    Args:
        org: Assembly identifier ("hg38", "hg19", "mm10" or "mm9").

    Returns:
        A ``(species, needs_crossmapping)`` tuple for the four known
        assemblies. For the older assemblies (hg19, mm9) a notice is printed
        and the flag is True. Any other identifier yields None.
    """
    # (assembly, species, notice printed when cross-mapping is needed or None)
    assemblies = (
        ("hg38", "homo_sapiens", None),
        ("hg19", "homo_sapiens",
         "Older assembly Version detected: hg19 -> Crossmapping result from hg38"),
        ("mm10", "mus_musculus", None),
        ("mm9", "mus_musculus",
         "Older assembly Version detected: mm9 -> Crossmapping result from mm10"),
    )
    for assembly, species, notice in assemblies:
        if org == assembly:
            if notice is None:
                return species, False
            print(notice)
            return species, True
    # Unknown assembly: callers receive None.
    return None
def main_script(organism, wd, data_dir, out, tissuetype=None):
    """Run the complete extraction pipeline for one assembly.

    Prepares the data folders, validates the optional tissue filter, builds
    the UCSC and Ensembl sources, combines them through UniqueFilter, hands
    the result to ResultSaver and, for older assemblies, runs CrossMapper.

    Args:
        organism: Assembly identifier as understood by check_organism.
        wd: Working directory; also used as data location when data_dir is empty.
        data_dir: Explicit data directory, or an empty value to fall back to wd.
        out: Output location passed on to ResultSaver and CrossMapper.
        tissuetype: Optional list of tissue/cell type names used as filter.
    """
    species, needs_crossmap = check_organism(organism)

    # Decide once where the data lives; the flag is forwarded to the savers.
    custom_data_dir = bool(data_dir)
    if custom_data_dir:
        check_for_data_dir(data_dir)
    else:
        check_for_local_folder(wd)
    base_dir = data_dir if custom_data_dir else wd

    # Only keep the requested tissues when the config knows at least one of them.
    filter_active = check_filter(tissuetype, species, wd)
    tissues = tissuetype if filter_active else None
    print("Filter detected !" if filter_active else "Filter not detected !")

    # UCSC source first, then Ensembl (same construction order as before).
    ucsc = UcscGtf(species, wd, data_dir)
    ense = Ensembl(species, wd, data_dir)

    print("Getting Unique Results")
    unique_filter = UniqueFilter(ense.get_gtf(), ucsc.get_gtf(), tissues)

    ResultSaver(unique_filter.get_results(), organism, base_dir,
                needs_crossmap, custom_data_dir, tissues, out)
    if needs_crossmap:
        CrossMapper(organism, base_dir, out, custom_data_dir)
if __name__ == '__main__':
    # Command-line interface: one optional positional assembly plus options.
    cli = argparse.ArgumentParser(description='GTF-Generator from UCSC Table Browser and Ensembl Regulatory Build' )
    cli.add_argument('organism', help='Source organism [ hg19 | hg38 or mm9 | mm10 ]', action='store', nargs='?', type=str)
    cli.add_argument('--tissue', help='Tissue- or Celltype(s)', action='store', nargs='*', type=str)
    cli.add_argument('--wd', help='Working directory. default: "."', action='store', default=os.getcwd(), type=str)
    cli.add_argument('--dir', help='Data directory. default: "working_directory"', action='store', default="", type=str)
    cli.add_argument('--out', help='Output directory: default: "."', action='store', default=".", type=str)
    options = cli.parse_args()

    requested = options.organism
    if not requested:
        # No positional argument given at all.
        print("No Arguments found -> See python3 ./RegGTFExtractor.py -h for help.")
    elif requested not in ["hg19", "hg38", "mm9", "mm10"]:
        # Only the four listed assemblies are supported.
        print("Invalid Organism: " + requested + " see -h for help")
    else:
        print("Working Dir: " + options.wd)
        main_script(requested, options.wd, options.dir, options.out, options.tissue)