Permalink
Cannot retrieve contributors at this time
Name already in use
A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
master_project_JLU2018/bin/3.1_create_gtf/RegGTFExtractor.py
Go to fileThis commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
167 lines (121 sloc)
5.52 KB
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
"""
RegGTFExtractor.py extracts regulatory data from the Ensembl and UCSC databases
and converts the output to a GTF-formatted file.
@author: Sebastian Beyvers
@contact: sebastian.beyvers@med.uni-giessen.de
"""
import argparse | |
from Modules.Ensembl.Ensembl import Ensembl | |
from Modules.ucsc.ucsc import UcscGtf | |
from Modules.Uniquifier import UniqueFilter | |
from Modules.SaveResults import ResultSaver | |
from Modules.CrossMapper import CrossMapper | |
from Modules.Validator import Validator | |
import os | |
import json | |
def check_for_local_folder(wd):
    """Create the default data folder layout below the working directory.

    Used when no explicit data directory is specified. Ensures that
    ``<wd>/data`` and its sub-folders ``EnsemblData``, ``UCSCData`` and
    ``temp`` exist.

    Args:
        wd: Working directory under which the ``data`` folder is created.

    Returns:
        None
    """
    data_root = os.path.join(wd, "data")
    # makedirs(exist_ok=True) replaces the check-then-mkdir sequence: it
    # creates missing parent folders as well and does not fail if a folder
    # already exists (or appears between a check and the creation).
    for sub_dir in ("EnsemblData", "UCSCData", "temp"):
        os.makedirs(os.path.join(data_root, sub_dir), exist_ok=True)
def check_for_data_dir(data_dir):
    """Create the data folder layout inside an explicitly given data directory.

    Used when a data directory is passed as a parameter. Ensures that
    ``data_dir`` itself and its sub-folders ``EnsemblData``, ``UCSCData``
    and ``temp`` exist.

    Args:
        data_dir: Path of the data directory.

    Returns:
        None
    """
    # makedirs(exist_ok=True) also creates missing parent folders, so a
    # nested data_dir whose parents do not exist yet no longer fails, and
    # already existing folders are left untouched.
    os.makedirs(data_dir, exist_ok=True)
    for sub_dir in ("EnsemblData", "UCSCData", "temp"):
        os.makedirs(os.path.join(data_dir, sub_dir), exist_ok=True)
def check_filter(tissue_cmd, org, wd):
    """Check whether any requested tissue/cell type is listed in the config.

    The config file ``<wd>/config/celltypes_<org>.json`` is read as a JSON
    sequence whose entries each provide a ``"type"`` value; those values are
    compared against the requested filter.

    Args:
        tissue_cmd: Filtered tissue types (iterable of names); may be
            ``None`` or empty when no filter is requested.
        org: Organism name used in the config file name.
        wd: Working directory that contains the ``config`` folder.

    Returns:
        bool: True if at least one entry of ``tissue_cmd`` matches a
        ``"type"`` value of the config, otherwise False. Without a filter
        the result is False and the config file is not touched.
    """
    # No filter requested: answer before building the path or opening the file.
    if not tissue_cmd:
        return False

    path_to_config = os.path.join(wd, "config", "celltypes_" + org + ".json")
    with open(path_to_config, encoding="utf-8") as input_file:
        data = json.load(input_file)

    tissues_config = [entry["type"] for entry in data]
    return any(tissue in tissues_config for tissue in tissue_cmd)
def check_organism(org):
    """Map an assembly name to its organism alias and crossmapping need.

    Args:
        org: Input organism/assembly (``"hg19"``, ``"hg38"``, ``"mm9"`` or
            ``"mm10"``).

    Returns:
        tuple: ``(organism_alias, crossmapping_needed)`` where
        ``organism_alias`` is a string and ``crossmapping_needed`` is a
        boolean that is True for the older assemblies (hg19, mm9).

    Raises:
        ValueError: If ``org`` is not one of the supported assemblies.
            Previously this case returned ``None`` implicitly, which made
            the caller's tuple unpacking fail with an unrelated TypeError.
    """
    # assembly -> (organism alias, newer assembly to crossmap from or None)
    assemblies = {
        "hg38": ("homo_sapiens", None),
        "hg19": ("homo_sapiens", "hg38"),
        "mm10": ("mus_musculus", None),
        "mm9": ("mus_musculus", "mm10"),
    }
    if org not in assemblies:
        raise ValueError("Unsupported organism: {!r}".format(org))

    alias, crossmap_source = assemblies[org]
    if crossmap_source is None:
        return alias, False

    print("Older assembly Version detected: " + org
          + " -> Crossmapping result from " + crossmap_source)
    return alias, True
def main_script(organism, wd, data_dir, out, tissuetype=None):
    """Run the complete extraction pipeline and write the resulting GTF file.

    Args:
        organism: Assembly name as given on the command line.
        wd: Working directory.
        data_dir: Data directory; an empty value selects the default
            ``data`` folder below ``wd``.
        out: Path to the output file; ``"."`` selects ``./<organism>.gtf``.
        tissuetype: Optional list of tissue/cell types used as filter.

    Returns:
        None
    """
    # If no output parameter is given the output file is "./organism.gtf".
    if out == ".":
        out = "./" + organism + ".gtf"

    org, x_mappable = check_organism(organism)

    # Prepare the folder layout in whichever location holds the data.
    use_data_dir = bool(data_dir)
    if use_data_dir:
        check_for_data_dir(data_dir)
    else:
        check_for_local_folder(wd)

    # The tissue filter is only applied when it matches the config.
    filter_found = check_filter(tissuetype, org, wd)
    tissues = tissuetype if filter_found else None
    print("Filter detected !" if filter_found else "Filter not detected !")

    # UCSC data source
    ucsc = UcscGtf(org, wd, data_dir)
    # Ensembl data source
    ense = Ensembl(org, wd, data_dir)

    print("Getting Unique Results")
    unique_filter = UniqueFilter(ense.get_gtf(), ucsc.get_gtf(), tissues)

    # Results go to the data directory when one was given, else to wd; the
    # boolean flag tells the helpers which of the two was passed.
    target_dir = data_dir if use_data_dir else wd
    ResultSaver(unique_filter.get_results(), organism, target_dir, x_mappable, use_data_dir, out)
    if x_mappable:
        CrossMapper(organism, target_dir, out, use_data_dir)

    # Validate the output file.
    Validator(out)
if __name__ == '__main__':
    # Command line interface: one optional positional assembly name plus
    # options for tissue filter, working directory, data directory and output.
    parser = argparse.ArgumentParser(
        description='GTF-Generator from UCSC Table Browser and Ensembl Regulatory Build')
    parser.add_argument(
        'organism', help='Source organism [ hg19 | hg38 or mm9 | mm10 ]',
        action='store', nargs='?', type=str)
    parser.add_argument(
        '--tissue', help='Tissue- or Celltype(s)',
        action='store', nargs='*', type=str)
    parser.add_argument(
        '--wd', help='Working directory. default: "."',
        action='store', default=os.getcwd(), type=str)
    parser.add_argument(
        '--dir', help='Data directory. default: "working_directory"',
        action='store', default="", type=str)
    parser.add_argument(
        '--out', help='Path to output file: default: "./organism.gtf"',
        action='store', default=".", type=str)
    args = vars(parser.parse_args())

    selected_organism = args["organism"]
    if not selected_organism:
        # No positional argument was passed at all.
        print("No Arguments found -> See python3 ./RegGTFExtractor.py -h for help.")
    elif selected_organism not in ["hg19", "hg38", "mm9", "mm10"]:
        print("Invalid Organism: " + selected_organism + " see -h for help")
    else:
        print("Working Dir: " + args["wd"])
        main_script(selected_organism, args["wd"], args["dir"], args["out"], args["tissue"])