Permalink
Cannot retrieve contributors at this time
Name already in use
A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
master_project_JLU2018/bin/3.1_create_gtf/RegGTFExtractor.py
Go to fileThis commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
147 lines (105 sloc)
4.79 KB
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
RegGTFExtractor.py extracts regulatory data from the Ensembl and UCSC databases
and converts the output to a GTF-formatted file.
@author: Sebastian Beyvers | |
@contact: sebastian.beyvers@med.uni-giessen.de | |
""" | |
import argparse | |
from Modules.Ensembl.Ensembl import Ensembl | |
from Modules.ucsc.ucsc import UcscGtf | |
from Modules.Uniquifier import UniqueFilter | |
from Modules.SaveResults import ResultSaver | |
from Modules.CrossMapper import CrossMapper | |
import os | |
import json | |
def check_for_local_folder(wd):
    """Ensure the default data folder layout exists below the working directory.

    Used when no explicit data directory was specified. Creates ``<wd>/data``
    together with its ``EnsemblData``, ``UCSCData`` and ``temp`` subfolders
    when they are missing; existing folders are left untouched.

    :param wd: Path of the working directory.
    :raises OSError: If a folder cannot be created (e.g. a regular file is in
        the way or permissions are missing).
    """
    data_root = os.path.join(wd, "data")
    for subfolder in ("EnsemblData", "UCSCData", "temp"):
        # makedirs creates missing parents (including data_root itself) and,
        # with exist_ok, does not fail when the folder already exists or is
        # created by another process between a check and the create call.
        os.makedirs(os.path.join(data_root, subfolder), exist_ok=True)
def check_for_data_dir(data_dir):
    """Ensure the folder layout exists inside an explicitly given data directory.

    Used when a data directory was specified as parameter. Creates
    ``data_dir`` together with its ``EnsemblData``, ``UCSCData`` and ``temp``
    subfolders when they are missing; existing folders are left untouched.

    :param data_dir: Path of the data directory (may be nested and not yet
        existing).
    :raises OSError: If a folder cannot be created (e.g. a regular file is in
        the way or permissions are missing).
    """
    for subfolder in ("EnsemblData", "UCSCData", "temp"):
        # makedirs also creates data_dir and any missing parents of it, which
        # plain os.mkdir would refuse to do for a nested path.
        os.makedirs(os.path.join(data_dir, subfolder), exist_ok=True)
def check_filter(tissue_cmd, org, wd):
    """Report whether any requested tissue/cell type is listed for the organism.

    The known types are the ``"type"`` values of the entries found in
    ``<wd>/config/celltypes_<org>.json``.

    :param tissue_cmd: Requested tissue/cell type names; may be None or empty,
        in which case no config file is read.
    :param org: Organism name used to build the config file name.
    :param wd: Working directory that contains the ``config`` folder.
    :return: True if at least one requested type occurs in the config,
        otherwise False.
    :raises OSError: If a filter was requested but the config file cannot be
        opened.
    :raises KeyError: If a config entry has no ``"type"`` key.
    """
    # Without a requested filter there is nothing to validate.
    if not tissue_cmd:
        return False

    path_to_config = os.path.join(wd, "config", "celltypes_" + org + ".json")
    # JSON text is UTF-8; do not depend on the platform's default encoding.
    with open(path_to_config, encoding="utf-8") as input_file:
        tissues_config = [entry["type"] for entry in json.load(input_file)]

    return any(tissue in tissues_config for tissue in tissue_cmd)
def check_organism(org):
    """Map an assembly name to a species name and a cross-mapping flag.

    For the older assemblies (hg19, mm9) a notice is printed and the flag is
    True, signalling that results have to be cross-mapped from the newer
    assembly.

    :param org: Assembly name, one of "hg38", "hg19", "mm10" or "mm9".
    :return: Tuple ``(species, crossmapping_needed)``; None for any other
        input.
    """
    # (assembly, species, notice printed when cross-mapping is required)
    known_assemblies = (
        ("hg38", "homo_sapiens", None),
        ("hg19", "homo_sapiens",
         "Older assembly Version detected: hg19 -> Crossmapping result from hg38"),
        ("mm10", "mus_musculus", None),
        ("mm9", "mus_musculus",
         "Older assembly Version detected: mm9 -> Crossmapping result from mm10"),
    )
    for assembly, species, notice in known_assemblies:
        if org == assembly:
            if notice is None:
                return species, False
            print(notice)
            return species, True
    # Unknown assembly: fall through without a result, as callers validate
    # the organism before calling.
    return None
def main_script(organism, wd, data_dir, out, tissuetype=None):
    """Run the whole extraction: prepare folders, fetch data, filter and save.

    :param organism: Assembly name as given on the command line.
    :param wd: Working directory.
    :param data_dir: Data directory; a falsy value means the default layout
        below ``wd`` is used.
    :param out: Output directory handed to the result saver / cross-mapper.
    :param tissuetype: Optional list of tissue/cell type names to filter by.
    """
    species, needs_crossmap = check_organism(organism)

    # Make sure the folder layout exists in whichever location is in use.
    if data_dir:
        check_for_data_dir(data_dir)
    else:
        check_for_local_folder(wd)

    # Only keep the requested tissues when at least one is known in the config.
    filter_found = check_filter(tissuetype, species, wd)
    tissues = tissuetype if filter_found else None
    print("Filter detected !" if filter_found else "Filter not detected !")

    # UCSC source is set up first, then Ensembl.
    ucsc_source = UcscGtf(species, wd, data_dir)
    ensembl_source = Ensembl(species, wd, data_dir)

    print("Getting Unique Results")
    merged = UniqueFilter(ensembl_source.get_gtf(), ucsc_source.get_gtf(), tissues)

    # The saver and cross-mapper receive the base folder plus a flag telling
    # whether that folder is an explicit data directory.
    custom_dir = bool(data_dir)
    base_dir = data_dir if custom_dir else wd
    ResultSaver(merged.get_results(), organism, base_dir, needs_crossmap, custom_dir, tissues, out)
    if needs_crossmap:
        CrossMapper(organism, base_dir, out, custom_dir)
if __name__ == '__main__':
    # Command line interface definition.
    parser = argparse.ArgumentParser(
        description='GTF-Generator from UCSC Table Browser and Ensembl Regulatory Build')
    parser.add_argument('organism', action='store', nargs='?', type=str,
                        help='Source organism [ hg19 | hg38 or mm9 | mm10 ]')
    parser.add_argument('--tissue', action='store', nargs='*', type=str,
                        help='Tissue- or Celltype(s)')
    parser.add_argument('--wd', action='store', default=os.getcwd(), type=str,
                        help='Working directory. default: "."')
    parser.add_argument('--dir', action='store', default="", type=str,
                        help='Data directory. default: "working_directory"')
    parser.add_argument('--out', action='store', default=".", type=str,
                        help='Output directory: default: "."')
    args = vars(parser.parse_args())

    # Validate the organism before starting any work.
    if not args["organism"]:
        print("No Arguments found -> See python3 ./RegGTFExtractor.py -h for help.")
    elif args["organism"] not in ["hg19", "hg38", "mm9", "mm10"]:
        print("Invalid Organism: " + args["organism"] + " see -h for help")
    else:
        print("Working Dir: " + args["wd"])
        main_script(args["organism"], args["wd"], args["dir"], args["out"], args["tissue"])