From c0c36bc66ab0c924c09c40c559ac090464bf99d0 Mon Sep 17 00:00:00 2001 From: basti Date: Thu, 3 Jan 2019 16:41:26 +0100 Subject: [PATCH] Added more comments and author / mail description --- bin/3.1_create_gtf/Modules/CrossMapper.py | 1 + bin/3.1_create_gtf/Modules/SaveResults.py | 19 +++++++++++++++-- bin/3.1_create_gtf/Modules/Uniquifier.py | 25 +++++++++++++++++++---- bin/3.1_create_gtf/RegGTFExtractor.py | 15 +++++++++++--- 4 files changed, 51 insertions(+), 9 deletions(-) diff --git a/bin/3.1_create_gtf/Modules/CrossMapper.py b/bin/3.1_create_gtf/Modules/CrossMapper.py index d401c79..7e8c259 100644 --- a/bin/3.1_create_gtf/Modules/CrossMapper.py +++ b/bin/3.1_create_gtf/Modules/CrossMapper.py @@ -20,6 +20,7 @@ def __init__(self, org, wd, out, is_dir): self.infile = os.path.join(wd+"/data/temp/"+org+".gtf") self.outfile = os.path.join(out+"/" + org + "_mapped.gtf") self.chainfile = self.get_chain_file(org, wd, is_dir) + # Execute Crossmapper for gff/gtf files (mapTree, targetChromSizes, sourceChromSizes) = CrossMap.read_chain_file(self.chainfile) CrossMap.crossmap_gff_file(mapTree, self.infile, self.outfile) diff --git a/bin/3.1_create_gtf/Modules/SaveResults.py b/bin/3.1_create_gtf/Modules/SaveResults.py index 66c2d97..ec5ed61 100644 --- a/bin/3.1_create_gtf/Modules/SaveResults.py +++ b/bin/3.1_create_gtf/Modules/SaveResults.py @@ -5,20 +5,35 @@ class ResultSaver: """ - class to save the results. Path is dependent on the data_dir, tissuetype and mapped = True or False. + Class to save the results. Path is dependent on the data_dir, tissuetype and mapped = True or False. The output is saved to the temp directory in the data folder if crossmapping is necessary. 
+ @author: Sebastian Beyvers + @contact: sebastian.beyvers@med.uni-giessen.de + + """ def __init__(self, results, organism, wd, mapped, is_data_dir, tissue, out): + # Constructor and main method for result-saving + # input_parameter: results = finished list of gtf-entries + # organism = input_organism + # wd = working directory + # mapped = boolean if crossmapping is necessary + # is_data_dir = boolean if wd is a data_dir (true) or not (false) + # tissue = tissuetype filter; out = output directory + + + # TODO: DATADIR !! + print("Save results to File !") self.path = "" if mapped: if is_data_dir: self.path = os.path.join(wd + "/temp/" + organism + ".gtf") else: - self.path = os.path.join( wd + "/data/temp/" + organism + ".gtf" ) + self.path = os.path.join(wd + "/data/temp/" + organism + ".gtf" ) elif tissue: self.path = os.path.join(out+"/"+organism+"_filtered.gtf") else: diff --git a/bin/3.1_create_gtf/Modules/Uniquifier.py b/bin/3.1_create_gtf/Modules/Uniquifier.py index 16d5538..89890c8 100644 --- a/bin/3.1_create_gtf/Modules/Uniquifier.py +++ b/bin/3.1_create_gtf/Modules/Uniquifier.py @@ -1,23 +1,38 @@ class UniqueFilter: - """ Class to get unique GTF-results, filtered by specified cell-/tissuetypes + @author: Sebastian Beyvers + @contact: sebastian.beyvers@med.uni-giessen.de + """ def __init__(self, ense, ucsc, org_filter=None): + + # Constructor + # input_parameter: ense = list of gtf-formatted entries from Ensembl data + # ucsc = list of gtf-formatted entries from UCSC data + # org_filter = filter for specific celltype + self.results = self.get_filtered_results(org_filter, ense, ucsc) def get_results(self): + + # Getter method for results variable + return self.results def get_filtered_results(self, org_filter, ense, ucsc): - # Apply Filter + # Method to concat the UCSC and Ensembl datasets without duplicates and filter by activity list + # input_parameter: ense = list of gtf-formatted entries from Ensembl data + # ucsc = list of gtf-formatted 
entries from UCSC data + # org_filter = filter for specific celltype + # return_value: List of unique (filtered) results. - unfiltered_results = self.concat_without_duplicates(ense, ucsc) - if org_filter: + unfiltered_results = self.concat_without_duplicates(ense, ucsc) # First: Concat ucsc and ensembl data + if org_filter: # Second: apply filter if specified filterstrings = [x+">ACTIVE" for x in org_filter] return_list = [] for element in unfiltered_results: @@ -32,6 +47,8 @@ def get_filtered_results(self, org_filter, ense, ucsc): def concat_without_duplicates(ense, ucsc): # Concat UCSC and Ensembl data without duplicates + # input_parameter: ense = ensembl-gtf-data and ucsc = ucsc-gtf-data + # return_value: concatenated list of gtf-entries without duplicates results = ense+ucsc for ens in ense: diff --git a/bin/3.1_create_gtf/RegGTFExtractor.py b/bin/3.1_create_gtf/RegGTFExtractor.py index 355d4da..1c6a05f 100644 --- a/bin/3.1_create_gtf/RegGTFExtractor.py +++ b/bin/3.1_create_gtf/RegGTFExtractor.py @@ -1,7 +1,7 @@ """ RegGTFExtractor.py extracts regulatory-data from Ensembl and UCSC databases -and converts output to GTF-formatted file. +and converts the output to a GTF-formatted file. 
@author: Sebastian Beyvers @contact: sebastian.beyvers@med.uni-giessen.de @@ -21,6 +21,8 @@ def check_for_local_folder(wd): # Check if local folder exists and create if missing when no data_dir is specified + # input_parameter: wd = working directory + # return_value: None if not os.path.isdir(os.path.join(wd+"/data/")): @@ -40,6 +42,8 @@ def check_for_local_folder(wd): def check_for_data_dir(data_dir): # Check if local folder exists and create if missing when data_dir as parameter is specified + # input_parameter: data_dir = data directory + # return_value: None if not os.path.isdir(os.path.join(data_dir)): os.mkdir(os.path.join(data_dir)) @@ -57,6 +61,8 @@ def check_for_data_dir(data_dir): def check_filter(tissue_cmd, org, wd): # Checks if filter-celltype is in Json types for organism + # input_parameter: tissue_cmd: Filtered tissuetypes; org = organism; wd = working directory + # return_value: boolean if selected filter is in config path_to_config = os.path.join(wd + "/config/celltypes_" + org + ".json") tissues_config = [] @@ -77,6 +83,8 @@ def check_filter(tissue_cmd, org, wd): def check_organism(org): # Checks the organism input and decides if chrossmapping is necessary + # input_parameter: org = input organism (parameter) + # return_value: tuple with values = (organism_alias (string), boolean if crossmapping is needed) if org == "hg38": return "homo_sapiens", False @@ -92,7 +100,8 @@ def check_organism(org): def main_script(organism, wd, data_dir, out, tissuetype=None): - # Main function + # main function + # input_parameter: all parameters from argparse (org, x_mappable) = check_organism(organism) if not data_dir: @@ -124,7 +133,7 @@ def main_script(organism, wd, data_dir, out, tissuetype=None): if __name__ == '__main__': - # Argumentparser + # argument parser parser = argparse.ArgumentParser(description='GTF-Generator from UCSC Table Browser and Ensembl Regulatory Build' ) parser.add_argument('organism', help='Source organism [ hg19 | hg38 or mm9 | mm10 
]', action='store', nargs='?', type=str)