Skip to content

Commit

Permalink
Added more comments and author / mail description
Browse files Browse the repository at this point in the history
  • Loading branch information
basti committed Jan 3, 2019
1 parent 8993670 commit c0c36bc
Show file tree
Hide file tree
Showing 4 changed files with 51 additions and 9 deletions.
1 change: 1 addition & 0 deletions bin/3.1_create_gtf/Modules/CrossMapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ def __init__(self, org, wd, out, is_dir):
self.infile = os.path.join(wd+"/data/temp/"+org+".gtf")
self.outfile = os.path.join(out+"/" + org + "_mapped.gtf")
self.chainfile = self.get_chain_file(org, wd, is_dir)

# Execute Crossmapper for gff/gtf files
(mapTree, targetChromSizes, sourceChromSizes) = CrossMap.read_chain_file(self.chainfile)
CrossMap.crossmap_gff_file(mapTree, self.infile, self.outfile)
Expand Down
19 changes: 17 additions & 2 deletions bin/3.1_create_gtf/Modules/SaveResults.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,20 +5,35 @@ class ResultSaver:

"""
class to save the results. Path is dependent on the data_dir, tissuetype and mapped = True or False.
Class to save the results. Path is dependent on the data_dir, tissuetype and mapped = True or False.
The output is saved to the temp directory in the data folder if crossmapping is necessary.
@author: Sebastian Beyvers
@contact: sebastian.beyvers@med.uni-giessen.de
"""

def __init__(self, results, organism, wd, mapped, is_data_dir, tissue, out):

# Constructor and main method for result-saving
# input_parameter: results = finished list of gtf-entries
# organism = input_organism
# wd = working directory
# mapped = boolean if crossmapping is necessary
# is_data_dir = boolean if wd is a data_dir (true) or not (false)
# is_data_dir = boolean if wd is a data_dir (true) or not (false)


# TODO: DATADIR !!

print("Save results to File !")
self.path = ""
if mapped:
if is_data_dir:
self.path = os.path.join(wd + "/temp/" + organism + ".gtf")
else:
self.path = os.path.join( wd + "/data/temp/" + organism + ".gtf" )
self.path = os.path.join(wd + "/data/temp/" + organism + ".gtf" )
elif tissue:
self.path = os.path.join(out+"/"+organism+"_filtered.gtf")
else:
Expand Down
25 changes: 21 additions & 4 deletions bin/3.1_create_gtf/Modules/Uniquifier.py
Original file line number Diff line number Diff line change
@@ -1,23 +1,38 @@
class UniqueFilter:

"""
Class to get unique GTF-results, filtered by specified cell-/tissuetypes
@author: Sebastian Beyvers
@contact: sebastian.beyvers@med.uni-giessen.de
"""

def __init__(self, ense, ucsc, org_filter=None):

# Constructor: builds the result list once and stores it in self.results.
# input_parameter: ense = list of gtf-formatted entries from Ensembl data
# ucsc = list of gtf-formatted entries from UCSC data
# org_filter = optional filter for specific cell types (default: None);
# passed through unchanged to get_filtered_results
# return_value: None (the result is read back via get_results)

# Note the argument order: get_filtered_results takes the filter first.
self.results = self.get_filtered_results(org_filter, ense, ucsc)

def get_results(self):

# Getter method for the results variable
# return_value: self.results, i.e. the value assigned in the constructor
# from get_filtered_results (returned as-is, not copied)

return self.results

def get_filtered_results(self, org_filter, ense, ucsc):

# Apply Filter
# Method to concatenate the UCSC and Ensembl datasets without duplicates and filter by activity list
# input_parameter: ense = list of gtf-formatted entries from Ensembl data
# ucsc = list of gtf-formatted entries from UCSC data
# org_filter = filter for specific celltype
# return_value: List of unique (filtered) results.

unfiltered_results = self.concat_without_duplicates(ense, ucsc)
if org_filter:
unfiltered_results = self.concat_without_duplicates(ense, ucsc) # First: Concat ucsc and ensembl data
if org_filter: # Second: apply filter if specified
filterstrings = [x+">ACTIVE" for x in org_filter]
return_list = []
for element in unfiltered_results:
Expand All @@ -32,6 +47,8 @@ def get_filtered_results(self, org_filter, ense, ucsc):
def concat_without_duplicates(ense, ucsc):

# Concat UCSC and Ensembl data without duplicates
# input_parameter: ense = ensembl-gtf-data and ucsc = ucsc-gtf-data
# return_value: concatenated list of gtf-entries without duplicates

results = ense+ucsc
for ens in ense:
Expand Down
15 changes: 12 additions & 3 deletions bin/3.1_create_gtf/RegGTFExtractor.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
"""
RegGTFExtractor.py extracts regulatory-data from Ensembl and UCSC databases
and converts output to GTF-formatted file.
and converts the output to a GTF-formatted file.
@author: Sebastian Beyvers
@contact: sebastian.beyvers@med.uni-giessen.de
Expand All @@ -21,6 +21,8 @@
def check_for_local_folder(wd):

# Check if local folder exists and create if missing when no data_dir is specified
# input_parameter: wd = working directory
# return_value: None

if not os.path.isdir(os.path.join(wd+"/data/")):

Expand All @@ -40,6 +42,8 @@ def check_for_local_folder(wd):
def check_for_data_dir(data_dir):

# Check if local folder exists and create if missing when data_dir as parameter is specified
# input_parameter: data_dir = data directory
# return_value: None

if not os.path.isdir(os.path.join(data_dir)):
os.mkdir(os.path.join(data_dir))
Expand All @@ -57,6 +61,8 @@ def check_for_data_dir(data_dir):
def check_filter(tissue_cmd, org, wd):

# Checks if the filter cell type is in the JSON types for the organism
# input_parameter: tissue_cmd = filtered tissue types; org = organism; wd = working directory
# return_value: boolean if selected filter is in config

path_to_config = os.path.join(wd + "/config/celltypes_" + org + ".json")
tissues_config = []
Expand All @@ -77,6 +83,8 @@ def check_filter(tissue_cmd, org, wd):
def check_organism(org):

# Checks the organism input and decides if crossmapping is necessary
# input_parameter: org = input organism (parameter)
# return_value: tuple with values = (organism_alias (string), boolean if crossmapping is needed)

if org == "hg38":
return "homo_sapiens", False
Expand All @@ -92,7 +100,8 @@ def check_organism(org):

def main_script(organism, wd, data_dir, out, tissuetype=None):

# Main function
# main function
# input_parameter: all parameters from argparse

(org, x_mappable) = check_organism(organism)
if not data_dir:
Expand Down Expand Up @@ -124,7 +133,7 @@ def main_script(organism, wd, data_dir, out, tissuetype=None):

if __name__ == '__main__':

# Argumentparser
# argument parser

parser = argparse.ArgumentParser(description='GTF-Generator from UCSC Table Browser and Ensembl Regulatory Build' )
parser.add_argument('organism', help='Source organism [ hg19 | hg38 or mm9 | mm10 ]', action='store', nargs='?', type=str)
Expand Down

0 comments on commit c0c36bc

Please sign in to comment.