Skip to content

Gtf creation #40

Merged
merged 21 commits into from
Jan 8, 2019
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
6 changes: 3 additions & 3 deletions bin/3.1_create_gtf/Modules/CrossMapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ def __init__(self, org, wd, out, is_dir):
# out = path to output-file -> Parameter
# is_dir = boolean if wd is data_dir or just working directory

# Get path to tempfile / outputfile and chainfile
# Get path to tempfile / outputfile and chain-file

if is_dir:
self.infile = os.path.join(wd + "/temp/" + org + ".gtf")
Expand All @@ -48,8 +48,8 @@ def get_chain_file(self, org, wd, is_data_dir):
# wd = working directory
# is_data_dir = is wd data_dir or not

# return_value: Link to Chainfile for conversion.
# Custom chainfiles and chainfiles for more organism can be specified in this section
# return_value: Link to chain-file for conversion.
# Custom chain-files and chain-files for more organisms can be specified in this section

if org == "hg19":
if is_data_dir:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -138,11 +138,11 @@ def download_currentversion_version(self, version, organism, wd, data_dir):
# Iterate over Celltype List and Download in corresponding subfolder

for celltype in celltypes_list:
link_local = os.path.join(activityfolder_local, celltype)
link_local = os.path.join(activityfolder_local, celltype)
SebastianBeyvers marked this conversation as resolved.
Show resolved Hide resolved
link_origin = activityfolder_remote+"/"+celltype
os.mkdir(link_local)
self.site_ftp.save_entries_to_file(link_origin, link_local)


# Debug section
# e = EnsemblRegulationFTPRetriever("mus_musculus")
# e = EnsemblRegulationFTPRetriever("mus_musculus")
2 changes: 1 addition & 1 deletion bin/3.1_create_gtf/Modules/Ensembl/GTFGen.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ def __init__(self, organism, release, wd, data_dir):
# Constructor for GTFGen
# input_parameter: organism = input organism
# release = used Ensembl release
# wd = working directory (default is ".")
# wd = working directory (default is "."); this is used only if data_dir is not specified.
# data_dir = data directory (if specified this is used)
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Maybe you should write what is used if it is not specified

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I meant data_dir


self.gff_lines = self.get_organism_as_gff(organism, release, wd, data_dir)
Expand Down
4 changes: 1 addition & 3 deletions bin/3.1_create_gtf/Modules/SaveResults.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,7 @@ class ResultSaver:

"""

Class to save the results. Path is dependent on the data_dir, tissuetype and mapped = True or False.
The output is saved to the temp directory in the data folder if crossmapping is necessary.
Class to save the results. The output is saved to the temp directory in the data folder if crossmapping is necessary.

@author: Sebastian Beyvers
@contact: sebastian.beyvers@med.uni-giessen.de
Expand All @@ -22,7 +21,6 @@ def __init__(self, results, organism, wd, mapped, is_data_dir, out):
# wd = working directory
# mapped = boolean if crossmapping is necessary
# is_data_dir = boolean if wd is a data_dir (true) or not (false)
# is_data_dir = boolean if wd is a data_dir (true) or not (false)

print("Save results to File !")
self.path = ""
Expand Down
6 changes: 3 additions & 3 deletions bin/3.1_create_gtf/Modules/Validator.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,14 +10,14 @@ class Validator:
def __init__(self, out_file):

# Constructor
# input_parameter: out_file = Path to Outputfile
# input_parameter: out_file = path to output file

self.out_file = out_file
self.test_read_file()

def test_read_file(self):

# Method to testread the file
# Method to test the output file-format

with open(self.out_file) as outfile:
line = outfile.readline()
Expand All @@ -32,4 +32,4 @@ def test_read_file(self):
exit(1)

# Debug
# v = Validator("/home/basti/Schreibtisch/test_hg38.gtf")
# v = Validator("/home/basti/Schreibtisch/test_hg38.gtf")
14 changes: 7 additions & 7 deletions bin/3.1_create_gtf/Modules/ucsc/ucsc.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,14 +44,14 @@ def __init__(self, org, wd, data_dir):

def generate_gff_file(self):

# Call bigBedToBed binary to get a Bed file in the UCSCData folder
# Call bigBedToBed binary to get a BED-file in the UCSCData folder

callstring = [self.path_to_bin, self.link, self.output]
subprocess.call(callstring)

def read_gff_to_gtf(self):

# Reads Bed File and return a gtf-formatted list of elements.
# Reads the BED-file and returns a GTF-formatted list of elements.
# return_value: GTF-formatted List of regulation entries from UCSC

gtf_lines = []
Expand All @@ -75,8 +75,8 @@ def read_gff_to_gtf(self):
def find_ID(self, line):

# Find RefSeq ID in Line
# input_parameter: line = current line from bedfile
# return_value: string with gene_id in gtf-format
# input_parameter: line = current line from BED-file
# return_value: string with gene_id in GTF-format

pattern = re.compile(r'ID:[0-9]{,9}|$')
ref_id = re.search(pattern, line).group()
Expand All @@ -90,8 +90,8 @@ def find_ID(self, line):

def get_activity(self, line):

# Find activity categories in bed file
# input_parameter: line = current line from bedfile
# Find activity categories in BED-file
# input_parameter: line = current line from BED-file
# return_value: list with activity for specified line("keystatus")

key_status = []
Expand Down Expand Up @@ -122,7 +122,7 @@ def get_activity_categories(organism, wd):

# Method to get ucsc-celltype categories from JSON config
# input_parameter: organism = organism parameter
# wd = working directory, to find config fil
# wd = working directory, to find config file
# return_value: List of categories from config.

path_to_config = os.path.join(wd+"/config/celltypes_" + organism + ".json")
Expand Down