Skip to content

Commit

Permalink
Commented every function / method to resolve #29
Browse files Browse the repository at this point in the history
  • Loading branch information
basti committed Jan 3, 2019
1 parent 6016c18 commit 91c4ed5
Show file tree
Hide file tree
Showing 11 changed files with 309 additions and 51 deletions.
32 changes: 27 additions & 5 deletions bin/3.1_create_gtf/Modules/CrossMapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,27 +10,49 @@ class CrossMapper:
Class to download chain_files for crossmapping hg38 or mm10 to older assembly versions.
Utilizes CrossMap.py. see wiki for more information.
@author: Sebastian Beyvers
@contact: sebastian.beyvers@med.uni-giessen.de
"""

def __init__(self, org, wd, out, is_dir):
    """
    Resolve the input/output/chain file paths and run CrossMap on the gff/gtf file.

    input_parameter: org = input organism
                     wd = working directory
                     out = path to output-file
                     is_dir = boolean if wd is data_dir or just working directory
    """
    self.org = org

    # The temp file sits directly below wd for a data_dir, otherwise below wd/data
    temp_subdir = "/temp/" if is_dir else "/data/temp/"
    self.infile = os.path.join(wd + temp_subdir + org + ".gtf")
    self.outfile = os.path.join(out)
    self.chainfile = self.get_chain_file(org, wd, is_dir)

    # Read the chain file; only the mapping tree is needed for the gff/gtf conversion
    map_tree, _target_chrom_sizes, _source_chrom_sizes = CrossMap.read_chain_file(self.chainfile)

    # Map the entries of self.infile and save the output to self.outfile
    CrossMap.crossmap_gff_file(map_tree, self.infile, self.outfile)

def get_chain_file(self, org, wd, isdir):
def get_chain_file(self, org, wd, is_data_dir):

# Defines the Chain files for different conversions.
# input_parameter: org = organism
# wd = working directory
# is_data_dir = is wd data_dir or not

# return_value: Link to Chainfile for conversion.
# Custom chainfiles and chainfiles for more organism can be specified in this section

if org == "hg19":
if isdir:
if is_data_dir:
file_link = os.path.join(wd+"temp/hg38tohg19.over.chain.gz")
else:
file_link = os.path.join(wd + "/data/temp/hg38tohg19.over.chain.gz" )
Expand All @@ -40,7 +62,7 @@ def get_chain_file(self, org, wd, isdir):
return file_link

elif org == "mm9":
if isdir:
if is_data_dir:
file_link = os.path.join(wd+"temp/mm10ToMm9.over.chain.gz")
else:
file_link = os.path.join(wd + "/data/temp/hg38tohg19.over.chain.gz" )
Expand Down
43 changes: 41 additions & 2 deletions bin/3.1_create_gtf/Modules/Ensembl/ActivityCategorizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,22 @@

class ActivityCategorizer:

"""
Class that categorizes activitydata based on json config and binary activitydata (table.bin).
@author: Sebastian Beyvers
@contact: sebastian.beyvers@med.uni-giessen.de
"""

def __init__(self, release, organism, wd, data_dir):

# Constructor for ActivityCategorizer
# input_parameter: organism = input organism
# release = current used Ensembl release
# wd = working dir (default working directory, data_dir is used if specified)
# data_dir = data directory (this is used as directory if specified)

# List of all Folders with Activity Tables

self.folderlist = []
Expand All @@ -27,10 +41,19 @@ def __init__(self, release, organism, wd, data_dir):
print("Categorization finished !")

def get_categorization(self):
    """Getter method: return the value stored in self.categorization."""
    return self.categorization

def read_config(self, organism, wd):

# Method to read the celltypes_organism.json config file
# input_parameter: organism = input organism
# wd = working directory to find the config files.
# return_value: Dictionary with ensembl aliases based on config
# -> Key = type (from config), value = list of ensembl aliases

c_dict = {}
path_to_config = os.path.join(wd +"/config/celltypes_"+organism+".json")
with open(path_to_config) as input_file:
Expand All @@ -43,6 +66,13 @@ def read_config(self, organism, wd):

def get_activity_data(self, release, organism, wd, data_dir):

# Method to read the binary table.bin file and return its content as bytearray
# input_parameter: organism = input organism
# release = current used Ensembl release
# wd = working dir (default working directory, data_dir is used if specified)
# data_dir = data directory (this is used as directory if specified)
# return_value: bytearray with activitystatus

for folder in self.folderlist:
# Generate path to binary File
if data_dir:
Expand All @@ -53,6 +83,9 @@ def get_activity_data(self, release, organism, wd, data_dir):
self.activity[folder] = bytearray(tables.read())

def generate_categorized_activity(self):

# Categorizes the Activity by config defined categories.

category_activity = {}

for category, aliases in self.c_dict.items():
Expand Down Expand Up @@ -80,10 +113,16 @@ def generate_categorized_activity(self):

def activity_comparator(self, aliaslist):

# Method to determine the resulting activitystatus if the entry contains
# multiple differing activitystatus from aliases
# e.g. if one alias is ACTIVE and one INACTIVE the result will be ACTIVE -> see wiki for more detailed info
# input_parameter: aliaslist = list of aliases for activity_data
# return_value: Array of Activitystatus by category (type in config)

concatenated_array = bytearray([])

length = len(self.activity[aliaslist[0]])
input_arrays = [self.activity[x] for x in aliaslist]
input_arrays = [self.activity[index] for index in aliaslist]
for x in range(length):
if any(y[x] == 0 for y in input_arrays):
concatenated_array.append(0)
Expand All @@ -103,4 +142,4 @@ def activity_comparator(self, aliaslist):
# e = ActivityCategorizer("../../config/celltypes_human.json", "release-94", "homo_sapiens")
# print(len(e.categorization))
# for x in e.categorization.values():
# print(len(x))
# print(len(x))
36 changes: 32 additions & 4 deletions bin/3.1_create_gtf/Modules/Ensembl/ActivityTable.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,37 +8,64 @@ class ActivityTable:
Class for checking activity_table and generating them.
activityTable = byte Representation of activity status
corresponding to the generator schema default:
0, "activity=ACTIVE",
1, "activity=POISED",
2, "activity=REPRESSED",
3, "activity=INACTIVE",
4, "activity=NA"
@author: Sebastian Beyvers
@contact: sebastian.beyvers@med.uni-giessen.de
"""

def __init__(self, organism, current_release, wd, data_dir):
    """
    Locate the activity folders of a release and set up the table generator.

    input_parameter: organism = input organism
                     current_release = current used Ensembl release
                     wd = working dir (default working directory, used if data_dir is not specified)
                     data_dir = data directory (this is used as directory if specified)
    """
    # data_dir takes precedence over the default location below wd
    ensembl_root = (data_dir + "/EnsemblData/") if data_dir else (wd + "/data/EnsemblData/")
    self.link = os.path.join(ensembl_root, current_release, organism, "activity")

    # Only the immediate subfolders of the activity directory are kept
    _dirpath, subfolders, _filenames = next(os.walk(self.link))
    self.folders = subfolders

    # List to represent Index with activitystatus for ATGenerator class
    activity_states = [
        "activity=ACTIVE",
        "activity=POISED",
        "activity=REPRESSED",
        "activity=INACTIVE",
        "activity=NA",
    ]
    self.generator = ATGenerator(activity_states)

def check_and_generate_activity_table(self):
    """
    Check every folder in self.folders for a table.bin file and
    call self.generate_table for each folder where it is missing.
    """
    for subfolder in self.folders:
        folder_link = os.path.join(self.link, subfolder)

        # Nothing to do when table.bin already exists for this folder
        if os.path.isfile(os.path.join(folder_link, "table.bin")):
            continue

        print("No ActivityTable for "+subfolder+" found, generating new one.")
        self.generate_table(folder_link)

    print("All ActivityTables found, proceeding")

def generate_table(self, link):

# generates the table and saves it as table.bin file
# input_parameter: link = link to ensembl activity folder for specific celltype
# generates table.bin file in link folder

for root, dirs, files in os.walk(link):
for file in files:
if file.endswith(".gff.gz"):
Expand All @@ -47,7 +74,8 @@ def generate_table(self, link):
with open(file_path, "wb") as f:
f.write(self.generator.read_table(originpath))
print("New ActivityTable generated in: " + root)
# Debug

#e = ActivityTable("homo_sapiens", "release-94")
#e.check_and_generate_activity_table()

# Debug
# e = ActivityTable("homo_sapiens", "release-94")
# e.check_and_generate_activity_table()
14 changes: 13 additions & 1 deletion bin/3.1_create_gtf/Modules/Ensembl/ActivityTableGenerator.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,14 +4,26 @@
class ATGenerator:

"""
Reads saved activity binary files (table.bin)
Reads gzip files for activitydata and generates a bytearray with activitystatus.
@author: Sebastian Beyvers
@contact: sebastian.beyvers@med.uni-giessen.de
"""

def __init__(self, repre):

# Constructor with parameter representation: List of keywords with a corresponding index
# Only the Index as byte will be appended to a bytearray.

self.representation = repre

def read_table(self, file):

# Reads the activity-ensembl file ("/data/EnsemblData/release/organism/activity/*")
# and returns the activity as bytearray, based on the representation in self.representation.
# Only the index is saved -> see ActivityTable.py for current representation.

activity_table = []
with gzip.open(file, 'rb') as f:
for line in f:
Expand Down
18 changes: 18 additions & 0 deletions bin/3.1_create_gtf/Modules/Ensembl/Ensembl.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,23 +7,41 @@
class Ensembl:

    """
    Main class for handling Ensembl Regulatory data
    Checks for local files and downloads if files are missing
    @author: Sebastian Beyvers
    @contact: sebastian.beyvers@med.uni-giessen.de
    """

    def __init__(self, organism, wd, data_dir):

        # Constructor and main method for Ensembl-GTF-Creation
        # All steps below run in order as soon as the object is created.
        # input_parameter: organism = input organism
        #                  wd = working directory
        #                  data_dir = use data_dir parameter if specified.

        print("Starting Ensembl")
        # Check and Update for Local Ensembl Release Data
        self.updater = FTPRetriever(organism, wd, data_dir)
        self.release = self.updater.get_release()
        # Check for Activitytables (table.bin binary files) and generate if not existing
        self.acttable = ActivityTable(organism, self.release, wd, data_dir)
        self.acttable.check_and_generate_activity_table()
        # Categorize the Activitytable by config defined categories (config: ./config/celltypes_organism.json)
        self.categorizer = ActivityCategorizer(self.release, organism, wd, data_dir)
        print("Generating GTF")
        # Instantiate the GTF generator; the entries themselves are requested in get_gtf()
        self.gtf_generator = GTFGen(organism, self.release, wd, data_dir)

        print("Ensembl Finished !")

    def get_gtf(self):

        # Getter Method for resulting GTF-Entries as List.
        # Passes the release and the categorization from self.categorizer to the GTF generator.
        # return_value: list of gtf entries.

        return self.gtf_generator.get_gtf(self.release, self.categorizer.get_categorization())

#e = Ensembl("homo_sapiens")
Expand Down
28 changes: 27 additions & 1 deletion bin/3.1_create_gtf/Modules/Ensembl/FTPHandling/FTPEntry.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,25 +4,51 @@
class FTPEntry:

    """
    Class to determine if a ftp-path is file or directory.
    Assigns every filename a parameter ('d' = directory or 'f' = file)
    @author: Sebastian Beyvers
    @contact: sebastian.beyvers@med.uni-giessen.de
    """

    def __init__(self, filename, ftpobj, startingdir=None):

        # Constructor
        # input_parameter: filename = name of the file / directory
        #                  ftpobj = FTP connection object; pwd() and cwd() are called on it
        #                  startingdir = optional directory to change back to after probing
        #                                (defaults to ftpobj.pwd(), passing it avoids that call)

        self.filename = filename
        if startingdir is None:
            startingdir = ftpobj.pwd()

        # Probe: changing into "filename" only succeeds if it is a directory.
        # Only the probing cwd() is guarded, so an error while changing back
        # to startingdir is not mistaken for "filename is a file".

        try:
            ftpobj.cwd(filename)
        except ftplib.error_perm:
            self.filetype = 'f'
        else:
            self.filetype = 'd'
            ftpobj.cwd(startingdir)

    def gettype(self):

        # Getter method for filetype ('d' or 'f')

        return self.filetype

    def getfilename(self):

        # Getter method for filename

        return self.filename

    def __repr__(self):

        # String representation of the FTPEntry object.
        # __repr__ has to return a str; returning a tuple makes repr() raise TypeError.

        return "FTPEntry(filename={!r}, filetype={!r})".format(self.filename, self.filetype)
Loading

0 comments on commit 91c4ed5

Please sign in to comment.