Skip to content

Gtf creation #40

Merged
merged 21 commits into from
Jan 8, 2019
Merged
Show file tree
Hide file tree
Changes from 13 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
37 changes: 30 additions & 7 deletions bin/3.1_create_gtf/Modules/CrossMapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,26 +10,49 @@ class CrossMapper:
Class to download chain_files for crossmapping hg38 or mm10 to older assembly versions.
Utilizes CrossMap.py. see wiki for more information.
@author: Sebastian Beyvers
@contact: sebastian.beyvers@med.uni-giessen.de
"""

def __init__(self, org, wd, out, is_dir):
self.org = org

# Constructor for CrossMapper class
# input_parameter: org = input organism
# wd = working directory
# out = path to output-file -> Parameter
# is_dir = boolean if wd is data_dir or just working directory

# Get path to tempfile / outputfile and chain-file

if is_dir:
self.infile = os.path.join( wd + "/temp/" + org + ".gtf")
self.infile = os.path.join(wd + "/temp/" + org + ".gtf")
else:
self.infile = os.path.join(wd+"/data/temp/"+org+".gtf")
self.outfile = os.path.join(out+"/" + org + "_mapped.gtf")
self.outfile = os.path.join(out)
self.chainfile = self.get_chain_file(org, wd, is_dir)
# Execute Crossmapper for gff/gtf files

# Execute Crossmap for gff/gtf files

(mapTree, targetChromSizes, sourceChromSizes) = CrossMap.read_chain_file(self.chainfile)

# Map results and save output to self.outfile

CrossMap.crossmap_gff_file(mapTree, self.infile, self.outfile)

def get_chain_file(self, org, wd, isdir):
def get_chain_file(self, org, wd, is_data_dir):

# Defines the Chain files for different conversions.
# input_parameter: org = organism
# wd = working directory
# is_data_dir = is wd data_dir or not

# return_value: Link to chain-file for conversion.
# Custom chain-files and chain-files for more organisms can be specified in this section

if org == "hg19":
if isdir:
if is_data_dir:
file_link = os.path.join(wd+"temp/hg38tohg19.over.chain.gz")
else:
file_link = os.path.join(wd + "/data/temp/hg38tohg19.over.chain.gz" )
Expand All @@ -39,7 +62,7 @@ def get_chain_file(self, org, wd, isdir):
return file_link

elif org == "mm9":
if isdir:
if is_data_dir:
file_link = os.path.join(wd+"temp/mm10ToMm9.over.chain.gz")
else:
file_link = os.path.join(wd + "/data/temp/hg38tohg19.over.chain.gz" )
Expand Down
43 changes: 41 additions & 2 deletions bin/3.1_create_gtf/Modules/Ensembl/ActivityCategorizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,22 @@

class ActivityCategorizer:

"""

Class that categorizes activitydata based on json config and binary activitydata (table.bin).
SebastianBeyvers marked this conversation as resolved.
Show resolved Hide resolved
@author: Sebastian Beyvers
@contact: sebastian.beyvers@med.uni-giessen.de

"""

def __init__(self, release, organism, wd, data_dir):

# Constructor for ActivityCategorizer
# input_parameter: organism = input organism
# release = current used Ensembl release
# wd = working dir (default working directory, data_dir is used if specified)
# data_dir = data directory (this is used as directory if specified)

# List of all Folders with Activity Tables

self.folderlist = []
Expand All @@ -27,10 +41,19 @@ def __init__(self, release, organism, wd, data_dir):
print("Categorization finished !")

def get_categorization(self):
    """Return the object stored in self.categorization (no copy is made)."""
    return self.categorization

def read_config(self, organism, wd):

# Method to read the celltypes_organism.json config file
# input_parameter: organism = input organism
# wd = working directory to find the config files.
# return_value: Dictionary with ensembl aliases based on config
# -> Key = type (from config), value = list of ensembl aliases

c_dict = {}
path_to_config = os.path.join(wd +"/config/celltypes_"+organism+".json")
with open(path_to_config) as input_file:
Expand All @@ -43,6 +66,13 @@ def read_config(self, organism, wd):

def get_activity_data(self, release, organism, wd, data_dir):

# Method to read the binary table.bin file and return its content as bytearray
# input_parameter: organism = input organism
# release = current used Ensembl release
# wd = working dir (default working directory, data_dir is used if specified)
# data_dir = data directory (this is used as directory if specified)
# return_value: bytearray with activitystatus

for folder in self.folderlist:
# Generate path to binary File
if data_dir:
Expand All @@ -53,6 +83,9 @@ def get_activity_data(self, release, organism, wd, data_dir):
self.activity[folder] = bytearray(tables.read())

def generate_categorized_activity(self):

# Categorizes the activity by config-defined categories.
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

activity


category_activity = {}

for category, aliases in self.c_dict.items():
Expand Down Expand Up @@ -80,10 +113,16 @@ def generate_categorized_activity(self):

def activity_comparator(self, aliaslist):

# Method to determine the resulting activitystatus if the entry contains
# multiple differing activitystatus from aliases
# e.g. if one alias is ACTIVE and one INACTIVE the result will be ACTIVE -> see wiki for more detailed info
# input_parameter: aliaslist = list of aliases for activity_data
# return_value: Array of Activitystatus by category (type in config)

concatenated_array = bytearray([])

length = len(self.activity[aliaslist[0]])
input_arrays = [self.activity[x] for x in aliaslist]
input_arrays = [self.activity[index] for index in aliaslist]
for x in range(length):
if any(y[x] == 0 for y in input_arrays):
concatenated_array.append(0)
Expand All @@ -103,4 +142,4 @@ def activity_comparator(self, aliaslist):
# e = ActivityCategorizer("../../config/celltypes_human.json", "release-94", "homo_sapiens")
# print(len(e.categorization))
# for x in e.categorization.values():
# print(len(x))
SebastianBeyvers marked this conversation as resolved.
Show resolved Hide resolved
# print(len(x))
36 changes: 32 additions & 4 deletions bin/3.1_create_gtf/Modules/Ensembl/ActivityTable.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,37 +8,64 @@ class ActivityTable:
Class for checking activity_table and generating them.
activityTable = byte Representation of activity status
corresponding to the generator schema default:
0, "activity=ACTIVE",
1, "activity=POISED",
2, "activity=REPRESSED",
3, "activity=INACTIVE",
4, "activity=NA"
@author: Sebastian Beyvers
@contact: sebastian.beyvers@med.uni-giessen.de
"""

def __init__(self, organism, current_release, wd, data_dir):
    """
    Locate the activity folders of one Ensembl release and set up the generator.

    organism:        organism name, used as a path component
    current_release: Ensembl release name, used as a path component
    wd:              working directory; only used when data_dir is falsy
    data_dir:        data directory; takes precedence over wd when set
    """
    # data_dir is used as the data root directly, wd needs the extra "/data" level.
    if data_dir:
        ensembl_root = data_dir + "/EnsemblData/"
    else:
        ensembl_root = wd + "/data/EnsemblData/"
    self.link = os.path.join(ensembl_root, current_release, organism, "activity")

    # The first os.walk() result describes self.link itself; its second field
    # lists the names of the direct subdirectories.
    _, subdirs, _ = next(os.walk(self.link))
    self.folders = subdirs

    # The position of a keyword in this list is the index handed to ATGenerator:
    # 0 = ACTIVE, 1 = POISED, 2 = REPRESSED, 3 = INACTIVE, 4 = NA.
    status_keywords = [
        "activity=ACTIVE",
        "activity=POISED",
        "activity=REPRESSED",
        "activity=INACTIVE",
        "activity=NA",
    ]
    self.generator = ATGenerator(status_keywords)

def check_and_generate_activity_table(self):
    """
    Make sure every folder listed in self.folders contains a table.bin file.

    For each folder without a table.bin, a notice is printed and
    self.generate_table() is called with the path of that folder.
    A closing message is printed once all folders have been visited.
    """
    for celltype in self.folders:
        celltype_dir = os.path.join(self.link, celltype)

        # Nothing to do when the binary table is already present.
        if os.path.isfile(os.path.join(celltype_dir, "table.bin")):
            continue

        print("No ActivityTable for "+celltype+" found, generating new one.")
        self.generate_table(celltype_dir)

    print("All ActivityTables found, proceeding")

def generate_table(self, link):

# generates the table and saves it as table.bin file
# input_parameter: link = link to ensembl activity folder for specific celltype
# generates table.bin file in link folder

for root, dirs, files in os.walk(link):
for file in files:
if file.endswith(".gff.gz"):
Expand All @@ -47,7 +74,8 @@ def generate_table(self, link):
with open(file_path, "wb") as f:
f.write(self.generator.read_table(originpath))
print("New ActivityTable generated in: " + root)
# Debug

#e = ActivityTable("homo_sapiens", "release-94")
#e.check_and_generate_activity_table()

# Debug
# e = ActivityTable("homo_sapiens", "release-94")
# e.check_and_generate_activity_table()
SebastianBeyvers marked this conversation as resolved.
Show resolved Hide resolved
14 changes: 13 additions & 1 deletion bin/3.1_create_gtf/Modules/Ensembl/ActivityTableGenerator.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,14 +4,26 @@
class ATGenerator:

"""
Reads saved activity binary files (table.bin)

Reads gzip files for activitydata and generates a bytearray with activitystatus.
@author: Sebastian Beyvers
@contact: sebastian.beyvers@med.uni-giessen.de

"""

def __init__(self, repre):

# Constructor with parameter representation: List of keywords with a corresponding index
# Only the index (as a byte) will be appended to a bytearray.
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

index


self.representation = repre

def read_table(self, file):

# Reads the activity-ensembl file ("/data/EnsemblData/release/organism/activity/*")
# and returns the activity as bytearray, based on the representation in self.representation.
# Only the index is saved -> see ActivityTable.py for current representation.

activity_table = []
with gzip.open(file, 'rb') as f:
for line in f:
Expand Down
18 changes: 18 additions & 0 deletions bin/3.1_create_gtf/Modules/Ensembl/Ensembl.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,23 +7,41 @@
class Ensembl:

    """
    Main class for handling Ensembl Regulatory data.
    Checks for local files and downloads if files are missing.
    @author: Sebastian Beyvers
    @contact: sebastian.beyvers@med.uni-giessen.de
    """

    def __init__(self, organism, wd, data_dir):
        """
        Run the complete Ensembl preparation pipeline for one organism.

        organism: input organism
        wd:       working directory
        data_dir: data directory, handed on to every helper together with wd
        """
        print("Starting Ensembl")
        dirs = (wd, data_dir)

        # Check and update the local Ensembl release data, then ask for the release.
        self.updater = FTPRetriever(organism, *dirs)
        self.release = self.updater.get_release()
        release = self.release

        # Check for activity tables (table.bin binary files), generate missing ones.
        self.acttable = ActivityTable(organism, release, *dirs)
        self.acttable.check_and_generate_activity_table()

        # Categorize the activity tables by the categories defined in
        # ./config/celltypes_organism.json
        self.categorizer = ActivityCategorizer(release, organism, *dirs)

        print("Generating GTF")
        # Instantiate the GTF generator; the entries are requested via get_gtf().
        self.gtf_generator = GTFGen(organism, release, *dirs)

        print("Ensembl Finished !")

    def get_gtf(self):
        """
        Return what the GTF generator produces for the current release and the
        categorization held by the categorizer (a list of GTF entries according
        to the original author's notes).
        """
        build = self.gtf_generator.get_gtf
        return build(self.release, self.categorizer.get_categorization())

#e = Ensembl("homo_sapiens")
Expand Down
28 changes: 27 additions & 1 deletion bin/3.1_create_gtf/Modules/Ensembl/FTPHandling/FTPEntry.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,25 +4,51 @@
class FTPEntry:

    """
    Class to determine if a ftp-path is file or directory.
    Assigns every filename a parameter ('d' = directory or 'f' = file)
    @author: Sebastian Beyvers
    @contact: sebastian.beyvers@med.uni-giessen.de
    """

    def __init__(self, filename, ftpobj, startingdir=None):
        """
        Classify filename by trying to change into it on the FTP connection.

        filename:    name of the file or directory to classify
        ftpobj:      FTP connection object; its pwd() and cwd() methods are used
        startingdir: directory to change back to after a successful probe;
                     defaults to the directory reported by ftpobj.pwd()
        """
        self.filename = filename
        if startingdir is None:
            startingdir = ftpobj.pwd()

        # A successful cwd() means the name is a directory; afterwards the
        # connection is moved back to where it started.
        # NOTE(review): an error_perm raised by the second cwd() is also caught
        # here and would mark a directory as 'f' -- kept as in the original.
        try:
            ftpobj.cwd(filename)
            self.filetype = 'd'
            ftpobj.cwd(startingdir)
        except ftplib.error_perm:
            # cwd() was refused, so the name is treated as a file.
            self.filetype = 'f'

    def gettype(self):
        """Return the filetype: 'd' for directory or 'f' for file."""
        return self.filetype

    def getfilename(self):
        """Return the filename this entry was created with."""
        return self.filename

    def __repr__(self):
        """
        Return a string showing filename and filetype.

        __repr__ has to return a str; returning the bare tuple
        (filename, filetype) makes repr() and print() raise a TypeError.
        """
        return "FTPEntry(filename={!r}, filetype={!r})".format(self.filename, self.filetype)