From 91c4ed531980b7e1b3e6ac78cf1664be6e7897c6 Mon Sep 17 00:00:00 2001 From: basti Date: Thu, 3 Jan 2019 19:03:28 +0100 Subject: [PATCH] Commented every function / method to resolve #29 --- bin/3.1_create_gtf/Modules/CrossMapper.py | 32 +++++++++++--- .../Modules/Ensembl/ActivityCategorizer.py | 43 +++++++++++++++++- .../Modules/Ensembl/ActivityTable.py | 36 +++++++++++++-- .../Modules/Ensembl/ActivityTableGenerator.py | 14 +++++- bin/3.1_create_gtf/Modules/Ensembl/Ensembl.py | 18 ++++++++ .../Modules/Ensembl/FTPHandling/FTPEntry.py | 28 +++++++++++- .../Ensembl/FTPHandling/URLRetrieve.py | 36 +++++++++++++-- .../Ensembl/FTPHandling/VersionChecker.py | 43 ++++++++++++++++-- bin/3.1_create_gtf/Modules/Ensembl/GTFGen.py | 44 ++++++++++++++++--- .../Modules/Ensembl/checksums.py | 23 ---------- bin/3.1_create_gtf/Modules/ucsc/ucsc.py | 43 +++++++++++++++++- 11 files changed, 309 insertions(+), 51 deletions(-) delete mode 100644 bin/3.1_create_gtf/Modules/Ensembl/checksums.py diff --git a/bin/3.1_create_gtf/Modules/CrossMapper.py b/bin/3.1_create_gtf/Modules/CrossMapper.py index 293783a..f4ae51c 100644 --- a/bin/3.1_create_gtf/Modules/CrossMapper.py +++ b/bin/3.1_create_gtf/Modules/CrossMapper.py @@ -10,10 +10,22 @@ class CrossMapper: Class to download chain_files for chrossmapping hg38 or mm10 to older assembly versions. Utilizes CrossMap.py. see wiki for more information. 
+ @author: Sebastian Beyvers + @contact: sebastian.beyvers@med.uni-giessen.de + + """ def __init__(self, org, wd, out, is_dir): - self.org = org + + # Constructor for CrossMapper class + # input_parameter: org = input organism + # wd = working directory + # out = path to output-file -> Parameter + # is_dir = boolean if wd is data_dir or just working directory + + # Get path to tempfile / outputfile and chainfile + if is_dir: self.infile = os.path.join(wd + "/temp/" + org + ".gtf") else: @@ -21,16 +33,26 @@ def __init__(self, org, wd, out, is_dir): self.outfile = os.path.join(out) self.chainfile = self.get_chain_file(org, wd, is_dir) - # Execute Crossmapper for gff/gtf files + # Execute Crossmap for gff/gtf files + (mapTree, targetChromSizes, sourceChromSizes) = CrossMap.read_chain_file(self.chainfile) + + # Map results and save output to self.outfile + CrossMap.crossmap_gff_file(mapTree, self.infile, self.outfile) - def get_chain_file(self, org, wd, isdir): + def get_chain_file(self, org, wd, is_data_dir): # Defines the Chain files for different conversions. + # input_parameter: org = organism + # wd = working directory + # is_data_dir = is wd data_dir or not + + # return_value: Link to Chainfile for conversion. 
+ # Custom chainfiles and chainfiles for more organism can be specified in this section if org == "hg19": - if isdir: + if is_data_dir: file_link = os.path.join(wd+"temp/hg38tohg19.over.chain.gz") else: file_link = os.path.join(wd + "/data/temp/hg38tohg19.over.chain.gz" ) @@ -40,7 +62,7 @@ def get_chain_file(self, org, wd, isdir): return file_link elif org == "mm9": - if isdir: + if is_data_dir: file_link = os.path.join(wd+"temp/mm10ToMm9.over.chain.gz") else: file_link = os.path.join(wd + "/data/temp/hg38tohg19.over.chain.gz" ) diff --git a/bin/3.1_create_gtf/Modules/Ensembl/ActivityCategorizer.py b/bin/3.1_create_gtf/Modules/Ensembl/ActivityCategorizer.py index bd82a92..af9f856 100644 --- a/bin/3.1_create_gtf/Modules/Ensembl/ActivityCategorizer.py +++ b/bin/3.1_create_gtf/Modules/Ensembl/ActivityCategorizer.py @@ -4,8 +4,22 @@ class ActivityCategorizer: + """ + + Class that categorizes activitydata based on json config and binary activitydata (table.bin). + @author: Sebastian Beyvers + @contact: sebastian.beyvers@med.uni-giessen.de + + """ + def __init__(self, release, organism, wd, data_dir): + # Constructor for ActivityCategorizer + # input_parameter: organism = input organism + # release = current used Ensembl release + # wd = working dir (default working directory, data_dir is used if specified) + # data_dir = data directory (this is used as directory if specified) + # List of all Folders with Activity Tables self.folderlist = [] @@ -27,10 +41,19 @@ def __init__(self, release, organism, wd, data_dir): print("Categorization finished !") def get_categorization(self): + + # Getter method to return the self.categorization variable + return self.categorization def read_config(self, organism, wd): + # Method to read the celltypes_organism.json config file + # input_parameter: organism = input organism + # wd = working directory to find the config files. 
+ # return_value: Dictionary with ensembl aliases based on config + # -> Key = type (from config), value = list of ensembl aliases + c_dict = {} path_to_config = os.path.join(wd +"/config/celltypes_"+organism+".json") with open(path_to_config) as input_file: @@ -43,6 +66,13 @@ def read_config(self, organism, wd): def get_activity_data(self, release, organism, wd, data_dir): + # Method to read the binary table.bin file and return its content as bytearray + # input_parameter: organism = input organism + # release = current used Ensembl release + # wd = working dir (default working directory, data_dir is used if specified) + # data_dir = data directory (this is used as directory if specified) + # return_value: bytearray with activitystatus + for folder in self.folderlist: # Generate path to binary File if data_dir: @@ -53,6 +83,9 @@ def get_activity_data(self, release, organism, wd, data_dir): self.activity[folder] = bytearray(tables.read()) def generate_categorized_activity(self): + + # Categorizes the Activity by config defined categories. + category_activity = {} for category, aliases in self.c_dict.items(): @@ -80,10 +113,16 @@ def generate_categorized_activity(self): def activity_comparator(self, aliaslist): + # Method to determine the resulting activitystatus if the entry contains + # multiple differing activitystatus from aliases + # e.g. 
if one alias is ACTIVE and one INACTIVE the result will be ACTIVE -> see wiki for more detailed info + # input_parameter: aliaslist = list of aliases for activity_data + # return_value: Array of Activitystatus by category (type in config) + concatenated_array = bytearray([]) length = len(self.activity[aliaslist[0]]) - input_arrays = [self.activity[x] for x in aliaslist] + input_arrays = [self.activity[index] for index in aliaslist] for x in range(length): if any(y[x] == 0 for y in input_arrays): concatenated_array.append(0) @@ -103,4 +142,4 @@ def activity_comparator(self, aliaslist): # e = ActivityCategorizer("../../config/celltypes_human.json", "release-94", "homo_sapiens") # print(len(e.categorization)) # for x in e.categorization.values(): -# print(len(x)) \ No newline at end of file +# print(len(x)) diff --git a/bin/3.1_create_gtf/Modules/Ensembl/ActivityTable.py b/bin/3.1_create_gtf/Modules/Ensembl/ActivityTable.py index 288bf7d..1354356 100644 --- a/bin/3.1_create_gtf/Modules/Ensembl/ActivityTable.py +++ b/bin/3.1_create_gtf/Modules/Ensembl/ActivityTable.py @@ -8,19 +8,34 @@ class ActivityTable: Class for checking activity_table and generating them. 
activityTable = byte Representation of activity status corresponding to the generator schema default: + 0, "activity=ACTIVE", 1, "activity=POISED", 2, "activity=REPRESSED", 3, "activity=INACTIVE", 4, "activity=NA" + + @author: Sebastian Beyvers + @contact: sebastian.beyvers@med.uni-giessen.de + """ def __init__(self, organism, current_release, wd, data_dir): + + # Constructor for the ActivityTable-Class + # input_parameter: organism = input organism + # current_release = current used Ensembl release + # wd = working dir (default working directory, data_dir is used if specified) + # data_dir = data directory (this is used as directory if specified) + if data_dir: self.link = os.path.join(data_dir + "/EnsemblData/", current_release, organism, "activity") else: self.link = os.path.join(wd + "/data/EnsemblData/", current_release, organism, "activity") self.folders = next(os.walk(self.link))[1] + + # List to represent Index with activitystatus for ATGenerator class + self.generator = ATGenerator(["activity=ACTIVE", "activity=POISED", "activity=REPRESSED", @@ -28,17 +43,29 @@ def __init__(self, organism, current_release, wd, data_dir): "activity=NA"]) def check_and_generate_activity_table(self): - # checks if file already exists and generates new one if missing + + # checks if file (table.bin) already exists for celltype -> generates new one if missing + for subfolder in self.folders: folder_link = os.path.join(self.link, subfolder) sf_link = os.path.join(folder_link, "table.bin") + + # If table.bin is missing: + if not os.path.isfile(sf_link): print("No ActivityTable for "+subfolder+" found, generating new one.") self.generate_table(folder_link) + + # Else: Do nothing + print("All ActivityTables found, proceeding") def generate_table(self, link): + # generates the table and saves it as table.bin file + # input_parameter: link = link to ensembl activity folder for specific celltype + # generates table.bin file in link folder + for root, dirs, files in os.walk(link): for 
file in files: if file.endswith(".gff.gz"): @@ -47,7 +74,8 @@ def generate_table(self, link): with open(file_path, "wb") as f: f.write(self.generator.read_table(originpath)) print("New ActivityTable generated in: " + root) -# Debug -#e = ActivityTable("homo_sapiens", "release-94") -#e.check_and_generate_activity_table() \ No newline at end of file + +# Debug +# e = ActivityTable("homo_sapiens", "release-94") +# e.check_and_generate_activity_table() diff --git a/bin/3.1_create_gtf/Modules/Ensembl/ActivityTableGenerator.py b/bin/3.1_create_gtf/Modules/Ensembl/ActivityTableGenerator.py index f110236..8793367 100644 --- a/bin/3.1_create_gtf/Modules/Ensembl/ActivityTableGenerator.py +++ b/bin/3.1_create_gtf/Modules/Ensembl/ActivityTableGenerator.py @@ -4,14 +4,26 @@ class ATGenerator: """ - Reads saved activity binary files (table.bin) + + Reads gzip files for activitydata and generates a bytearray with activitystatus. + @author: Sebastian Beyvers + @contact: sebastian.beyvers@med.uni-giessen.de + """ def __init__(self, repre): + # Constructor with parameter representation: List of keywords with a corresponding index + # Only the Index as byte will be appended to a bytearray. + self.representation = repre def read_table(self, file): + + # Reads the activity-ensembl file ("/data/EnsemblData/release/organism/activity/*") + # and returns the activity as bytearray, based on the representation in self.representation. + # Only the index is saved -> see ActivityTable.py for current representation. 
+ activity_table = [] with gzip.open(file, 'rb') as f: for line in f: diff --git a/bin/3.1_create_gtf/Modules/Ensembl/Ensembl.py b/bin/3.1_create_gtf/Modules/Ensembl/Ensembl.py index 88adeb1..8ab23ac 100644 --- a/bin/3.1_create_gtf/Modules/Ensembl/Ensembl.py +++ b/bin/3.1_create_gtf/Modules/Ensembl/Ensembl.py @@ -7,23 +7,41 @@ class Ensembl: """ + Main class for handling Ensembl Regulatory data Checks for local files and downloads if files are missing + @author: Sebastian Beyvers + @contact: sebastian.beyvers@med.uni-giessen.de + """ def __init__(self, organism, wd, data_dir): + + # Constructor and main method for Ensembl-GTF-Creation + # input_parameter: organism = input organism + # wd = working directory + # data_dir = use data_dir parameter if specified. + print("Starting Ensembl") + # Check and Update for Local Ensembl Release Data self.updater = FTPRetriever(organism, wd, data_dir) self.release = self.updater.get_release() + # Check for Activitytables (table.bin binary files) and generate if not existing self.acttable = ActivityTable(organism, self.release, wd, data_dir) self.acttable.check_and_generate_activity_table() + # Categorize the Activitytable by config defined categories (config: ./config/celltypes_organism.json) self.categorizer = ActivityCategorizer(self.release, organism, wd, data_dir) print("Generating GTF") + # Instantiate the GTF generator self.gtf_generator = GTFGen(organism, self.release, wd, data_dir) print("Ensembl Finished !") def get_gtf(self): + + # Getter Method for resulting GTF-Entries as List. + # return_value: list of gtf entries. 
+ return self.gtf_generator.get_gtf(self.release, self.categorizer.get_categorization()) #e = Ensembl("homo_sapiens") diff --git a/bin/3.1_create_gtf/Modules/Ensembl/FTPHandling/FTPEntry.py b/bin/3.1_create_gtf/Modules/Ensembl/FTPHandling/FTPEntry.py index 0d7bdc3..531fd18 100644 --- a/bin/3.1_create_gtf/Modules/Ensembl/FTPHandling/FTPEntry.py +++ b/bin/3.1_create_gtf/Modules/Ensembl/FTPHandling/FTPEntry.py @@ -4,25 +4,51 @@ class FTPEntry: """ - Class to determine if a ftp-path is file or directory. + + Class to determine if a ftp-path is file or directory. + Assigns every filename a parameter ('d' = directory or 'f' = file) + @author: Sebastian Beyvers + @contact: sebastian.beyvers@med.uni-giessen.de + """ def __init__(self, filename, ftpobj, startingdir=None): + + # Constructor + # input_parameter: filename = the file's/directory's name + # ftpobj = Current Instance of FTPLib from URLRetrieve + # startingdir = optional parameter for ftp starting directory (to reduce browsing in ftp) + self.filename = filename if startingdir is None: startingdir = ftpobj.pwd() + + # Try if "filename" is directory + try: ftpobj.cwd(filename) self.filetype = 'd' ftpobj.cwd(startingdir) + + # If error_perm occurs filename is file not directory + except ftplib.error_perm: self.filetype = 'f' def gettype(self): + + # Getter method for filetype + return self.filetype def getfilename(self): + + # Getter method for filename + return self.filename def __repr__(self): + + # Change the representation scheme for FTPEntry object. 
+ return self.filename, self.filetype diff --git a/bin/3.1_create_gtf/Modules/Ensembl/FTPHandling/URLRetrieve.py b/bin/3.1_create_gtf/Modules/Ensembl/FTPHandling/URLRetrieve.py index 6b45598..ee4ce83 100644 --- a/bin/3.1_create_gtf/Modules/Ensembl/FTPHandling/URLRetrieve.py +++ b/bin/3.1_create_gtf/Modules/Ensembl/FTPHandling/URLRetrieve.py @@ -5,30 +5,58 @@ class FTPHandler: """ - Class to browse through ftp folders and download entries to local file + Class to browse through ftp folders and download entries to local file + @author: Sebastian Beyvers + @contact: sebastian.beyvers@med.uni-giessen.de + """ def __init__(self, url, wd): + + # Constructor + # input_parameters: wd = working directory + # url = Url where to browse (in this case ftp.ensembl.org) + self.ftp = ftplib.FTP(url) self.ftp.login() self.ftp.cwd(wd) - def change_dir(self, wd): - self.ftp.cwd(wd) + def change_dir(self, to_dir): + + # Change ftp current working directory to parameter to_dir + + self.ftp.cwd(to_dir) def get_all_entries(self): + + # Get all ftp entries at current working directory + return self.ftp.nlst() def get_all_entries_from_dir(self, dire): + + # Helper method to get all entries in directory (dire) + # input_parameter: dire = directory where to get all entries + # return value: list of all entries + self.change_dir(dire) return self.get_all_entries() def get_all_entries_as_FTPEntry(self): - # Get All Files + + # Get All Files as FTPEntry + # returns list of FTPEntry objects, these objects are helper objects for easier determination if a + # FTP-Entry is file or Folder + files = self.ftp.nlst() return [FTPEntry(item, self.ftp, self.ftp.pwd()) for item in files] def save_entries_to_file(self, origin, target): + + # Method to save targeted entries to file + # input_parameter: origin = Origin FTP-Entries + # target = directory, where to save the files. 
+ self.change_dir(origin) for file in self.get_all_entries_as_FTPEntry(): if file.gettype() == "f": diff --git a/bin/3.1_create_gtf/Modules/Ensembl/FTPHandling/VersionChecker.py b/bin/3.1_create_gtf/Modules/Ensembl/FTPHandling/VersionChecker.py index d4c1ca6..7a8f066 100644 --- a/bin/3.1_create_gtf/Modules/Ensembl/FTPHandling/VersionChecker.py +++ b/bin/3.1_create_gtf/Modules/Ensembl/FTPHandling/VersionChecker.py @@ -5,11 +5,20 @@ class EnsemblRegulationFTPRetriever: """ - Class for checking current version locally and remote on ftp. - And downloading newest version if necessary + Class for checking current version locally and remote on ftp. + And downloading newest version if necessary + @author: Sebastian Beyvers + @contact: sebastian.beyvers@med.uni-giessen.de + """ def __init__(self, organism, wd, data_dir): + + # Constructor: + # input_parameter: organism = input organism + # wd = working dir (default working directory, data_dir is used if specified) + # data_dir = data directory (this is used as directory if specified) + self.site_ftp = FTPHandler("ftp.ensembl.org", "pub") self.remoteversion = self.get_current_ftp_version() self.localversion = self.get_current_local_version(wd, data_dir) @@ -19,9 +28,16 @@ def __init__(self, organism, wd, data_dir): print("Newest Version installed, no update needed.") def get_release(self): + + # Getter method for release version from FTP. 
+ return self.remoteversion def get_current_ftp_version(self): + + # Gets the current ftp-version from ftp.ensembl.org + # return_value: string for current release on FTP + entries = self.site_ftp.get_all_entries() versionlist = [] for entry in entries: @@ -32,6 +48,11 @@ def get_current_ftp_version(self): return c_release def check_organism(self, organism, release, wd, data_dir): + + # Check if organism is locally existing + # input_parameter: as in __init__ + # return_value: Boolean if data locally exists or not + if data_dir: if organism in next(os.walk(os.path.join(data_dir+"/EnsemblData/"+release+"/")))[1]: return False @@ -46,6 +67,11 @@ def check_organism(self, organism, release, wd, data_dir): return True def get_current_local_version(self, wd, data_dir): + + # Method to check for the current local version + # input_parameters: wd, data_dir as in __init__() + # return_value: String for local release_version or if not existing None + if data_dir: directories = next(os.walk(os.path.join(data_dir + "/EnsemblData/")))[1] else: @@ -63,6 +89,10 @@ def get_current_local_version(self, wd, data_dir): def check_version_difference(self, organism, wd, data_dir): + # Method to check if local version is differing from remote version + # input_parameters: wd, data_dir, organism as in __init__() + # return_value: Boolean if the version is differing or not + local_version = self.localversion remote_version = self.remoteversion if local_version is None: @@ -81,6 +111,12 @@ def check_version_difference(self, organism, wd, data_dir): def download_currentversion_version(self, version, organism, wd, data_dir): + # Method to download current version from FTP if local version is not up-to-date + # input_parameters: version = version to download + # organism = input organism + # wd = working directory + # data_dir = data directory + # Download Base File if data_dir: targetfolder = os.path.join(data_dir + "/EnsemblData/", version, organism) @@ -108,4 +144,5 @@ def 
download_currentversion_version(self, version, organism, wd, data_dir): self.site_ftp.save_entries_to_file(link_origin, link_local) -#e = EnsemblRegulationFTPRetriever("mus_musculus") \ No newline at end of file +# Debug section +# e = EnsemblRegulationFTPRetriever("mus_musculus") \ No newline at end of file diff --git a/bin/3.1_create_gtf/Modules/Ensembl/GTFGen.py b/bin/3.1_create_gtf/Modules/Ensembl/GTFGen.py index 98f88da..db37811 100644 --- a/bin/3.1_create_gtf/Modules/Ensembl/GTFGen.py +++ b/bin/3.1_create_gtf/Modules/Ensembl/GTFGen.py @@ -1,24 +1,37 @@ import os import gzip -import csv class GTFGen: - """ Class to generate Ensembl GTF-data with activity + @author: Sebastian Beyvers + @contact: sebastian.beyvers@med.uni-giessen.de + """ def __init__(self, organism, release, wd, data_dir): + # Constructor for GTFGen + # input_parameter: organism = input organism + # release = used Ensembl release + # wd = working directory (default is ".") + # data_dir = data directory (if specified this is used) + self.gff_lines = self.get_organism_as_gff(organism, release, wd, data_dir) + + # Map to assign numbers from Activitytable-binary to activity status + self.value_map = {0: "ACTIVE", 1: "POISED", 2: "REPRESSED", 3: "INACTIVE", 4: "NA"} def get_organism_as_gff(self, organism, release, wd, data_dir): # reads the original gff file for organism + # input_parameter as in __init__ described. 
+ # return_value: list of gff-entries + if data_dir: directory = os.path.join(data_dir + "/EnsemblData/", release, organism) else: @@ -31,9 +44,12 @@ def get_organism_as_gff(self, organism, release, wd, data_dir): with gzip.open(inputfile) as original_file: return original_file.readlines() - def reformat_to_gff(self, activity, release): + def reformat_to_gtf(self, activity, release): # Reformats gff to gtf and appends activity-data for config specified celltype-categories + # input_parameter: activity = list of activity status for all genes + # release = current ensembl release + # return_value: List of gtf-formatted entries gtf_return = [] @@ -67,7 +83,12 @@ def reformat_to_gff(self, activity, release): @staticmethod def generate_additional_information(gene_id, activity): - # helper method to concat activity information to string + + # helper method to concat activity information to string and reformat from gff to gtf-style + # input_parameter: gene_id = gene_id formatted in gff format + # activity = List of activity-data for specified gene + # return_value: String for attributes (column 9) in gtf-format + if gene_id.startswith("ID=regulatory_region:"): gene_id = 'gene_id "'+gene_id.split(':')[1]+'"' elif gene_id.startswith("ID=E"): @@ -78,15 +99,24 @@ def generate_additional_information(gene_id, activity): return gene_id+'; '+activity_string def generate_activity_list(self, activity, index): - # generates activity list + + # generates activity list for a specified index + # input_parameter: index = index for a specified gene + # activity = List of activity-data for all entries + # return_value: List of activity for gene at index + activity_list = [] for key, value in activity.items(): activity_list.append(key+">"+self.value_map[value[index]]) return activity_list def get_gtf(self, release, activity): - # returns the resulting gtf-formatted-list - return self.reformat_to_gff(activity, release) + + # getter function for the resulting gtf-formatted-list + # 
input_parameters: release, activity as in self.reformat_to_gtf() + # return_value: List of GTF-Entries + + return self.reformat_to_gtf(activity, release) diff --git a/bin/3.1_create_gtf/Modules/Ensembl/checksums.py b/bin/3.1_create_gtf/Modules/Ensembl/checksums.py deleted file mode 100644 index 8f8b92a..0000000 --- a/bin/3.1_create_gtf/Modules/Ensembl/checksums.py +++ /dev/null @@ -1,23 +0,0 @@ -import hashlib - -# Python implementation of linux sum (BSD 16-bit Checksum) commandline tool. - -""" -Unused script with checksum implementations in Python -""" - -def bsdchecksum(infile): - with open(infile, 'rb') as f: - file_bytes = f.read() - c_sum = 0 - for char in file_bytes: - c_sum = (c_sum >> 1) + ((c_sum & 1) << 15) - c_sum += char - c_sum &= 0xffff - return c_sum - - -def md5_checksum(infile): - with open(infile, 'rb') as f: - file_bytes = f.read() - return hashlib.md5(file_bytes).hexdigest() diff --git a/bin/3.1_create_gtf/Modules/ucsc/ucsc.py b/bin/3.1_create_gtf/Modules/ucsc/ucsc.py index 66eb2ac..a4a315b 100644 --- a/bin/3.1_create_gtf/Modules/ucsc/ucsc.py +++ b/bin/3.1_create_gtf/Modules/ucsc/ucsc.py @@ -8,17 +8,32 @@ class UcscGtf: """ - Class to gather ucsc refSeq-FuncElem data. + + Class to gather data from UCSC Table Browsers, RefFuncGen Tracks. 
+ @author: Sebastian Beyvers + @contact: sebastian.beyvers@med.uni-giessen.de """ def __init__(self, org, wd, data_dir): + + # Constructor for UcscGtf + # input parameter: org = organism + # wd = working directory + # data_dir = data_directory (if defined this is used to save and get data) + self.organism_id = self.get_organism_id(org) + + # FTPlink to UCSC bigbed File + self.link = "http://hgdownload.soe.ucsc.edu/gbdb/"+self.organism_id+"/ncbiRefSeq/refSeqFuncElems.bb" + + # Where to save the output file if data_dir: self.output = os.path.join(data_dir + "/UCSCData" + self.organism_id+".bed") else: self.output = os.path.join(wd + "/data/UCSCData/" + self.organism_id+".bed") + # Determine path to bigBedToBed binary. self.path_to_bin = os.path.join(wd + "/Modules/ucsc/bigBedToBed") print("Getting UCSC Data") print("Path to Bin: " + self.path_to_bin) @@ -28,12 +43,17 @@ def __init__(self, org, wd, data_dir): print("UCSC finished !") def generate_gff_file(self): + # Call bigBedToBed binary to get a Bed file in the UCSCData folder + callstring = [self.path_to_bin, self.link, self.output] subprocess.call(callstring) def read_gff_to_gtf(self): + # Reads Bed File and return a gtf-formatted list of elements. 
+ # return_value: GTF-formatted List of regulation entries from UCSC + gtf_lines = [] with open(self.output, 'r') as csvfile: tsvreader = csv.reader(csvfile, delimiter='\t') @@ -53,7 +73,11 @@ def read_gff_to_gtf(self): return gtf_lines def find_ID(self, line): + # Find RefSeq ID in Line + # input_parameter: line = current line from bedfile + # return_value: string with gene_id in gtf-format + pattern = re.compile(r'ID:[0-9]{,9}|$') ref_id = re.search(pattern, line).group() splitted = ref_id.split(":") @@ -65,7 +89,11 @@ def find_ID(self, line): return returnstring def get_activity(self, line): + # Find activity categories in bed file + # input_parameter: line = current line from bedfile + # return_value: list with activity for specified line("keystatus") + key_status = [] for key, value in self.ucsc_categories.items(): if value: @@ -79,7 +107,11 @@ def get_activity(self, line): @staticmethod def get_organism_id(org): + # convert intern name e.g. "homo_sapiens" to ucsc name "hg38". + # input_parameter: org = organism parameter + # return_value: UCSC alias for this organism [ mm10 | hg38 ] + if org == "homo_sapiens": return "hg38" elif org == "mus_musculus": @@ -87,7 +119,12 @@ def get_organism_id(org): @staticmethod def get_activity_categories(organism, wd): + # Method to get ucsc-celltype categories from JSON config + # input_parameter: organism = organism parameter + # wd = working directory, to find config file + # return_value: Dictionary of categories from config. + path_to_config = os.path.join(wd+"/config/celltypes_" + organism + ".json") categories = {} with open(path_to_config) as input_file: @@ -98,4 +135,8 @@ def get_activity_categories(organism, wd): return categories def get_gtf(self): + + # Getter method for resulting gtf-lines + # return_value: List of gtf-formatted Strings (Lines) + return self.gtf_lines