loosolab · SebastianBeyvers · Jan 8, 2019 · Jan 3, 2019 · Jan 3, 2019 · Jan 3, 2019
diff --git a/bin/3.1_create_gtf/Modules/CrossMapper.py b/bin/3.1_create_gtf/Modules/CrossMapper.py
@@ -10,26 +10,49 @@ class CrossMapper:
     Class to download chain_files for chrossmapping hg38 or mm10 to older assembly versions.
     Utilizes CrossMap.py. see wiki for more information.
 
+    @author: Sebastian Beyvers
+    @contact: sebastian.beyvers@med.uni-giessen.de
+
+
     """
 
     def __init__(self, org, wd, out, is_dir):
-        self.org = org
+
+        # Constructor for CrossMapper class
+        # input_parameter: org    = input organism
+        #                  wd     = working directory
+        #                  out    = path to output-file -> Parameter
+        #                  is_dir = boolean if wd is data_dir or just working directory
+
+        # Get path to tempfile / outputfile and chain-file
+
         if is_dir:
-            self.infile = os.path.join( wd + "/temp/" + org + ".gtf")
+            self.infile = os.path.join(wd + "/temp/" + org + ".gtf")
         else:
             self.infile = os.path.join(wd+"/data/temp/"+org+".gtf")
-        self.outfile = os.path.join(out+"/" + org + "_mapped.gtf")
+        self.outfile = os.path.join(out)
         self.chainfile = self.get_chain_file(org, wd, is_dir)
-        # Execute Crossmapper for gff/gtf files
+
+        # Execute Crossmap for gff/gtf files
+
         (mapTree, targetChromSizes, sourceChromSizes) = CrossMap.read_chain_file(self.chainfile)
+
+        # Map results and save output to self.outfile
+
         CrossMap.crossmap_gff_file(mapTree, self.infile, self.outfile)
 
-    def get_chain_file(self, org, wd, isdir):
+    def get_chain_file(self, org, wd, is_data_dir):
 
         # Defines the Chain files for different conversions.
+        # input_parameter: org         = organism
+        #                  wd          = working directory
+        #                  is_data_dir = is wd data_dir or not
+
+        # return_value: Link to chain-file for conversion.
+        # Custom chain-files and chain-files for more organisms can be specified in this section
 
         if org == "hg19":
-            if isdir:
+            if is_data_dir:
                 file_link = os.path.join(wd+"temp/hg38tohg19.over.chain.gz")
             else:
                 file_link = os.path.join(wd + "/data/temp/hg38tohg19.over.chain.gz" )
@@ -39,7 +62,7 @@ def get_chain_file(self, org, wd, isdir):
             return file_link
 
         elif org == "mm9":
-            if isdir:
+            if is_data_dir:
                 file_link = os.path.join(wd+"temp/mm10ToMm9.over.chain.gz")
             else:
                 file_link = os.path.join(wd + "/data/temp/hg38tohg19.over.chain.gz" )

diff --git a/bin/3.1_create_gtf/Modules/Ensembl/ActivityCategorizer.py b/bin/3.1_create_gtf/Modules/Ensembl/ActivityCategorizer.py
@@ -4,8 +4,22 @@
 
 class ActivityCategorizer:
 
+    """
+
+        Class that categorizes activitydata based on json config and binary activitydata (table.bin).
+        @author: Sebastian Beyvers
+        @contact: sebastian.beyvers@med.uni-giessen.de
+
+     """
+
     def __init__(self, release, organism, wd, data_dir):
 
+        # Constructor for ActivityCategorizer
+        # input_parameter: organism        = input organism
+        #                  release         = current used Ensembl release
+        #                  wd              = working dir (default working directory, data_dir is used if specified)
+        #                  data_dir        = data directory (this is used as directory if specified)
+
         # List of all Folders with Activity Tables
 
         self.folderlist = []
@@ -27,10 +41,19 @@ def __init__(self, release, organism, wd, data_dir):
         print("Categorization finished !")
 
     def get_categorization(self):
+
+        # Getter method to return the self.categorization variable
+
         return self.categorization
 
     def read_config(self, organism, wd):
 
+        # Method to read the celltypes_organism.json config file
+        # input_parameter: organism = input organism
+        #                        wd = working directory to find the config files.
+        # return_value: Dictionary with ensembl aliases based on config
+        # -> Key = type (from config), value = list of ensembl aliases
+
         c_dict = {}
         path_to_config = os.path.join(wd +"/config/celltypes_"+organism+".json")
         with open(path_to_config) as input_file:
@@ -43,6 +66,13 @@ def read_config(self, organism, wd):
 
     def get_activity_data(self, release, organism, wd, data_dir):
 
+        # Method to read the binary table.bin file and return its content as bytearray
+        # input_parameter: organism        = input organism
+        #                  release         = current used Ensembl release
+        #                  wd              = working dir (default working directory, data_dir is used if specified)
+        #                  data_dir        = data directory (this is used as directory if specified)
+        # return_value: bytearray with activitystatus
+
         for folder in self.folderlist:
             # Generate path to binary File
             if data_dir:
@@ -53,6 +83,9 @@ def get_activity_data(self, release, organism, wd, data_dir):
                 self.activity[folder] = bytearray(tables.read())
 
     def generate_categorized_activity(self):
+
+        # Categorizes the Activity by config defined categories.
+
         category_activity = {}
 
         for category, aliases in self.c_dict.items():
@@ -80,10 +113,16 @@ def generate_categorized_activity(self):
 
     def activity_comparator(self, aliaslist):
 
+        # Method to determine the resulting activitystatus if the entry contains
+        # multiple differing activitystatus from aliases
+        # e.g. if one alias is ACTIVE and one INACTIVE the result will be ACTIVE -> see wiki for more detailed info
+        # input_parameter: aliaslist = list of aliases for activity_data
+        # return_value: Array of Activitystatus by category (type in config)
+
         concatenated_array = bytearray([])
 
         length = len(self.activity[aliaslist[0]])
-        input_arrays = [self.activity[x] for x in aliaslist]
+        input_arrays = [self.activity[index] for index in aliaslist]
         for x in range(length):
             if any(y[x] == 0 for y in input_arrays):
                 concatenated_array.append(0)
@@ -103,4 +142,4 @@ def activity_comparator(self, aliaslist):
 # e = ActivityCategorizer("../../config/celltypes_human.json", "release-94", "homo_sapiens")
 # print(len(e.categorization))
 # for x in e.categorization.values():
-#     print(len(x))
+#     print(len(x))
diff --git a/bin/3.1_create_gtf/Modules/Ensembl/ActivityTable.py b/bin/3.1_create_gtf/Modules/Ensembl/ActivityTable.py
@@ -8,37 +8,64 @@ class ActivityTable:
     Class for checking activity_table and generating them.
     activityTable = byte Representation of activity status
     corresponding to the generator schema default:
+
     0, "activity=ACTIVE",
     1, "activity=POISED",
     2, "activity=REPRESSED",
     3, "activity=INACTIVE",
     4, "activity=NA"
+
+    @author: Sebastian Beyvers
+    @contact: sebastian.beyvers@med.uni-giessen.de
+
     """
 
     def __init__(self, organism, current_release, wd, data_dir):
+
+        # Constructor for the ActivityTable-Class
+        # input_parameter: organism        = input organism
+        #                  current_release = current used Ensembl release
+        #                  wd              = working dir (default working directory, data_dir is used if specified)
+        #                  data_dir        = data directory (this is used as directory if specified)
+
         if data_dir:
             self.link = os.path.join(data_dir + "/EnsemblData/", current_release, organism, "activity")
         else:
             self.link = os.path.join(wd + "/data/EnsemblData/", current_release, organism, "activity")
         self.folders = next(os.walk(self.link))[1]
+
+        # List mapping each index to its activity status, passed to the ATGenerator class
+
         self.generator = ATGenerator(["activity=ACTIVE",
                                "activity=POISED",
                                "activity=REPRESSED",
                                "activity=INACTIVE",
                                "activity=NA"])
 
     def check_and_generate_activity_table(self):
-        # checks if file already exists and generates new one if missing
+
+        # checks if file (table.bin) already exists for celltype -> generates new one if missing
+
         for subfolder in self.folders:
             folder_link = os.path.join(self.link, subfolder)
             sf_link = os.path.join(folder_link, "table.bin")
+
+            # If table.bin is missing:
+
             if not os.path.isfile(sf_link):
                 print("No ActivityTable for "+subfolder+" found, generating new one.")
                 self.generate_table(folder_link)
+
+            # Else: Do nothing
+
         print("All ActivityTables found, proceeding")
 
     def generate_table(self, link):
+
         # generates the table and saves it as table.bin file
+        # input_parameter: link = link to ensembl activity folder for specific celltype
+        # generates table.bin file in link folder
+
         for root, dirs, files in os.walk(link):
             for file in files:
                 if file.endswith(".gff.gz"):
@@ -47,7 +74,8 @@ def generate_table(self, link):
                     with open(file_path, "wb") as f:
                         f.write(self.generator.read_table(originpath))
                     print("New ActivityTable generated in: " + root)
-# Debug
 
-#e = ActivityTable("homo_sapiens", "release-94")
-#e.check_and_generate_activity_table()
+
+# Debug
+# e = ActivityTable("homo_sapiens", "release-94")
+# e.check_and_generate_activity_table()
diff --git a/bin/3.1_create_gtf/Modules/Ensembl/ActivityTableGenerator.py b/bin/3.1_create_gtf/Modules/Ensembl/ActivityTableGenerator.py
@@ -4,14 +4,26 @@
 class ATGenerator:
 
     """
-    Reads saved activity binary files (table.bin)
+
+    Reads gzip files for activitydata and generates a bytearray with activitystatus.
+    @author: Sebastian Beyvers
+    @contact: sebastian.beyvers@med.uni-giessen.de
+
     """
 
     def __init__(self, repre):
 
+        # Constructor with parameter representation: List of keywords with a corresponding index
+        # Only the Index as byte will be appended to a bytearray.
+
         self.representation = repre
 
     def read_table(self, file):
+
+        # Reads the activity-ensembl file ("/data/EnsemblData/release/organism/activity/*")
+        # and returns the activity as bytearray, based on the representation in self.representation.
+        # Only the index is saved -> see ActivityTable.py for current representation.
+
         activity_table = []
         with gzip.open(file, 'rb') as f:
             for line in f:

diff --git a/bin/3.1_create_gtf/Modules/Ensembl/Ensembl.py b/bin/3.1_create_gtf/Modules/Ensembl/Ensembl.py
@@ -7,23 +7,41 @@
 class Ensembl:
 
     """
+
     Main class for handling Ensembl Regulatory data
     Checks for local files and downloads if files are missing
+    @author: Sebastian Beyvers
+    @contact: sebastian.beyvers@med.uni-giessen.de
+
     """
 
     def __init__(self, organism, wd, data_dir):
+
+        # Constructor and main method for Ensembl-GTF-Creation
+        # input_parameter: organism    = input organism
+        #                  wd          = working directory
+        #                  data_dir    = use data_dir parameter if specified.
+
         print("Starting Ensembl")
+        # Check and Update for Local Ensembl Release Data
         self.updater = FTPRetriever(organism, wd, data_dir)
         self.release = self.updater.get_release()
+        # Check for Activitytables (table.bin binary files) and generate if not existing
         self.acttable = ActivityTable(organism, self.release, wd, data_dir)
         self.acttable.check_and_generate_activity_table()
+        # Categorize the Activitytable by config defined categories (config: ./config/celltypes_organism.json)
         self.categorizer = ActivityCategorizer(self.release, organism, wd, data_dir)
         print("Generating GTF")
+        # Instantiate the GTF generator
         self.gtf_generator = GTFGen(organism, self.release, wd, data_dir)
 
         print("Ensembl Finished !")
 
     def get_gtf(self):
+
+        # Getter Method for resulting GTF-Entries as List.
+        # return_value: list of gtf entries.
+
         return self.gtf_generator.get_gtf(self.release, self.categorizer.get_categorization())
 
 #e = Ensembl("homo_sapiens")

diff --git a/bin/3.1_create_gtf/Modules/Ensembl/FTPHandling/FTPEntry.py b/bin/3.1_create_gtf/Modules/Ensembl/FTPHandling/FTPEntry.py
@@ -4,25 +4,51 @@
 class FTPEntry:
 
     """
-    Class to determine if a ftp-path is file or directory.
+
+        Class to determine if a ftp-path is file or directory.
+        Assigns every filename a parameter ('d' = directory or 'f' = file)
+        @author: Sebastian Beyvers
+        @contact: sebastian.beyvers@med.uni-giessen.de
+
     """
 
     def __init__(self, filename, ftpobj, startingdir=None):
+
+        # Constructor
+        # input_parameter: filename    = the file's/directory's name
+        #                  ftpobj      = Current Instance of FTPLib from URLRetrieve
+        #                  startingdir = optional parameter for ftp starting directory (to reduce browsing in ftp)
+
         self.filename = filename
         if startingdir is None:
             startingdir = ftpobj.pwd()
+
+        # Check whether "filename" is a directory
+
         try:
             ftpobj.cwd(filename)
             self.filetype = 'd'
             ftpobj.cwd(startingdir)
+
+        # If error_perm occurs, filename is a file, not a directory
+
         except ftplib.error_perm:
             self.filetype = 'f'
 
     def gettype(self):
+
+        # Getter method for filetype
+
         return self.filetype
 
     def getfilename(self):
+
+        # Getter method for filename
+
         return self.filename
 
     def __repr__(self):
+
+        # Return the string representation of the FTPEntry object (__repr__ must return a str, not a tuple).
+
-        return self.filename, self.filetype
+        return repr((self.filename, self.filetype))