diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..9f1b02e --- /dev/null +++ b/.gitignore @@ -0,0 +1,171 @@ +# Created by .ignore support plugin (hsz.mobi) +### Python template +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +.hypothesis/ +.pytest_cache/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# pyenv +.python-version + +# celery beat schedule file +celerybeat-schedule + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +### JetBrains template +# Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and WebStorm +# Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839 + +# User-specific stuff +.idea/**/workspace.xml +.idea/**/tasks.xml +.idea/**/usage.statistics.xml +.idea/**/dictionaries +.idea/**/shelf + +# Sensitive or high-churn files +.idea/**/dataSources/ +.idea/**/dataSources.ids +.idea/**/dataSources.local.xml 
# ---- Modules/Ensembl/ActivityCategorizer.py ----
import json
import os


class ActivityCategorizer:
    """Merge per-cell-type activity tables into one table per category.

    The category -> Ensembl folder mapping is read from
    ``./config/celltypes_<organism>.json`` (keys ``type`` and
    ``alias_ensembl``).  The byte tables are read from
    ``./EnsemblData/<release>/<organism>/activity/<folder>/table.bin``.
    Every byte is an activity code; 4 is used here as the "NA" filler and,
    when a category has several aliases, the smallest code per position wins.
    """

    # Code written for categories that have no Ensembl alias at all.
    NA_VALUE = 4

    def __init__(self, release, organism):
        """Load config and tables, then build ``self.categorization``.

        Args:
            release: Release folder name below ``./EnsemblData``.
            organism: Organism folder name; also selects the config file.
        """
        # Every Ensembl folder referenced by the config, in config order.
        self.folderlist = []
        # category name -> list of Ensembl folder names (aliases)
        self.c_dict = self.read_config(organism)
        # folder name -> bytearray holding that folder's table.bin
        self.activity = {}
        self.get_activity_data(release, organism)

        print("Categorization: This may take a while")
        self.categorization = self.generate_categorized_activity()
        print("Categorization finished !")

    def get_categorization(self):
        """Return the dict ``category -> bytearray`` built in ``__init__``."""
        return self.categorization

    def read_config(self, organism):
        """Read the cell-type config and return ``{type: alias_ensembl}``.

        Side effect: every alias is appended to ``self.folderlist``.
        """
        path_to_config = os.path.join("./config/celltypes_" + organism + ".json")
        with open(path_to_config, encoding="utf-8") as config_file:
            data = json.load(config_file)

        c_dict = {}
        for entry in data:
            c_dict[entry["type"]] = entry["alias_ensembl"]
            self.folderlist.extend(entry["alias_ensembl"])
        return c_dict

    def get_activity_data(self, release, organism):
        """Load ``table.bin`` of every folder in ``self.folderlist``."""
        for folder in self.folderlist:
            table_path = os.path.join("./EnsemblData", release, organism,
                                      "activity", folder, "table.bin")
            with open(table_path, "rb") as table_file:
                self.activity[folder] = bytearray(table_file.read())

    def generate_categorized_activity(self):
        """Return ``{category: bytearray}`` for every configured category.

        * one alias      -> that alias' table (same object, not a copy)
        * several aliases -> position-wise merge via ``activity_comparator``
        * no alias       -> a table filled with ``NA_VALUE``
        """
        # The filler takes its length from the first loaded table.  With no
        # table loaded at all the filler is simply empty (this used to raise
        # IndexError on ``self.folderlist[0]``).
        table_length = len(next(iter(self.activity.values()), b""))

        category_activity = {}
        for category, aliases in self.c_dict.items():
            if not aliases:
                category_activity[category] = bytearray([self.NA_VALUE] * table_length)
            elif len(aliases) == 1:
                category_activity[category] = self.activity[aliases[0]]
            else:
                category_activity[category] = self.activity_comparator(aliases)
        return category_activity

    def activity_comparator(self, aliaslist):
        """Merge the tables of ``aliaslist`` by taking the minimum per position.

        For codes 0..4 this equals the former 0/1/2/3/4 ``any`` cascade, but
        it can no longer skip a position (which shifted every later entry).

        Raises:
            ValueError: if the tables do not all have the same length.
        """
        tables = [self.activity[alias] for alias in aliaslist]
        expected = len(tables[0])
        for alias, table in zip(aliaslist, tables):
            if len(table) != expected:
                raise ValueError(
                    "Activity table of " + alias + " has " + str(len(table))
                    + " entries, expected " + str(expected))
        return bytearray(min(column) for column in zip(*tables))
# ---- Modules/Ensembl/ActivityTable.py ----
import os.path


class ActivityTable:

    """
    Class for checking activity_table and generating them.
    ActivityTable = Byte Representation of Activity Status
    corresponding to the generator Schema default:
    0, "activity=ACTIVE",
    1, "activity=POISED",
    2, "activity=REPRESSED",
    3, "activity=INACTIVE",
    4, "activity=NA"
    """

    def __init__(self, organism, current_release):
        """Locate the activity folders of one organism/release.

        Args:
            organism: Organism folder name below the release folder.
            current_release: Release folder name below ``./EnsemblData``.

        Raises:
            FileNotFoundError: if the ``activity`` directory does not exist
                (previously a bare ``StopIteration`` escaped from here).
        """
        self.link = os.path.join("./EnsemblData/", current_release, organism, "activity")
        try:
            # First os.walk() item is (root, dirs, files); only the direct
            # sub-directories (one per cell type) are of interest.
            self.folders = next(os.walk(self.link))[1]
        except StopIteration:
            raise FileNotFoundError(
                "No activity data directory found at " + self.link) from None

        # Deferred project-local import: keeps this module importable
        # without the generator module being on the path.
        from Modules.Ensembl.ActivityTableGenerator import ATGenerator
        self.generator = ATGenerator(["activity=ACTIVE",
                                      "activity=POISED",
                                      "activity=REPRESSED",
                                      "activity=INACTIVE",
                                      "activity=NA"])

    def check_and_generate_activity_table(self):
        """Create ``table.bin`` in every activity sub-folder that lacks one."""
        for subfolder in self.folders:
            folder_link = os.path.join(self.link, subfolder)
            if not os.path.isfile(os.path.join(folder_link, "table.bin")):
                print("No ActivityTable for "+subfolder+" found, generating new one.")
                self.generate_table(folder_link)
        print("All ActivityTables found, proceeding")

    def generate_table(self, link):
        """Write ``table.bin`` next to every ``*.gff.gz`` found below ``link``.

        The bytes come from ``self.generator.read_table``.  If one directory
        holds several ``*.gff.gz`` files, the last one processed wins.
        """
        for root, _dirs, files in os.walk(link):
            for file in files:
                if not file.endswith(".gff.gz"):
                    continue
                originpath = os.path.join(root, file)
                with open(os.path.join(root, "table.bin"), "wb") as table_file:
                    table_file.write(self.generator.read_table(originpath))
                print("New ActivityTable generated in: " + root)
# ---- Modules/Ensembl/ActivityTableGenerator.py ----
import gzip


class ATGenerator:
    """Turn a gzipped activity GFF into a compact byte table."""

    def __init__(self, repre):
        """Store the marker strings.

        Args:
            repre: Sequence of marker strings; the position of a marker in
                this sequence is the byte written for a matching line.
        """
        self.representation = repre

    def read_table(self, file):
        """Return one byte per line of ``file`` that contains a marker.

        For each line the first marker (in ``self.representation`` order)
        found in the line decides the byte.  Lines without any marker
        contribute nothing.

        The search runs on the raw bytes.  The former ``str(line)`` searched
        the ``b'...'`` repr of the line, where tabs, quotes and non-ASCII
        characters appear escaped and therefore could never be matched.
        """
        markers = [marker.encode("utf-8") for marker in self.representation]
        activity_table = bytearray()
        with gzip.open(file, 'rb') as handle:
            for line in handle:
                for index, marker in enumerate(markers):
                    if marker in line:
                        activity_table.append(index)
                        break
        return activity_table


# ---- Modules/Ensembl/Ensembl.py ----
class Ensembl:
    """Run the Ensembl part of the pipeline for one organism.

    Construction performs all work: release check/download, activity table
    generation, categorization and loading of the regulatory build GFF.
    """

    def __init__(self, organism):
        # Deferred project-local imports: the FTP retriever connects to the
        # server on construction, so nothing is pulled in before it is needed.
        from Modules.Ensembl.ActivityTable import ActivityTable
        from Modules.Ensembl.FTPHandling.VersionChecker import \
            EnsemblRegulationFTPRetriever as FTPRetriever
        from Modules.Ensembl.ActivityCategorizer import ActivityCategorizer
        from Modules.Ensembl.GTFGen import GTFGen

        print("Starting Ensembl")
        self.updater = FTPRetriever(organism)
        self.release = self.updater.get_release()
        self.acttable = ActivityTable(organism, self.release)
        self.acttable.check_and_generate_activity_table()
        self.categorizer = ActivityCategorizer(self.release, organism)
        print("Generating GTF")
        self.gtf_generator = GTFGen(organism, self.release)

        print("Ensembl Finished !")

    def get_gtf(self):
        """Return the GTF rows built from the GFF and the categorization."""
        return self.gtf_generator.get_gtf(self.release, self.categorizer.get_categorization())
# ---- Modules/Ensembl/FTPHandling/FTPEntry.py ----
import ftplib


class FTPEntry:
    """Classify a name on an FTP server as directory (``'d'``) or file (``'f'``).

    The test is "can we ``cwd`` into it": success means directory, an
    ``ftplib.error_perm`` means file.
    """

    def __init__(self, filename, ftpobj, startingdir=None):
        """Probe ``filename`` on ``ftpobj`` and return to ``startingdir``.

        Args:
            filename: Entry name relative to the current FTP directory.
            ftpobj: Object offering ``cwd`` and ``pwd`` like ``ftplib.FTP``.
            startingdir: Directory to return to after a successful probe;
                defaults to ``ftpobj.pwd()``.
        """
        self.filename = filename
        if startingdir is None:
            startingdir = ftpobj.pwd()
        try:
            ftpobj.cwd(filename)
        except ftplib.error_perm:
            self.filetype = 'f'
        else:
            self.filetype = 'd'
            # Outside the try block on purpose: a failure to change back is
            # a real error and must not be reported as "this is a file".
            ftpobj.cwd(startingdir)

    def gettype(self):
        """Return ``'d'`` or ``'f'``."""
        return self.filetype

    def getfilename(self):
        """Return the entry name passed to the constructor."""
        return self.filename

    def __repr__(self):
        # Must return a str; the former tuple made repr() raise TypeError.
        return "FTPEntry(filename={!r}, filetype={!r})".format(self.filename, self.filetype)


# ---- Modules/Ensembl/FTPHandling/URLRetrieve.py ----
import os


class FTPHandler:
    """Thin wrapper around an anonymous ``ftplib.FTP`` session."""

    # Files in a remote folder that are never downloaded.
    SKIPPED_FILES = ("README", "manifest.tsv")

    def __init__(self, url, wd):
        """Connect to ``url``, log in anonymously and change to ``wd``."""
        self.ftp = ftplib.FTP(url)
        self.ftp.login()
        self.ftp.cwd(wd)

    def change_dir(self, wd):
        """Change the remote working directory."""
        self.ftp.cwd(wd)

    def get_all_entries(self):
        """Return the names listed in the current remote directory."""
        return self.ftp.nlst()

    def get_all_entries_from_dir(self, dir):
        """Change to ``dir`` and return the names listed there."""
        self.change_dir(dir)
        return self.get_all_entries()

    def get_all_entries_as_FTPEntry(self):
        """Return the current directory listing as ``FTPEntry`` objects."""
        names = self.ftp.nlst()
        here = self.ftp.pwd()  # asked once instead of once per entry
        return [FTPEntry(name, self.ftp, here) for name in names]

    def save_entries_to_file(self, origin, target):
        """Download every file of remote ``origin`` into local folder ``target``.

        Directories and the names in ``SKIPPED_FILES`` are ignored.  Each
        local file is closed as soon as its transfer ends (the handles used
        to be leaked).
        """
        self.change_dir(origin)
        for entry in self.get_all_entries_as_FTPEntry():
            if entry.gettype() != "f":
                continue
            name = entry.getfilename()
            if name in self.SKIPPED_FILES:
                continue
            print("Downloading....... > " + name)
            with open(os.path.join(target, name), 'wb') as local_file:
                self.ftp.retrbinary("RETR " + name, local_file.write)


# ---- Modules/Ensembl/FTPHandling/VersionChecker.py ----
import os.path
import re


def _release_number(name):
    """Return N for a name ending in ``release-N``, or -1 if there is none."""
    match = re.search(r"release-(\d+)$", name)
    return int(match.group(1)) if match else -1


def _subdirectories(path):
    """Return the direct sub-directory names of ``path`` ([] if missing)."""
    try:
        return next(os.walk(path))[1]
    except StopIteration:
        return []


class EnsemblRegulationFTPRetriever:

    """
    Class for checking current version locally and remote on ftp.
    And downloading newest version if necessary
    """

    def __init__(self, organism):
        """Compare local and remote release and download when required."""
        self.site_ftp = FTPHandler("ftp.ensembl.org", "pub")
        self.remoteversion = self.get_current_ftp_version()
        self.localversion = self.get_current_local_version()
        if self.check_version_difference(organism):
            self.download_currentversion_version(self.remoteversion, organism)
        else:
            print("Newest Version installed, no update needed.")

    def get_release(self):
        """Return the remote release name determined in ``__init__``."""
        return self.remoteversion

    def get_current_ftp_version(self):
        """Return the highest ``release-N`` entry of the current FTP directory.

        Releases are compared by number; plain string sorting ranked
        ``release-99`` above ``release-100``.

        Raises:
            ValueError: if no entry contains ``release``.
        """
        releases = [entry for entry in self.site_ftp.get_all_entries()
                    if "release" in entry]
        if not releases:
            raise ValueError("No release directory found on the FTP server")
        c_release = max(releases, key=_release_number)
        print("Current release is "+c_release)
        return c_release

    def check_organism(self, organism, release):
        """Return True when ``organism`` still has to be installed for ``release``."""
        if organism in _subdirectories("./EnsemblData/"+release+"/"):
            return False
        print("No Local Version for "+organism+" installed. Installing...")
        return True

    def get_current_local_version(self):
        """Return the highest ``release-N`` folder in ``./EnsemblData`` or None.

        All folders are inspected (the old loop returned after the first
        one) and only release folders take part in the comparison.
        """
        releases = [name for name in _subdirectories("./EnsemblData/")
                    if _release_number(name) >= 0]
        if not releases:
            print("No Version installed !")
            return None
        localversion = max(releases, key=_release_number)
        print("Local Version found: " + localversion)
        return localversion

    def check_version_difference(self, organism):
        """Return True when a download is needed.

        That is the case when nothing is installed, the remote release is
        newer, or the organism is missing from the local release.
        """
        local_version = self.localversion
        remote_version = self.remoteversion
        if local_version is None:
            return True

        if _release_number(remote_version) > _release_number(local_version):
            print("Outdated Version detected ! local: " + local_version + " remote: " + remote_version)
            return True
        return self.check_organism(organism, local_version)

    def download_currentversion_version(self, version, organism):
        """Download base files and all activity folders of one organism.

        Local folders are created with ``exist_ok=True`` so that an
        interrupted earlier download can be resumed instead of failing with
        ``FileExistsError``.
        """
        # Base files
        targetfolder = os.path.join("./EnsemblData/", version, organism)
        os.makedirs(targetfolder, exist_ok=True)
        folder_url = "/pub/"+version+"/regulation/"+organism+"/"
        self.site_ftp.change_dir(folder_url)
        self.site_ftp.save_entries_to_file(folder_url, targetfolder)

        # Regulation activity: one remote sub-folder per cell type
        activityfolder_local = os.path.join(targetfolder, "activity")
        activityfolder_remote = folder_url+"RegulatoryFeatureActivity/"
        os.makedirs(activityfolder_local, exist_ok=True)

        celltypes_list = self.site_ftp.get_all_entries_from_dir(activityfolder_remote)
        for celltype in celltypes_list:
            link_local = os.path.join(activityfolder_local, celltype)
            link_origin = activityfolder_remote+"/"+celltype
            os.makedirs(link_local, exist_ok=True)
            self.site_ftp.save_entries_to_file(link_origin, link_local)


# ---- Modules/Ensembl/GTFGen.py ----
import os
import gzip
import csv


class GTFGen:
    """Convert the regulatory build GFF plus activity tables into GTF rows."""

    def __init__(self, organism, release):
        """Load the GFF lines of ``organism``/``release`` into memory."""
        self.gff_lines = self.get_organism_as_gff(organism, release)
        # activity byte -> label used in the GTF attribute column
        self.value_map = {0: "ACTIVE", 1: "POISED", 2: "REPRESSED", 3: "INACTIVE", 4: "NA"}

    def get_organism_as_gff(self, organism, release):
        """Return all lines (bytes) of the organism's ``*gff.gz`` file.

        If several files match, the last one listed is used (as before).

        Raises:
            FileNotFoundError: if the folder holds no ``*gff.gz`` file
                (previously ``gzip.open("")`` failed with an unhelpful message).
        """
        directory = os.path.join("./EnsemblData/", release, organism)
        candidates = [name for name in os.listdir(directory) if name.endswith("gff.gz")]
        if not candidates:
            raise FileNotFoundError("No gff.gz file found in " + directory)
        with gzip.open(os.path.join(directory, candidates[-1])) as original_file:
            return original_file.readlines()

    def reformat_to_gff(self, activity, release):
        """Return one 9-column GTF row (list of str) per GFF line.

        Args:
            activity: dict ``category -> bytes-like``; position ``i``
                belongs to GFF line ``i``.
            release: Release name, used in the source column.
        """
        source = "RegBuild_" + release
        gtf_return = []
        for index, line in enumerate(self.gff_lines):
            columns = line.decode("UTF-8").split("\t")
            attributes = columns[-1].strip().split(";")

            row = ["chr" + columns[0], source]
            # Feature column: value of the fifth ';'-separated attribute
            row.append(attributes[4].split("=")[1])
            row.extend(columns[3:5])        # start, end
            row.extend([".", "+", "."])     # score, strand, frame
            row.append(self.generate_additional_information(
                attributes[0], self.generate_activity_list(activity, index)))
            gtf_return.append(row)
        return gtf_return

    @staticmethod
    def generate_additional_information(id, activity):
        """Return ``"<id>; activity=<a>, <b>, ..."`` for the attribute column."""
        return "; ".join([id, "activity="+", ".join(activity)])

    def generate_activity_list(self, activity, index):
        """Return ``["<category>><LABEL>", ...]`` for table position ``index``."""
        return [category + ">" + self.value_map[table[index]]
                for category, table in activity.items()]

    def get_gtf(self, release, activity):
        """Public entry point; see ``reformat_to_gff``."""
        return self.reformat_to_gff(activity, release)


# ---- Modules/Ensembl/checksums.py ----
import hashlib
import sys

# Files are read in chunks so large downloads are never held in memory.
_CHUNK_SIZE = 1 << 16


def bsdchecksum(infile):
    """Return the BSD 16-bit checksum (as printed by ``sum``) of ``infile``."""
    c_sum = 0
    with open(infile, 'rb') as f:
        for chunk in iter(lambda: f.read(_CHUNK_SIZE), b""):
            for byte in chunk:
                # rotate right by one bit within 16 bits, then add the byte
                c_sum = (c_sum >> 1) + ((c_sum & 1) << 15)
                c_sum = (c_sum + byte) & 0xffff
    return c_sum


def md5_checksum(infile):
    """Return the hexadecimal MD5 digest of ``infile``."""
    digest = hashlib.md5()
    with open(infile, 'rb') as f:
        for chunk in iter(lambda: f.read(_CHUNK_SIZE), b""):
            digest.update(chunk)
    return digest.hexdigest()


if __name__ == '__main__':
    # Paths come from the command line; the former hard-coded path only
    # existed on the original author's machine.
    for _path in sys.argv[1:]:
        print(md5_checksum(_path), bsdchecksum(_path), _path)


# ---- Modules/SaveResults.py ----
class ResultSaver:
    """Write result rows as a tab-separated file below ``./results``."""

    def __init__(self, results, organism, tissue):
        """Write ``results`` and remember the target in ``self.path``.

        Args:
            results: Iterable of rows (sequences of str).
            organism: Used as the file stem.
            tissue: If truthy the file is named ``<organism>_filtered.gtf``,
                otherwise ``<organism>.gtf``.
        """
        print("Save results to File !")
        suffix = "_filtered.gtf" if tissue else ".gtf"
        self.path = os.path.join("./results/" + organism + suffix)

        # The folder may not exist on a fresh checkout.
        os.makedirs("./results", exist_ok=True)
        # newline='' is what the csv module expects; '\n' replaces the
        # csv default '\r\n', which left a stray CR at the end of every row.
        with open(self.path, "w", newline="") as file:
            csv.writer(file, delimiter='\t', lineterminator="\n").writerows(results)


# ---- Modules/Uniquifier.py ----
class UniqueFilter:
    """Combine Ensembl and UCSC rows and optionally filter by tissue."""

    def __init__(self, ense, ucsc, org_filter=None):
        """Compute ``self.results`` from both row lists.

        Args:
            ense: Ensembl GTF rows.
            ucsc: UCSC GTF rows.
            org_filter: Optional list of category names; only rows whose
                last column contains ``<name>>ACTIVE`` are kept.
        """
        self.results = self.get_filtered_results(org_filter, ense, ucsc)

    def get_results(self):
        """Return the rows computed in ``__init__``."""
        return self.results

    def get_filtered_results(self, org_filter, ense, ucsc):
        """Return the de-duplicated rows, restricted to ``org_filter`` if given."""
        unfiltered_results = self.concat_without_duplicates(ense, ucsc)
        if not org_filter:
            return unfiltered_results
        filterstrings = [name + ">ACTIVE" for name in org_filter]
        return [element for element in unfiltered_results
                if any(tissue in element[-1] for tissue in filterstrings)]

    @staticmethod
    def concat_without_duplicates(ense, ucsc):
        """Return ``ense`` followed by the UCSC rows not already covered.

        A UCSC row counts as duplicate when an Ensembl row has the same
        chromosome (column 0) and the same start (column 3).  The former
        nested loop compared ``ense[3]`` (a whole row) and tried to remove
        the complete ``ucsc`` list, so it either removed nothing or raised.
        """
        seen = {(row[0], str(row[3])) for row in ense}
        return list(ense) + [row for row in ucsc
                             if (row[0], str(row[3])) not in seen]
# ---- Modules/ucsc/ucsc.py ----
import subprocess
import csv
import os
import json
import re


class UcscGtf:
    """Fetch the UCSC ``refSeqFuncElems`` track and convert it to GTF rows."""

    # UCSC assembly id for every supported organism name.
    ORGANISM_IDS = {"homo_sapiens": "hg38", "mus_musculus": "mm10"}

    # Compiled once; the trailing ``|$`` makes the search always succeed
    # (with an empty match when no id is present).
    _ID_PATTERN = re.compile(r'ID:[0-9]{,9}|$')

    def __init__(self, org):
        """Download the track for ``org`` and build ``self.gtf_lines``."""
        self.organism_id = self.get_organism_id(org)
        self.link = "http://hgdownload.soe.ucsc.edu/gbdb/"+self.organism_id+"/ncbiRefSeq/refSeqFuncElems.bb"
        self.output = "./UCSCData/"+self.organism_id+".bed"
        print("Getting UCSC Data")
        self.generate_gff_file()
        self.ucsc_categories = self.get_activity_categories(org)
        self.gtf_lines = self.read_gff_to_gtf()
        print("UCSC finished !")

    def generate_gff_file(self):
        """Run the bundled ``bigBedToBed`` to write ``self.output``.

        Raises:
            subprocess.CalledProcessError: if the tool exits non-zero (the
                exit status used to be ignored, which hid the failure until
                the BED file was opened).
        """
        callstring = ["./Modules/ucsc/bigBedToBed", self.link, self.output]
        subprocess.check_call(callstring)

    def read_gff_to_gtf(self):
        """Return one 9-column GTF row (list of str) per BED row.

        BED columns 0-3 and 5 are used directly; everything from column 11
        on is scanned for the id and for the configured category keywords.
        """
        gtf_lines = []
        with open(self.output, 'r', newline='') as csvfile:
            for row in csv.reader(csvfile, delimiter='\t'):
                extra = row[11:]
                attributes = "; ".join([
                    self.find_ID("".join(extra)),
                    ", ".join(self.get_activity(".".join(extra))),
                ])
                gtf_lines.append([row[0], "UCSC", row[3], row[1], row[2],
                                  ".", row[5], ".", attributes])
        return gtf_lines

    def find_ID(self, line):
        """Return the first ``ID:<digits>`` in ``line`` or ``""`` if absent."""
        return self._ID_PATTERN.search(line).group()

    def get_activity(self, line):
        """Return ``"<category>>ACTIVE"`` / ``"<category>>NA"`` per category.

        A category is ACTIVE when at least one of its keywords occurs in
        ``line``; categories without keywords are always NA.
        """
        key_status = []
        for key, keywords in self.ucsc_categories.items():
            active = bool(keywords) and any(keyword in line for keyword in keywords)
            key_status.append(key + (">ACTIVE" if active else ">NA"))
        return key_status

    @staticmethod
    def get_organism_id(org):
        """Return the UCSC assembly id for ``org``.

        Raises:
            ValueError: for an unsupported organism (``None`` used to be
                returned, which surfaced later as a ``TypeError`` while
                building the download URL).
        """
        try:
            return UcscGtf.ORGANISM_IDS[org]
        except KeyError:
            raise ValueError(
                "Unsupported organism: " + repr(org) + " (expected one of "
                + ", ".join(sorted(UcscGtf.ORGANISM_IDS)) + ")") from None

    @staticmethod
    def get_activity_categories(organism):
        """Read the cell-type config and return ``{type: alias_ucsc}``."""
        path_to_config = os.path.join("./config/celltypes_" + organism + ".json")
        with open(path_to_config, encoding="utf-8") as input_file:
            data = json.load(input_file)
        return {entry["type"]: entry["alias_ucsc"] for entry in data}

    def get_gtf(self):
        """Return the rows built in ``__init__``."""
        return self.gtf_lines

    def test_save_to_file(self):
        """Debug helper: dump the rows to ``./results/test.gtf``."""
        with open("./results/test.gtf", "w") as file:
            write_it = csv.writer(file, delimiter='\t')
            for line in self.gtf_lines:
                write_it.writerow(line)


# ---- RegGTFExtractor.py ----
#!/usr/bin/env python3
import argparse


def check_for_local_folder():
    """Create the local data folders if they are missing."""
    for folder in ("./EnsemblData", "./UCSCData"):
        # exist_ok avoids the check-then-create race of isdir() + mkdir()
        os.makedirs(folder, exist_ok=True)


def main_script(org, tissuetype=None):
    """Run the whole pipeline for ``org`` and save the result.

    Args:
        org: Organism name (``homo_sapiens`` or ``mus_musculus``).
        tissuetype: Optional list of category names used as filter.
    """
    # Deferred project-local imports: nothing is loaded unless the
    # pipeline actually runs.
    from Modules.Ensembl.Ensembl import Ensembl
    from Modules.Uniquifier import UniqueFilter
    from Modules.SaveResults import ResultSaver

    check_for_local_folder()
    ucsc = UcscGtf(org)
    ense = Ensembl(org)
    print("Getting Unique Results")
    unique_filter = UniqueFilter(ense.get_gtf(), ucsc.get_gtf(), tissuetype)
    ResultSaver(unique_filter.get_results(), org, tissuetype)


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='GTF-Generator from UCSC Table Browser and Ensembl Regulatory Build')
    parser.add_argument('organism', help='Source organism [ homo_sapiens or mus_musculus ]', action='store', nargs='?', type=str)
    parser.add_argument('--tissue', help='Tissue- or Celltype(s)', action='store', nargs='*', type=str)
    args = vars(parser.parse_args())
    if args["organism"]:
        main_script(args["organism"], args["tissue"])
    else:
        print("No Arguments found -> See ./RegGTFExtractor.py -h for help.")
nargs='?', type=str) + parser.add_argument('--tissue', help='Tissue- or Celltype(s)', action='store', nargs='*', type=str) + args = vars(parser.parse_args()) + if args["organism"]: + #print(args["tissue"]) + main_script(args["organism"], args["tissue"]) + else: + print("No Arguments found -> See ./RegGTFExtractor.py -h for help.") + diff --git a/config/celltypes_homo_sapiens.json b/config/celltypes_homo_sapiens.json new file mode 100644 index 0000000..0f02cde --- /dev/null +++ b/config/celltypes_homo_sapiens.json @@ -0,0 +1,179 @@ +[ + { + "type": "A549", + "alias_ucsc": [], + "alias_ensembl": ["A549"] + }, + { + "type": "Aorta", + "alias_ucsc": ["branchial arch"], + "alias_ensembl": ["Aorta"] + }, + { + "type": "Thymus", + "alias_ucsc": [], + "alias_ensembl": ["Fetal_Thymus", "Thymus"] + }, + { + "type": "B-Cells", + "alias_ucsc": [], + "alias_ensembl": ["B_cells_PB_Roadmap", "naive_B_cell_VB", "GM12878"] + }, + { + "type": "T-Cell", + "alias_ucsc": [], + "alias_ensembl": ["CD4_ab_T_cell_VB", "CM_CD4_ab_T_cell_VB", "CD8_ab_T_cell_CB", "T_cells_PB_Roadmap"] + }, + { + "type": "Monocyte", + "alias_ucsc": [], + "alias_ensembl": ["CD14CD16__monocyte_CB", "CD14CD16__monocyte_VB", "Monocytes_CD14", "Monocytes_CD14_PB_Roadmap"] + }, + { + "type": "Neutrophil", + "alias_ucsc": [], + "alias_ensembl": ["neutrophil_CB", "neutrophil_myelocyte_BM", "neutrophil_VB"] + }, + { + "type": "Eosinophil", + "alias_ucsc": [], + "alias_ensembl": ["eosinophil_VB"] + }, + { + "type": "Macrophage", + "alias_ucsc": [], + "alias_ensembl": [ + "M0_macrophage_CB", + "M0_macrophage_VB", + "M1_macrophage_CB", + "M1_macrophage_VB", + "M2_macrophage_CB", + "M2_macrophage_VB" + ] + }, + { + "type": "Erythroblast", + "alias_ucsc": [], + "alias_ensembl": ["erythroblast_CB"] + }, + { + "type": "Intestine", + "alias_ucsc": [], + "alias_ensembl": ["Fetal_Intestine_Large", "Fetal_Intestine_Small", "Small_Intestine"] + }, + { + "type": "AdrenalGland", + "alias_ucsc": [], + "alias_ensembl": 
["Fetal_Adrenal_Gland"] + }, + { + "type": "Muscle", + "alias_ucsc": ["limb"], + "alias_ensembl": ["Fetal_Muscle_Leg", "Fetal_Muscle_Trunk", "Psoas_Muscle", "HSMM", "HSMMtube"] + }, + { + "type": "Gastric", + "alias_ucsc": [], + "alias_ensembl": ["Fetal_Stomach", "Gastric"] + }, + { + "type": "Endothelial", + "alias_ucsc": ["blood vessels"], + "alias_ensembl": ["EPC_VB", "HMEC", "HUVEC", "HUVEC_prol_CB", "NHEK"] + }, + { + "type": "StemCells", + "alias_ucsc": [], + "alias_ensembl": ["H1ESC", "H1_mesenchymal", "H1_neuronal_progenitor", "H1_trophoblast", "H9", "MSC_VB", "iPS_20b", "iPS_DF_6_9", "iPS_DF_19_11"] + }, + { + "type": "Lung", + "alias_ucsc": [], + "alias_ensembl": ["Lung", "IMR90", "NHLF"] + }, + { + "type": "Pancreas", + "alias_ucsc": ["pancreas"], + "alias_ensembl": ["Pancreas"] + }, + { + "type": "Liver", + "alias_ucsc": ["liver"], + "alias_ensembl": [] + }, + { + "type": "Ovary", + "alias_ucsc": [], + "alias_ensembl": ["Ovary"] + }, + { + "type": "Placenta", + "alias_ucsc": [], + "alias_ensembl": ["Placenta"] + }, + { + "type": "Spleen", + "alias_ucsc": [], + "alias_ensembl": ["Spleen"] + }, + { + "type": "Heart", + "alias_ucsc": ["heart"], + "alias_ensembl": ["Right_Atrium", "Left_Ventricle"] + }, + { + "type": "Osteoblast", + "alias_ucsc": [], + "alias_ensembl": ["Osteobl"] + }, + { + "type": "Fibroblast", + "alias_ucsc": [], + "alias_ensembl": ["NHDF_AD"] + }, + { + "type": "NK-Cells", + "alias_ucsc": [], + "alias_ensembl": ["Natural_Killer_cells_PB"] + }, + { + "type": "Cancers", + "alias_ucsc": [], + "alias_ensembl": ["HeLa_S3", "HepG2", "DND_41", "K562"] + }, + { + "type": "Brain", + "alias_ucsc": ["midbrain (mesencephalon)", "trigeminal V (ganglion, cranial)", "forebrain", "neural tube", "hindbrain (rhombencephalon)", "dorsal root ganglion", "cranial nerve"], + "alias_ensembl": ["NH_A"] + }, + { + "type": "Mesenchym", + "alias_ucsc": ["mesenchyme derived from neural crest", "facial mesenchyme"], + "alias_ensembl": [] + }, + { + "type": 
"Embryonal", + "alias_ucsc": ["somite", "genital tubercle"], + "alias_ensembl": [] + }, + { + "type": "Eye", + "alias_ucsc": ["eye"], + "alias_ensembl": [] + }, + { + "type": "Nose", + "alias_ucsc": ["nose"], + "alias_ensembl": [] + }, + { + "type": "Tail", + "alias_ucsc": ["tail"], + "alias_ensembl": [] + }, + { + "type": "Melanocytes", + "alias_ucsc": ["melanocytes"], + "alias_ensembl": [] + } +] diff --git a/config/celltypes_mus_musculus.json b/config/celltypes_mus_musculus.json new file mode 100644 index 0000000..0f02cde --- /dev/null +++ b/config/celltypes_mus_musculus.json @@ -0,0 +1,179 @@ +[ + { + "type": "A549", + "alias_ucsc": [], + "alias_ensembl": ["A549"] + }, + { + "type": "Aorta", + "alias_ucsc": ["branchial arch"], + "alias_ensembl": ["Aorta"] + }, + { + "type": "Thymus", + "alias_ucsc": [], + "alias_ensembl": ["Fetal_Thymus", "Thymus"] + }, + { + "type": "B-Cells", + "alias_ucsc": [], + "alias_ensembl": ["B_cells_PB_Roadmap", "naive_B_cell_VB", "GM12878"] + }, + { + "type": "T-Cell", + "alias_ucsc": [], + "alias_ensembl": ["CD4_ab_T_cell_VB", "CM_CD4_ab_T_cell_VB", "CD8_ab_T_cell_CB", "T_cells_PB_Roadmap"] + }, + { + "type": "Monocyte", + "alias_ucsc": [], + "alias_ensembl": ["CD14CD16__monocyte_CB", "CD14CD16__monocyte_VB", "Monocytes_CD14", "Monocytes_CD14_PB_Roadmap"] + }, + { + "type": "Neutrophil", + "alias_ucsc": [], + "alias_ensembl": ["neutrophil_CB", "neutrophil_myelocyte_BM", "neutrophil_VB"] + }, + { + "type": "Eosinophil", + "alias_ucsc": [], + "alias_ensembl": ["eosinophil_VB"] + }, + { + "type": "Macrophage", + "alias_ucsc": [], + "alias_ensembl": [ + "M0_macrophage_CB", + "M0_macrophage_VB", + "M1_macrophage_CB", + "M1_macrophage_VB", + "M2_macrophage_CB", + "M2_macrophage_VB" + ] + }, + { + "type": "Erythroblast", + "alias_ucsc": [], + "alias_ensembl": ["erythroblast_CB"] + }, + { + "type": "Intestine", + "alias_ucsc": [], + "alias_ensembl": ["Fetal_Intestine_Large", "Fetal_Intestine_Small", "Small_Intestine"] + }, + { + "type": 
"AdrenalGland", + "alias_ucsc": [], + "alias_ensembl": ["Fetal_Adrenal_Gland"] + }, + { + "type": "Muscle", + "alias_ucsc": ["limb"], + "alias_ensembl": ["Fetal_Muscle_Leg", "Fetal_Muscle_Trunk", "Psoas_Muscle", "HSMM", "HSMMtube"] + }, + { + "type": "Gastric", + "alias_ucsc": [], + "alias_ensembl": ["Fetal_Stomach", "Gastric"] + }, + { + "type": "Endothelial", + "alias_ucsc": ["blood vessels"], + "alias_ensembl": ["EPC_VB", "HMEC", "HUVEC", "HUVEC_prol_CB", "NHEK"] + }, + { + "type": "StemCells", + "alias_ucsc": [], + "alias_ensembl": ["H1ESC", "H1_mesenchymal", "H1_neuronal_progenitor", "H1_trophoblast", "H9", "MSC_VB", "iPS_20b", "iPS_DF_6_9", "iPS_DF_19_11"] + }, + { + "type": "Lung", + "alias_ucsc": [], + "alias_ensembl": ["Lung", "IMR90", "NHLF"] + }, + { + "type": "Pancreas", + "alias_ucsc": ["pancreas"], + "alias_ensembl": ["Pancreas"] + }, + { + "type": "Liver", + "alias_ucsc": ["liver"], + "alias_ensembl": [] + }, + { + "type": "Ovary", + "alias_ucsc": [], + "alias_ensembl": ["Ovary"] + }, + { + "type": "Placenta", + "alias_ucsc": [], + "alias_ensembl": ["Placenta"] + }, + { + "type": "Spleen", + "alias_ucsc": [], + "alias_ensembl": ["Spleen"] + }, + { + "type": "Heart", + "alias_ucsc": ["heart"], + "alias_ensembl": ["Right_Atrium", "Left_Ventricle"] + }, + { + "type": "Osteoblast", + "alias_ucsc": [], + "alias_ensembl": ["Osteobl"] + }, + { + "type": "Fibroblast", + "alias_ucsc": [], + "alias_ensembl": ["NHDF_AD"] + }, + { + "type": "NK-Cells", + "alias_ucsc": [], + "alias_ensembl": ["Natural_Killer_cells_PB"] + }, + { + "type": "Cancers", + "alias_ucsc": [], + "alias_ensembl": ["HeLa_S3", "HepG2", "DND_41", "K562"] + }, + { + "type": "Brain", + "alias_ucsc": ["midbrain (mesencephalon)", "trigeminal V (ganglion, cranial)", "forebrain", "neural tube", "hindbrain (rhombencephalon)", "dorsal root ganglion", "cranial nerve"], + "alias_ensembl": ["NH_A"] + }, + { + "type": "Mesenchym", + "alias_ucsc": ["mesenchyme derived from neural crest", "facial 
mesenchyme"], + "alias_ensembl": [] + }, + { + "type": "Embryonal", + "alias_ucsc": ["somite", "genital tubercle"], + "alias_ensembl": [] + }, + { + "type": "Eye", + "alias_ucsc": ["eye"], + "alias_ensembl": [] + }, + { + "type": "Nose", + "alias_ucsc": ["nose"], + "alias_ensembl": [] + }, + { + "type": "Tail", + "alias_ucsc": ["tail"], + "alias_ensembl": [] + }, + { + "type": "Melanocytes", + "alias_ucsc": ["melanocytes"], + "alias_ensembl": [] + } +]