diff --git a/README.md b/README.md index 88c6a01..91dbc8f 100644 --- a/README.md +++ b/README.md @@ -1 +1,3 @@ -# IsoTV \ No newline at end of file +# IsoTV + +Detailed documentation avaliable at https://isotv.readthedocs.io/. diff --git a/Snakefile b/Snakefile new file mode 100644 index 0000000..9d7eeb1 --- /dev/null +++ b/Snakefile @@ -0,0 +1,707 @@ +# Siddharth Annaldasula +# required packages: Bio (SeqIO), os, pandas, numpy, subprocess, string, argparse, seaborn, matplotlib + +import os +import pandas as pd +import shutil +from matplotlib.backends.backend_pdf import PdfPages + +##### Logistics +configfile: "config_goeke.yaml" +workdir: config["outdir"] + +ReferenceFasta = config["genome_fasta"] +GenomeGFF = config["genome_annot"] +SAMPLES = config["samples"] + + +### All +rule all: + input: + config["output_plots"] + +# Package versions +rule dump_versions: + output: + ver = "versions.txt" + conda: + "environment.yaml" + shell: + "conda list > {output.ver}" + +###### Raw ONT Reads Processing + +# Basecalling +rule Basecalling: + input: + "RawData/{sample}/{id}.fast5" + output: + "RawData/QC/{sample}/{id}.txt", + "RawData/Fast5/{sample}/{id}/{id}.fast5", + fastq = "RawData/Basecalled/{sample}/{id}.fastq" + resources: + memory = 10, + time = 5, + tmpdir = 4 + params: + guppy = config["guppy"], + flowcell = config["flowcell"], + kit = config["kit"], + work = config["outdir"] + priority: 10 + group: "basecalling" + threads: 8 + shell: """ + mkdir -p {params.work}/RawData/Fast5/{wildcards.sample}/{wildcards.id} + + ln -sf {params.work}/{input} {params.work}/RawData/Fast5/{wildcards.sample}/{wildcards.id}/{wildcards.id}.fast5 + + {params.guppy} -i {params.work}/RawData/Fast5/{wildcards.sample}/{wildcards.id}/ -s {params.work}/RawData/Basecalled/{wildcards.sample}/{wildcards.id}/ \ + --flowcell {params.flowcell} --kit {params.kit} --num_callers 1 --cpu_threads_per_caller {threads} --fast5_out + + mv -f {params.work}/RawData/Basecalled/{wildcards.sample}/{wildcards.id}/workspace/{wildcards.id}.fast5 {params.work}/RawData/Fast5/{wildcards.sample}/{wildcards.id}/{wildcards.id}.fast5 + + mv {params.work}/RawData/Basecalled/{wildcards.sample}/{wildcards.id}/sequencing_summary.txt {params.work}/RawData/QC/{wildcards.sample}/{wildcards.id}.txt + + find {params.work}/RawData/Basecalled/{wildcards.sample}/{wildcards.id} -name '*.fastq' -exec mv {{}} {params.work}/RawData/Basecalled/{wildcards.sample}/{wildcards.id}.fastq \;""" + +rule aggregate_fast5: + input: + f5 = lambda wildcards: expand("RawData/Basecalled/{{sample}}/{id}.fastq", id = glob_wildcards("RawData/"+wildcards.sample+"/{id}.fast5")[0]), + output: + fq = "RawData/Fastq/{sample}"+".fastq" + resources: + memory = 50, + time = 4, + tmpdir = 4 + priority: 9 + run: + with open(output.fq,'w') as fout: + for fn in input.f5: + with open(fn,'r') as fin: + shutil.copyfileobj(fin, fout) + + #f5 = lambda wildcards: expand("RawData/Basecalled/{sample}/{id}.fastq"), + +# Build minimap2 index to reference genome +rule Minimap2Index: + input: + genome = ReferenceFasta + output: + index = "Results/Minimap2/"+ReferenceFasta.split("/")[-1]+".mmi" + resources: + memory = 32, + time = 1 + params: + opts = config["minimap2_index_opts"], + threads: + config["threads"] + shell: + "minimap2 -t {threads} {params.opts} -I 1000G -d {output.index} {input.genome}" + +# Filter basecalled raw ONT reads +rule FilterReads: + input: + lambda wildcards: "RawData/Fastq/"+SAMPLES[wildcards.sample]+".fastq" + output: + "FilteredData/{sample}.fastq", + resources: + memory = 32, + time = 1 + params: + config["min_mean_q"] + shell: + "filtlong --min_mean_q {params} {input} > {output}" + +# Pychopper +rule Pychopper: + input: + "FilteredData/{sample}.fastq", + output: + pdf = "Results/Pychopper/{sample}.pychopper_report.pdf", + fastq = "Results/Pychopper/{sample}.pychop.fastq", + stats = "Results/Pychopper/{sample}.pychop.stats", + scores = "Results/Pychopper/{sample}.pychop.scores", + unclass = "Results/Pychopper/{sample}.unclassified.fastq" + resources: + memory=32, + time = 4 + params: + opts = config["porechop_heu_stringency"] + run: + shell("cdna_classifier.py -b ReferenceData/cdna_barcodes.fas -r {output.pdf} -S {output.stats} -A {output.scores} -u {output.unclass} {input} {output.fastq}") + +# Build known splice junction bed file +rule SpliceJunctionIndex: + input: + GenomeGFF + output: + junc_bed = "ReferenceData/junctions.bed" + shell: + "paftools.js gff2bed {input} > {output.junc_bed}" + +# Map reads using minimap2 +rule Minimap2Pinfish: + input: + index = rules.Minimap2Index.output.index, + fastq = expand("Results/Pychopper/{sample}.pychop.fastq", sample=SAMPLES) if (config["pychopper"]==True) else expand("FilteredData/{sample}.fastq", sample=SAMPLES), + use_junc = rules.SpliceJunctionIndex.output.junc_bed if config["minimap2_opts_junction"] else "" + output: + bam = "Results/Minimap2/merged.mapping.bam" + resources: + memory = 32, + time = 4 + params: + opts = config["minimap2_opts"], + min_mq = config["minimum_mapping_quality"], + threads: config["threads"] + shell:""" + minimap2 -t {threads} -ax splice {params.opts} --junc-bed {input.use_junc} {input.index} {input.fastq}\ + | samtools view -q {params.min_mq} -F 2304 -Sb | samtools sort -@ {threads} - -o {output.bam}; + samtools index {output.bam} + """ + +# Convert BAM to GFF +rule PinfishRawBAM2GFF: + input: + bam = rules.Minimap2Pinfish.output.bam + output: + raw_gff = "Results/Pinfish/raw_transcripts.gff" + resources: + memory = 8, + time = 1 + params: + opts = config["spliced_bam2gff_opts"] + threads: config["threads"] + shell: + "spliced_bam2gff {params.opts} -t {threads} -M {input.bam} > {output.raw_gff}" + +# Cluster transcripts in GFF +rule PinfishClusterGFF: + input: + raw_gff = rules.PinfishRawBAM2GFF.output.raw_gff + output: + cls_gff = "Results/Pinfish/clustered_pol_transcripts.gff", + cls_tab = "Results/Pinfish/cluster_memberships.tsv", + resources: + memory = 8, + time = 1 + params: + c = config["minimum_cluster_size"], + d = config["exon_boundary_tolerance"], + e = config["terminal_exon_boundary_tolerance"], + min_iso_frac = config["minimum_isoform_percent"], + threads: config["threads"] + shell: + "cluster_gff -p {params.min_iso_frac} -t {threads} -c {params.c} -d {params.d} -e {params.e} -a {output.cls_tab} {input.raw_gff} > {output.cls_gff}" + +# Collapse clustered read artifacts +rule PinfishCollapseRawPartials: + input: + cls_gff = rules.PinfishClusterGFF.output.cls_gff + output: + cls_gff_col = "Results/Pinfish/clustered_transcripts_collapsed.gff" + resources: + memory = 8, + time = 1 + params: + d = config["collapse_internal_tol"], + e = config["collapse_three_tol"], + f = config["collapse_five_tol"], + shell: + "collapse_partials -d {params.d} -e {params.e} -f {params.f} {input.cls_gff} > {output.cls_gff_col}" + +# Polish read clusters +rule PinfishPolishClusters: + input: + cls_gff = rules.PinfishClusterGFF.output.cls_gff, + cls_tab = rules.PinfishClusterGFF.output.cls_tab, + bam = rules.Minimap2Pinfish.output.bam + output: + pol_trs = "Results/Pinfish/polished_transcripts.fas" + conda: + "/project/owlmayerTemporary/Sid/nanopore-analysis/environment.yaml" + resources: + memory = 8, + time = 1 + params: + c = config["minimum_cluster_size"] + threads: config["threads"] + shell: + "polish_clusters -t {threads} -a {input.cls_tab} -c {params.c} -o {output.pol_trs} {input.bam}" + +# Map polished transcripts to genome +rule MinimapPolishedClusters: + input: + index = rules.Minimap2Index.output.index, + fasta = rules.PinfishPolishClusters.output.pol_trs, + output: + pol_bam = "Results/Minimap2/polished_reads_aln_sorted.bam" + resources: + memory = 8, + time = 1 + params: + extra = config["minimap2_opts_polished"], + threads: config["threads"] + shell:""" + minimap2 -t {threads} {params.extra} -ax splice {input.index} {input.fasta}\ + | samtools view -Sb -F 2304 | samtools sort -@ {threads} - -o {output.pol_bam}; + samtools index {output.pol_bam} + """ + +# Convert BAM of polished transcripts to GFF +rule PinfishPolishedBAM2GFF: + input: + bam = rules.MinimapPolishedClusters.output.pol_bam + output: + pol_gff = "Results/Pinfish/polished_transcripts.gff" + resources: + memory = 8, + time = 1 + params: + extra = config["spliced_bam2gff_opts_pol"] + threads: config["threads"] + shell: + "spliced_bam2gff {params.extra} -t {threads} -M {input.bam} > {output.pol_gff}" + +# Collapse polished read artifacts +rule PinfishCollapsePolishedPartials: + input: + pol_gff = rules.PinfishPolishedBAM2GFF.output.pol_gff + output: + pol_gff_col = "Results/Pinfish/polished_transcripts_collapsed.gff" + resources: + memory = 8, + time = 1 + params: + d = config["collapse_internal_tol"], + e = config["collapse_three_tol"], + f = config["collapse_five_tol"], + shell: + "collapse_partials -d {params.d} -e {params.e} -f {params.f} {input.pol_gff} > {output.pol_gff_col}" + +rule GffCompare: + input: + reference = GenomeGFF, + exptgff = rules.PinfishCollapsePolishedPartials.output.pol_gff_col + output: + nanopore_gtf = "Results/GffCompare/nanopore.combined.gtf" + resources: + memory = 16, + time = 1 + shell: + "gffcompare -r {input.reference} -R -A -K -o " + "Results/GffCompare/nanopore {input.exptgff}" + +# Generate corrected transcriptome +rule PrepareCorrectedTranscriptomeFasta: + input: + genome = ReferenceFasta, + gff = rules.PinfishCollapsePolishedPartials.output.pol_gff_col, + output: + fasta = "Results/Pinfish/corrected_transcriptome_polished_collapsed.fas", + resources: + memory = 8, + time = 1 + shell: + "gffread -g {input.genome} -w {output.fasta} {input.gff}" + +rule PrepareCorrectedTranscriptomeFastaNonred: + input: + rules.PrepareCorrectedTranscriptomeFasta.output.fasta, + rules.GffCompare.output.nanopore_gtf + output: + fasta = "Results/Pinfish/corrected_transcriptome_polished_collapsed_nonred.fas", + resources: + memory = 8, + time = 1 + script: + "scripts/nonredundant.py" + +rule reads_to_transcripts: + input: + rules.PrepareCorrectedTranscriptomeFastaNonred.output.fasta, + rules.GffCompare.output.nanopore_gtf + output: + "Results/Pinfish/corrected_transcriptome_polished_collapsed_nonredundant_tcons.fas" + resources: + memory = 10, + time = 1, + tmpdir = 0 + priority: 8 + threads: 1 + group: "preprocess_sequences" + script: + "scripts/reads_to_transcripts.py" + +# Build minimap2 index for transcriptome +rule Transcriptome2Index: + input: + genome = rules.PrepareCorrectedTranscriptomeFastaNonred.output.fasta, + output: + index = "Results/Minimap2/Transcriptome.mmi" + threads: config["threads"] + resources: + memory = 16, + time = 2 + shell: + "minimap2 -t {threads} -I 1000G -d {output.index} {input.genome}" + +# Map reads using minimap2 +rule Minimap2Genome: + input: + index = rules.Minimap2Index.output.index, + fastq = "FilteredData/{sample}.fastq", + use_junc = rules.SpliceJunctionIndex.output.junc_bed if config["minimap2_opts_junction"] else "" + output: + bam = "IGV/{sample}.genome.bam" + resources: + memory = 32, + time = 4 + params: + opts = config["minimap2_opts"] + threads: config["threads"] + shell:""" + minimap2 -t {threads} -ax splice {params.opts} --junc-bed {input.use_junc} {input.index} {input.fastq}\ + | samtools view -F 260 -Sb | samtools sort -@ {threads} - -o {output.bam}; + samtools index {output.bam} + """ + +# Map to transcriptome +rule Map2Transcriptome: + input: + index = rules.Transcriptome2Index.output.index, + fastq = "FilteredData/{sample}.fastq" + output: + bam = "Results/Quantification/{sample}.bam", + sbam = "Results/Quantification/{sample}.sorted.bam" + threads: config["threads"] + resources: + memory = 32, + time = 4 + params: + msec = config["maximum_secondary"], + psec = config["secondary_score_ratio"], + priority: 10 + shell:""" + minimap2 -ax map-ont -t {threads} -p {params.psec} -N {params.msec} {input.index} {input.fastq} | samtools view -Sb > {output.bam}; + samtools sort -@ {threads} {output.bam} -o {output.sbam}; + samtools index {output.sbam}; + """ + +rule ExtractPrimaryMapping: + input: + transcriptome = rules.Map2Transcriptome.output.sbam, + genome = rules.Minimap2Genome.output.bam + resources: + memory = 4, + time = 1 + output: + "IGV/{sample}.transcriptome.bam" + shell:""" + samtools view -b -F 260 {input.transcriptome} > {output}; + samtools index {output} + """ + +# Quantify transcripts +rule CountTranscripts: + input: + bam = rules.ExtractPrimaryMapping.output + resources: + memory = 4, + time = 1 + output: + quant = "Results/Quantification/{sample}.counts" + shell:""" + echo -e "counts\ttranscript" > {output.quant}; + samtools view {input.bam} | cut -f3 | uniq -c | grep -v "*" | sed -e 's/^[ \t]*//' | sed 's/ /\t/' >> {output.quant} + """ + +# Concatenate transcript counts +rule ConcatenateCounts: + input: + annotation = GenomeGFF, + transcriptome = rules.GffCompare.output.nanopore_gtf, + counts = expand("Results/Quantification/{sample}.counts", sample=SAMPLES), + resources: + memory = 4, + time = 1 + output: + counts = "Results/Quantification/counts.txt" + script: + "scripts/concatenate.py" + +# Normalize isoform levels +rule CalculateDeSeq2Norm: + input: + counts = rules.ConcatenateCounts.output + resources: + memory = 4, + time = 1 + output: + norm_counts = "Results/Quantification/counts_deseq2norm.txt" + script: + "scripts/deseq2norm.R" + +###### Isoform Transcript Processing + +# Use correct file paths +CountsData = rules.CalculateDeSeq2Norm.output if config["preprocess"] else config["counts_data"] +Transcripts = rules.reads_to_transcripts.output if config["preprocess"] else config["polished_reads"] +NanoporeGTF = rules.GffCompare.output.nanopore_gtf if config["preprocess"] else config["nanopore_gtf"] + +# Filter genes that are not found in the analysis, or if their analysis has already been done +checkpoint gene_filter: + input: + NanoporeGTF if config["annotation"] else Transcripts + output: + directory("Results/Genes_temp") + params: + genes_final_dir = "Results/Genes" + script: + "scripts/filter_genes.py" + +# Extract transcripts for each gene +rule gene_to_transcripts: + input: + NanoporeGTF, + Transcripts, + "Results/Genes_temp/{gene}" + output: + "Results/Genes_temp/{gene}/{gene}_seq.fa" + params: + gene = "{gene}" + resources: + memory = 10, + time = 1, + tmpdir = 0 + priority: 8 + threads: 1 + group: "preprocess_sequences" + script: + "scripts/gene_to_transcripts.py" + +# Transcript isoform translation +checkpoint translation_to_protein: + input: + seq = rules.gene_to_transcripts.output + output: + transcripts = directory("Results/Genes_temp/{gene}/transcripts"), + stats = "Results/Genes_temp/{gene}/{gene}_transcripts_stats.txt" + params: + gene = "{gene}" + resources: + memory = 10, + time = 1, + tmpdir = 0 + priority: 10 + threads: 1 + #group: "preprocess_sequences" + script: + "scripts/translation_to_protein.py" + +###### Translated Isoform Feature Analysis + +# Disorder region analysis +rule iupred2a_analysis: + input: + protein = "Results/Genes_temp/{gene}/transcripts/{transcript}_protein.fa", + iupred2a = config["iupred2a_path"] + output: + seq = "Results/Genes_temp/{gene}/transcripts/{transcript}_sequence.txt", + idr = "Results/Genes_temp/{gene}/transcripts/{transcript}_func_iupred2a.txt" + resources: + memory = 10, + time = 1, + tmpdir = 0 + priority: 8 + threads: 1 + group: "sequence_analysis" + shell: + "echo {input.protein} >> {output.seq}; {input.iupred2a} -a {output.seq} long >> {output.idr}" + +# Domain analysis +rule interpro_scan: + input: + translation = "Results/Genes_temp/{gene}/transcripts/{transcript}_protein.fa" + output: + "Results/Genes_temp/{gene}/transcripts/{transcript}_func_domains.gff3" + params: + db = "Pfam", + java = config["java"], + pfam = config["interproScan_path"], + resources: + memory = 16, + time = 1, + tmpdir = 0 + priority: 8 + threads: 1 + group: "sequence_analysis" + shell: + "source {params.java} ; sh {params.pfam} -i {input.translation} -o {output} -f GFF3 -appl {params.db} -dra" + +# Secondary structure analysis +rule porter_analysis: + input: + "Results/Genes_temp/{gene}/transcripts/{transcript}_protein.fa" + output: + "Results/Genes_temp/{gene}/transcripts/{transcript}_protein.fa.ss3" + params: + brewery = config["brewery_path"], + lock1 = config["lock1"], + lock2 = config["lock2"], + resources: + memory = 72, + time = 1, + tmpdir = 16 + #memory = 16, + #time = 1, + #tmpdir = 0 + priority: 10 + threads: 16 + #group: "sequence_analysis" + shell: + "python3 {params.brewery} -i {input} --cpu 16 --noTA --noSA --noCD --fast" +# 'set -ve; export ORIGDIR="$(/bin/pwd)"; flock {params.lock1} bash -ve -c "scp {input} $MXQ_JOB_TMPDIR/"; cd $MXQ_JOB_TMPDIR/; python3 {params.brewery} -i *.fa --cpu 16 --noTA --noSA --noCD --fast; flock {params.lock2} bash -ve -c "cp *.fa.ss3 $ORIGDIR/{output}"' +# "echo {params.brewery} > {output}" + +# PTM analysis +rule functional_site_analysis: + input: + "Results/Genes_temp/{gene}/transcripts/{transcript}_protein.fa" + output: + "Results/Genes_temp/{gene}/transcripts/{transcript}_func_sites.txt" + params: + ps_scan = config["prositeScan_path"], + prosite_dat = config["prositeDat_path"], + pf_scan = config["pfScan_path"], + resources: + memory = 10, + time = 1, + tmpdir = 0 + priority: 8 + threads: 1 + group: "sequence_analysis" + shell: + "perl {params.ps_scan} -d {params.prosite_dat} --pfscan {params.pf_scan} {input} > {output}" + +# Prion analysis +rule prion_analysis: + input: + "Results/Genes_temp/{gene}/transcripts/{transcript}_protein.fa" + output: + "Results/Genes_temp/{gene}/transcripts/{transcript}_func_prion.txt" + params: + plaac = config["plaac_path"], + gene = "{gene}" + resources: + memory = 10, + time = 1, + tmpdir = 0 + priority: 8 + threads: 1 + group: "sequence_analysis" + shell: + "java -jar {params.plaac} -i {input} -p all > {output}" + +# Combine functional feature analysis on individual isoforms +def functional_files(): + files = [] + if (config["aa"]): files.append("Results/Genes_temp/{gene}/transcripts/{transcript}_protein.fa") + if (config["iupred2a"]): files.append(rules.iupred2a_analysis.output.idr) + if (config["pfam"]): files.append(rules.interpro_scan.output) + if (config["porter"]): files.append(rules.porter_analysis.output) + if (config["pfScan"]): files.append(rules.functional_site_analysis.output) + if (config["prion"]): files.append(rules.prion_analysis.output) + return files +rule individual_transcript_analysis: + input: + functional_files() + output: + "Results/Genes_temp/{gene}/transcripts/{transcript}_analysis.txt" + params: + gene = "{gene}" + resources: + memory = 10, + time = 1, + tmpdir = 0 + priority: 5 + threads: 1 + script: + "scripts/individual_transcript_analysis.py" + +def aggregate_transcript_analysis_func(wildcards): + checkpoint_output_gene = checkpoints.gene_filter.get(**wildcards).output[0] + checkpoint_output = checkpoints.translation_to_protein.get(**wildcards).output[0] + return expand("Results/Genes_temp/{gene}/transcripts/{transcript}_analysis.txt", + gene = wildcards.gene, + transcript=glob_wildcards(os.path.join(checkpoint_output,"{transcript}_protein.fa")).transcript) + +rule aggregate_transcript_analysis: + input: + aggregate_transcript_analysis_func + output: + "Results/Genes_temp/{gene}/{gene}_transcripts_filtered_analysis.txt" + params: + gene = "{gene}" + resources: + memory = 4, + time = 1, + tmpdir = 0 + priority: 5 + threads: 1 + shell: + "cat {input} > {output}" + +rule protein_coding_potential_analysis: + input: + rules.aggregate_transcript_analysis.output, + rules.translation_to_protein.output.stats, + CountsData, + NanoporeGTF, + output: + "Results/Genes_temp/{gene}/{gene}_functional_analysis.pdf" + params: + gene = "{gene}" + resources: + memory = 16, + time = 1, + tmpdir = 0 + priority: 8 + threads: 1 + script: + "scripts/visualization.py" + +rule copy_genes_temp: + input: + gene_plot = "Results/Genes_temp/{gene}/{gene}_functional_analysis.pdf" + output: + gene_folder = directory("Results/Genes/{gene}"), + plot = "Results/Plots/{gene}_functional_analysis.pdf" + resources: + memory = 16, + time = 1, + tmpdir = 0 + priority: 8 + threads: 1 + shell: + "mkdir {output.gene_folder}; cp {input.gene_plot} {output.plot}; cp -r {input} {output.gene_folder};" + +def aggregate_protein_coding_potential_analysis(wildcards): + checkpoint_output = checkpoints.gene_filter.get(**wildcards).output[0] + return expand("Results/Plots/{gene}_functional_analysis.pdf", + gene=glob_wildcards(os.path.join(checkpoint_output,"{gene}.txt")).gene) + +rule output_combine_files: + input: + aggregate = aggregate_protein_coding_potential_analysis, + gene_temp = directory("Results/Genes_temp"), + output: + plot = "Results/Output/" + config["output_plots"] + resources: + memory = 4, + time = 1, + tmpdir = 0 + priority: 5 + threads: 1 + shell: + "rm -rf {input.gene_temp}; pdfunite {input.aggregate} {output.plot};" diff --git a/config.yaml b/config.yaml new file mode 100644 index 0000000..cf1f3bd --- /dev/null +++ b/config.yaml @@ -0,0 +1,155 @@ +##### IsoTV Config File +pipeline: "IsoTV" +repo: "https://github.molgen.mpg.de/MayerGroup/IsoTV" + +### Ouput directory + +outdir: "/path/to/output" + +### General pipeline parameters: + +basecalling: FALSE +preprocess: TRUE +annotation: TRUE +quantification: TRUE + +###### ONT long read processing config +### Basecalling pipeline parameters + +guppy: "/path/to/Guppy324/bin/guppy_basecaller" +flowcell: FLO-MIN106 +kit: SQK-DCS109 + +### Reference Files +genome_fasta: "/path/to/GRCh38.p12.primary_assembly.genome.fa" +genome_annot: "/path/to/gencode.v32.primary_assembly.annotation.gtf" + +### Samples +# samples with .bottom extension must be placed in the RawData folder +# condition_replicate +samples: + A549_1: "A549_r1_r3" + A549_2: "A549_r2_r1" + #A549_3: "A549_r3_r2" + A549_5: "A549_r5_r3" + HCT116_1: "HCT116_r1_r4" + HCT116_3: "HCT116_r3_r2" + HCT116_4: "HCT116_r4_r1" + HCT116_5: "HCT116_r5_r1" + HEPG2_1: "HEPG2_r1_r1" + HEPG2_4: "HEPG2_r4_r2" + HEPG2_5: "HEPG2_r5_r3" + K562_1: "K562_r1_r2" + K562_2: "K562_r2_r1" + K562_3: "K562_r3_r1" + K562_4: "K562_r4_r2" + MCF7_1: "MCF7_r1_r2" + MCF7_3: "MCF7_r3_r3" + MCF7_4: "MCF7_r4_r2" + +threads: 16 + +# Use pychopper results +pychopper: TRUE + +# Use annotation to improve splice junction mapping (minimap2 --junc_bed parameter) +minimap2_opts_junction: TRUE + +# Minimum read quality to keep: +min_mean_q: 5 + +# Stringency of porechop heuristic: +porechop_heu_stringency: 0.25 + +# Options passed to minimap2 during indexing: +minimap2_index_opts: "-k14" + +# Extra options passed to minimap2: +minimap2_opts: "-uf" # required for stranded data e.g. when pychopper filtered + +# Minmum mapping quality: +minimum_mapping_quality: 5 + +# Options passed to spliced_bam2gff: +spliced_bam2gff_opts: "-s" # required for stranded data e.g. when pychopper filtered + +# -c parameter: +minimum_cluster_size: 3 + +# -p parameter: +minimum_isoform_percent: 1 + +# -d parameter: +exon_boundary_tolerance: 10 + +# -e parameter: +terminal_exon_boundary_tolerance: 50 + +# Extra options passed to minimap2 when mapping polished reads: +minimap2_opts_polished: "-uf" # required for stranded data e.g. when pychopper filtered + +# Options passed to spliced_bam2gff when converting alignments of polished reads: +spliced_bam2gff_opts_pol: "-s" # required for stranded data e.g. when pychopper filtered + +# Options passed to collapse_partials when collapsing fragmentation artifacts +# Internal exon boundary tolerance: +collapse_internal_tol: 5 + +# Five prime boundary tolerance: +collapse_five_tol: 500 + +# Three prime boundary tolerance: +collapse_three_tol: 50 + +maximum_secondary: 200 +secondary_score_ratio: 1 + +##### Feature Analysis Config +### Input genes + +gene_file: "/path/to/genes.tab" + +### Output file and folder + +output_plots: "loveisgone.pdf" + +### Processed file paths - required if not using ONT long read processing workflow + +nanopore_gtf: "/path/to/Results/GffCompare/nanopore.combined_filt.gtf" +polished_reads: "/path/to/Results/Pinfish/corrected_transcriptome_polished_collapsed_tcons_nonredundant.fas" +counts_data: "/path/to/Results/Quantification_nonred/all_counts_deseq2norm_all.txt" + +# Is data continuous +continuous: FALSE + +### Tools paths and functional analysis + +aa: TRUE + +iupred2a_path: "/path/to/iupred2a/iupred2a.py" +iupred2a: TRUE + +brewery_path: "/path/to/Brewery/Brewery.py" +porter: TRUE + +interproScan_path: "/path/to/my_interproscan/interproscan-5.38-76.0/interproscan.sh" +pfam: TRUE + +prositeScan_path: "/path/to/ps_scan/ps_scan.pl" +pfScan_path: "/path/to/ps_scan/pfscan" +prositeDat_path: "/path/to/prosite.dat" +pfScan: TRUE + +#plaac_path: "/path/to/plaac.jar" +#prion: FALSE + +minIsoTPM: 1 +maxIsoNum: 8 +minIsoPct: 10 + +### Misc paths + +java : "/pkg/openjdk-11.0.3.2-0/profile" + +lock1: "/.../lock.txt" +lock2: "/.../lock2.txt" diff --git a/environment.yaml b/environment.yaml new file mode 100644 index 0000000..149c2e7 --- /dev/null +++ b/environment.yaml @@ -0,0 +1,205 @@ +name: isotv_env +channels: + - sagrudd + - bioconda + - defaults + - conda-forge +dependencies: + - _libgcc_mutex=0.1=main + - _r-mutex=1.0.0=anacondar_1 + - binutils_impl_linux-64=2.33.1=he6710b0_7 + - binutils_linux-64=2.33.1=h9595d00_15 + - bioconductor-annotate=1.64.0=r36_0 + - bioconductor-annotationdbi=1.48.0=r36_0 + - bioconductor-biobase=2.46.0=r36h516909a_0 + - bioconductor-biocgenerics=0.32.0=r36_0 + - bioconductor-biocparallel=1.20.0=r36he1b5a44_0 + - bioconductor-delayedarray=0.12.0=r36h516909a_0 + - bioconductor-deseq2=1.26.0=r36he1b5a44_0 + - bioconductor-genefilter=1.68.0=r36hc99cbb1_0 + - bioconductor-geneplotter=1.64.0=r36_0 + - bioconductor-genomeinfodb=1.22.0=r36_0 + - bioconductor-genomeinfodbdata=1.2.2=r36_0 + - bioconductor-genomicranges=1.38.0=r36h516909a_0 + - bioconductor-iranges=2.20.0=r36h516909a_0 + - bioconductor-s4vectors=0.24.0=r36h516909a_0 + - bioconductor-summarizedexperiment=1.16.0=r36_0 + - bioconductor-xvector=0.26.0=r36h516909a_0 + - bioconductor-zlibbioc=1.32.0=r36h516909a_0 + - blas=1.0=mkl + - bwidget=1.9.11=1 + - bzip2=1.0.8=h7b6447c_0 + - ca-certificates=2020.10.14=0 + - cairo=1.14.12=h8948797_3 + - certifi=2020.6.20=py36_0 + - curl=7.69.1=hbc83047_0 + - cycler=0.10.0=py36_0 + - dbus=1.13.18=hb2f20db_0 + - expat=2.2.10=he6710b0_2 + - filtlong=0.2.0=he513fc3_3 + - fontconfig=2.13.0=h9420a91_0 + - freetype=2.10.4=h5ab3b9f_0 + - fribidi=1.0.10=h7b6447c_0 + - gcc_impl_linux-64=7.3.0=habb00fd_1 + - gcc_linux-64=7.3.0=h553295d_15 + - gffcompare=0.11.2=hc9558a2_1 + - gfortran_impl_linux-64=7.3.0=hdf63c60_1 + - gfortran_linux-64=7.3.0=h553295d_15 + - glib=2.66.1=h92f7085_0 + - graphite2=1.3.14=h23475e2_0 + - gsl=2.4=h14c3975_4 + - gst-plugins-base=1.14.0=hbbd80ab_1 + - gstreamer=1.14.0=hb31296c_0 + - gxx_impl_linux-64=7.3.0=hdf63c60_1 + - gxx_linux-64=7.3.0=h553295d_15 + - harfbuzz=2.4.0=hca77d97_1 + - hmmer=3.3.1=he1b5a44_0 + - icu=58.2=he6710b0_3 + - intel-openmp=2020.2=254 + - jpeg=9b=h024ee3a_2 + - k8=0.2.5=he513fc3_0 + - kiwisolver=1.2.0=py36hfd86e86_0 + - krb5=1.17.1=h173b8e3_0 + - lcms2=2.11=h396b838_0 + - ld_impl_linux-64=2.33.1=h53a641e_7 + - libcurl=7.69.1=h20c2e04_0 + - libedit=3.1.20191231=h14c3975_1 + - libffi=3.3=he6710b0_2 + - libgcc-ng=9.1.0=hdf63c60_0 + - libgfortran-ng=7.3.0=hdf63c60_0 + - libpng=1.6.37=hbc83047_0 + - libssh2=1.9.0=h1ba5d50_1 + - libstdcxx-ng=9.1.0=hdf63c60_0 + - libtiff=4.1.0=h2733197_1 + - libuuid=1.0.3=h1bed415_2 + - libxcb=1.14=h7b6447c_0 + - libxml2=2.9.10=hb55368b_3 + - lz4-c=1.9.2=heb0550a_3 + - make=4.2.1=h1bed415_1 + - matplotlib=3.3.2=0 + - matplotlib-base=3.3.2=py36h817c723_0 + - minimap2=2.17=hed695b0_3 + - mkl=2020.2=256 + - mkl-service=2.3.0=py36he904b0f_0 + - mkl_fft=1.2.0=py36h23d657b_0 + - mkl_random=1.1.1=py36h0573a6f_0 + - ncurses=6.2=he6710b0_1 + - numpy=1.19.2=py36h54aff64_0 + - numpy-base=1.19.2=py36hfa32c7d_0 + - olefile=0.46=py36_0 + - openssl=1.1.1h=h7b6447c_0 + - pandas=1.1.3=py36he6710b0_0 + - pango=1.45.3=hd140c19_0 + - parasail-python=1.2=py36h8b12597_0 + - pcre=8.44=he6710b0_0 + - pillow=8.0.0=py36h9a89aac_0 + - pinfish=200316=2bc39d4_1 + - pip=20.2.4=py36_0 + - pixman=0.40.0=h7b6447c_0 + - pychopper=2.5.0=py_0 + - pyparsing=2.4.7=py_0 + - pyqt=5.9.2=py36h05f1152_2 + - python=3.6.12=hcff3b4d_2 + - python-dateutil=2.8.1=py_0 + - python-edlib=1.3.8.post1=py36hc9558a2_0 + - pytz=2020.1=py_0 + - qt=5.9.7=h5867ecd_1 + - r-acepack=1.4.1=r36ha65eedd_0 + - r-assertthat=0.2.1=r36h6115d3f_0 + - r-backports=1.1.4=r36h96ca727_0 + - r-base=3.6.1=haffb61f_2 + - r-base64enc=0.1_3=r36h96ca727_4 + - r-bh=1.69.0_1=r36h6115d3f_0 + - r-bit=1.1_14=r36h96ca727_0 + - r-bit64=0.9_7=r36h96ca727_0 + - r-bitops=1.0_6=r36h96ca727_4 + - r-blob=1.1.1=r36h6115d3f_0 + - r-checkmate=1.9.1=r36h96ca727_0 + - r-cli=1.1.0=r36h6115d3f_0 + - r-cluster=2.0.8=r36ha65eedd_0 + - r-colorspace=1.4_1=r36h96ca727_0 + - r-crayon=1.3.4=r36h6115d3f_0 + - r-data.table=1.12.2=r36h96ca727_0 + - r-dbi=1.0.0=r36h6115d3f_0 + - r-dichromat=2.0_0=r36h6115d3f_4 + - r-digest=0.6.18=r36h96ca727_0 + - r-evaluate=0.13=r36h6115d3f_0 + - r-fansi=0.4.0=r36h96ca727_0 + - r-foreign=0.8_71=r36h96ca727_0 + - r-formatr=1.6=r36h6115d3f_0 + - r-formula=1.2_3=r36h6115d3f_0 + - r-futile.logger=1.4.3=r36h6115d3f_1003 + - r-futile.options=1.0.1=r36h6115d3f_0 + - r-ggplot2=3.1.1=r36h6115d3f_0 + - r-glue=1.3.1=r36h96ca727_0 + - r-gridextra=2.3=r36h6115d3f_0 + - r-gtable=0.3.0=r36h6115d3f_0 + - r-highr=0.8=r36h6115d3f_0 + - r-hmisc=4.2_0=r36ha65eedd_0 + - r-htmltable=1.13.1=r36h6115d3f_0 + - r-htmltools=0.3.6=r36h29659fb_0 + - r-htmlwidgets=1.3=r36h6115d3f_0 + - r-jsonlite=1.6=r36h96ca727_0 + - r-knitr=1.22=r36h6115d3f_0 + - r-labeling=0.3=r36h6115d3f_4 + - r-lambda.r=1.2.3=r36h6115d3f_0 + - r-lattice=0.20_38=r36h96ca727_0 + - r-latticeextra=0.6_28=r36h6115d3f_0 + - r-lazyeval=0.2.2=r36h96ca727_0 + - r-locfit=1.5_9.4=r36hcdcec82_1 + - r-magrittr=1.5=r36h6115d3f_4 + - r-markdown=0.9=r36h96ca727_0 + - r-mass=7.3_51.3=r36h96ca727_0 + - r-matrix=1.2_17=r36h96ca727_0 + - r-matrixstats=0.54.0=r36h96ca727_0 + - r-memoise=1.1.0=r36h6115d3f_0 + - r-mgcv=1.8_28=r36h96ca727_0 + - r-mime=0.6=r36h96ca727_0 + - r-munsell=0.5.0=r36h6115d3f_0 + - r-nlme=3.1_139=r36ha65eedd_0 + - r-nnet=7.3_12=r36h96ca727_0 + - r-pillar=1.3.1=r36h6115d3f_0 + - r-pkgconfig=2.0.2=r36h6115d3f_0 + - r-plogr=0.2.0=r36h6115d3f_0 + - r-plyr=1.8.4=r36h29659fb_0 + - r-prettyunits=1.0.2=r36h6115d3f_0 + - r-r6=2.4.0=r36h6115d3f_0 + - r-rcolorbrewer=1.1_2=r36h6115d3f_0 + - r-rcpp=1.0.1=r36h29659fb_0 + - r-rcpparmadillo=0.9.300.2.0=r36h29659fb_0 + - r-rcurl=1.95_4.12=r36h96ca727_0 + - r-reshape2=1.4.3=r36h29659fb_0 + - r-rlang=0.3.4=r36h96ca727_0 + - r-rpart=4.1_15=r36h96ca727_0 + - r-rsqlite=2.1.1=r36h29659fb_0 + - r-rstudioapi=0.10=r36h6115d3f_0 + - r-scales=1.0.0=r36h29659fb_0 + - r-snow=0.4_3=r36h6115d3f_0 + - r-stringi=1.4.3=r36h29659fb_0 + - r-stringr=1.4.0=r36h6115d3f_0 + - r-survival=2.44_1.1=r36h96ca727_0 + - r-tibble=2.1.1=r36h96ca727_0 + - r-utf8=1.1.4=r36h96ca727_0 + - r-viridis=0.5.1=r36h6115d3f_0 + - r-viridislite=0.3.0=r36h6115d3f_0 + - r-withr=2.1.2=r36h6115d3f_0 + - r-xfun=0.6=r36h6115d3f_0 + - r-xml=3.98_1.19=r36h96ca727_0 + - r-xtable=1.8_4=r36h6115d3f_0 + - r-yaml=2.2.0=r36h96ca727_0 + - racon=1.4.13=he513fc3_0 + - readline=8.0=h7b6447c_0 + - scipy=1.5.2=py36h0b6359f_0 + - seaborn=0.11.0=py_0 + - setuptools=50.3.0=py36hb0f4dca_1 + - sip=4.19.8=py36hf484d3e_0 + - six=1.15.0=py_0 + - sqlite=3.33.0=h62c20be_0 + - tk=8.6.10=hbc83047_0 + - tktable=2.10=h14c3975_0 + - tornado=6.0.4=py36h7b6447c_1 + - tqdm=4.7.2=py36_0 + - wheel=0.35.1=py_0 + - xz=5.2.5=h7b6447c_0 + - zlib=1.2.11=h7b6447c_3 + - zstd=1.4.5=h9ceee32_0 diff --git a/scripts/concatenate.py b/scripts/concatenate.py new file mode 100644 index 0000000..63d3aaa --- /dev/null +++ b/scripts/concatenate.py @@ -0,0 +1,48 @@ +#!/usr/bin/env python3 + +# Concatentate and annotate isoforms and their expression levels + +import pandas as pd + +# Concatenate datasets +l=[] +for fn in snakemake.input[2:]: + df=pd.read_csv(fn,delimiter='\t',usecols=[0,1],index_col=[1]) + df=df.rename(columns={'counts':fn.split('.counts')[0].split('/')[-1]}) + l.append(df) +df=l[0] +for i in range(1,len(l)): + df=df.join(l[i],how='outer') +df=df.fillna(0) + +# Get connection to reference +ds=pd.read_csv(snakemake.input[1],delimiter='\t',usecols=[2,8],header=None, names=['type','info']) +ds=ds[ds["type"]=='transcript'] +ds['transcript']=ds.apply(lambda x: x['info'].split('oId "')[1].split('";')[0],1) +ds['transcript_id']=ds.apply(lambda x: x['info'].split('transcript_id "')[1].split('";')[0],1) +ds['class_code']=ds.apply(lambda x: x['info'].split('class_code "')[1].split('";')[0],1) +ds['gene_name']=ds.apply(lambda x: '' if x['class_code']=='u' else x['info'].split('gene_name "')[1].split('";')[0],1) +ds['ref_transcript']=ds.apply(lambda x: '' if x['class_code']=='u' else x['info'].split('cmp_ref "')[1].split('";')[0],1) +ds=ds.set_index('transcript') +df=df.join(ds,how='left') + +# Get gene names +d=[] +with open(snakemake.input[0],'r') as af: + for linia in af: + if linia[0]!='#': + ln=linia.strip().split() + if ln[2]=='gene': + gene_name=linia.split('gene_name "')[1].split('"')[0] + gene_id=linia.split('gene_id "')[1].split('"')[0] + gene_type=linia.split('gene_type "')[1].split('"')[0] + d.append([gene_name,gene_id,gene_type]) +d=pd.DataFrame(data=d,columns=['gene_name','gene_id','gene_type']) + +df=df.merge(d,how='left',on='gene_name') +df=df.drop(columns=['info','type']) +cols = list(df.columns) +cols_new = ["class_code"] + cols[:cols.index("class_code")] + cols[cols.index("class_code")+ 1:] +df=df.reindex(columns = cols_new) +df.index = df["transcript_id"] +df.sort_index().to_csv(snakemake.output[0],index=False) diff --git a/scripts/deseq2norm.R b/scripts/deseq2norm.R new file mode 100644 index 0000000..5a05568 --- /dev/null +++ b/scripts/deseq2norm.R @@ -0,0 +1,27 @@ +# Normalize isoform expression across all conditions + +library("DESeq2") + +samples <- snakemake@config["samples"]$samples +data <- read.csv(file=snakemake@input[["counts"]], header=TRUE, sep=",") +countData <- data[colnames(data)[2:(length(samples) + 1)]] +countData <- as.matrix(countData) + +name <- colnames(data)[2:(length(samples) + 1)] +condition <- sapply(strsplit(name,"_"), `[`, 1) +colData <- cbind(name,condition) + +dds <- DESeqDataSetFromMatrix(countData,colData,design=~condition) +dds <- estimateSizeFactors(dds) +Pcounts <- counts(dds, normalized=TRUE) + +tpm <- function(x) { + x * 1000000/sum(x) +} + +Pcounts <- apply(Pcounts,2,tpm) + +data[colnames(data)[2:(length(samples) + 1)]] <- Pcounts + +data <- data[order(data$transcript_id),] +write.table(data,file=snakemake@output[["norm_counts"]],row.names = FALSE,quote=FALSE,sep=",") diff --git a/scripts/filter_genes.py b/scripts/filter_genes.py new file mode 100644 index 0000000..318f153 --- /dev/null +++ b/scripts/filter_genes.py @@ -0,0 +1,47 @@ +# Determine which genes to analyze in the input + +import pandas as pd +import os +from Bio import SeqIO + + +# Determines if a gene already exists, and create a dummy file and folder if it does +def gene_exists_file_func(gene): + gene_temp_dir = os.path.join(snakemake.output[0],gene) + gene_final_dir = os.path.join(snakemake.params["genes_final_dir"],gene) + + gene_exists_file = os.path.join(snakemake.output[0],"%s.txt" %(gene)) + if (not os.path.isfile(gene_exists_file)): + f = open(gene_exists_file, "w+") + f.close() + + if ((not os.path.isdir(gene_final_dir)) and (not os.path.isdir(gene_temp_dir))): + os.makedirs(gene_temp_dir) + + +gene_file = snakemake.config["gene_file"] +genes = list(set(pd.read_csv(gene_file, sep = "\t", names = ["gene_symbol"])["gene_symbol"])) +os.makedirs(snakemake.output[0]) + +# If the annotation file is given +if (snakemake.config["annotation"]): + annotate_df = pd.read_csv(snakemake.input[0], sep = "\t", header = None) + annotate_df = annotate_df[annotate_df[2] != "exon"] + annotate_lines = list(annotate_df[8]) + + for ann in range(len(annotate_lines)): + if ("gene_name" in annotate_lines[ann]): + gene = annotate_lines[ann].split('gene_name "')[1].split('"')[0] + if (gene in genes): + gene_exists_file_func(gene) +else: # If only the transcriptome file is given + transcripts_filename = snakemake.input[0] + transcripts = SeqIO.index(transcripts_filename, "fasta") + try: + for transcript in transcripts: + gene = transcipts[transcript].id.split("|")[1] + if (gene in genes): + gene_exists_file_func(gene) + except: + for gene in genes: + gene_exists_file_func(gene) diff --git a/scripts/gene_to_transcripts.py b/scripts/gene_to_transcripts.py new file mode 100644 index 0000000..ecd0f1c --- /dev/null +++ b/scripts/gene_to_transcripts.py @@ -0,0 +1,68 @@ +#!/usr/bin/env python3 + +# Extract the transcripts that are linked to their respective gene + +import pandas as pd +import numpy as np +from Bio import SeqIO + +# Import transcript isoform sequences +transcripts_filename = snakemake.input[1] +transcripts = SeqIO.index(transcripts_filename, "fasta") + +output = [] +gene_name = snakemake.params[0] + +# If the annotation file is given +if (snakemake.config["annotation"]): + # Import the nanopore annotation file + annotation_filename = snakemake.input[0] + annotate_df = pd.read_csv(annotation_filename,sep = "\t", header = None) + #annotation = pd.read_csv(sep = "\t", skiprows = 5, names = ["chr","type","info"],usecols = [0,2,8]) + #annotate_df = pd.read_csv(annotation_filename,sep = "\t", skiprows = 5, names = ["chr","type","info"],usecols = [0,2,8], header = None) + + annotate_df = annotate_df[annotate_df[2] != "exon"] + annotate_lines = list(annotate_df[8]) + + # Mapping gene name to transcriptID + gene_tID = dict() + + for ann in range(len(annotate_lines)): + if ("gene_name" in annotate_lines[ann]): + line = annotate_lines[ann].split(";") + #tID = line[0].split(" ")[-1][1:-1] + tID = annotate_lines[ann].split('transcript_id "')[1].split('"')[0] + #gene = line[2].split(" ")[-1][1:-1] + gene = annotate_lines[ann].split('gene_name "')[1].split('"')[0] + if (gene not in gene_tID): gene_tID[gene] = [tID] + else: gene_tID[gene].append(tID) + + # Extracting isoforms from related genes + try: + gene_tID[gene_name] + except KeyError as error: + print("The gene %s was not found in the annotation file." %error) + raise + + for tID in gene_tID[gene_name]: + output.append(">" + tID) + output.append(str(transcripts[tID].seq)) +# If only the transcriptome file is given +else: + for transcript in transcripts: + try: + if (gene_name == transcipts[transcript].id.split("|")[1]): + output.append(">" + transcipts[transcript].id.split("|")[0]) + output.append(str(transcipts[transcript].seq)) + except: + output.append(">" + transcipts[transcript].id.split("|")[0]) + output.append(str(transcipts[transcript].seq)) + + if (output == []): + print("The gene %s was not found in the transcriptome file.") + raise + +output_filename = snakemake.output[0] +output_file = open(output_filename,"w+") +output_file.write("\n".join(output)) +output_file.close() diff --git a/scripts/individual_transcript_analysis.py b/scripts/individual_transcript_analysis.py new file mode 100644 index 0000000..0c5a1ac --- /dev/null +++ b/scripts/individual_transcript_analysis.py @@ -0,0 +1,67 @@ +#!/usr/bin/env python3 + +# Individual transcript analysis + +count = 0 +output = [">" + snakemake.wildcards["transcript"]] + +for filename in snakemake.input: + file = open(filename,"r") + lines = file.readlines() + file.close() + + if("ss" in filename): + output.append("#####SS Analysis") + for position in lines: + output.append(position.strip()) + + elif ("iupred2a" in filename): + output.append("#####IUPred2A Analysis") + lines = lines[7:] + for position in lines: + output.append(position.strip()) + + elif("domains" in filename): + output.append("#####Domain Analysis") + lines = lines[5:] + for region in lines: + if (not region.startswith("#")): + region_split = region.strip().split("\t") + if (region_split[1] == "Pfam"): + region_short = "\t".join([region_split[3],region_split[4],region_split[8].split(";")[3].split("=")[-1]]) + output.append(region_short.strip()) + else: + break + + elif("prion" in filename): + output.append("#####Prion Analysis") + for prionsite in lines: + if (not prionsite.startswith("#")): + prionsite_split = prionsite.strip().split("\t") + prionsite_short = "\t".join([prionsite_split[2],prionsite_split[15]]) + output.append(prionsite_short) + + elif("sites" in filename): + output.append("#####Prosite Analysis") + modification = "" + site_only = False + for site in lines: + if (site.startswith(">")): + modification = site.strip().split(" ")[3] + site_only = "site" in site + else: + site_split = site.strip().replace(" "," ").split(" ") + if ((len(site_split) > 2) and site_only): + site_short = "\t".join([site_split[0],site_split[2],modification]) + output.append(site_short) + + elif ("protein" in filename): + output.append("#####AA Sequence") + lines = lines[1:] + output.append(lines[0].strip()) + +output.append("") + +output_file = open(snakemake.output[0],"w+") +output_file.write("\n".join(output)) +output_file.close() diff --git a/scripts/nonredundant.py b/scripts/nonredundant.py new file mode 100644 index 0000000..fbe2434 --- /dev/null +++ b/scripts/nonredundant.py @@ -0,0 +1,23 @@ +#!/usr/bin/env python3 + +# Select only non-redundant transcripts + +import pandas as pd +from Bio import SeqIO + +transcripts_filename = snakemake.input[0] +transcripts = SeqIO.index(transcripts_filename, "fasta") + +ds=pd.read_csv(snakemake.input[1],delimiter='\t',usecols=[2,8],header=None, names=['type','info']) +ds=ds[ds["type"]=='transcript'] +ds['transcript_id']=ds.apply(lambda x: x['info'].split('oId "')[1].split('";')[0],1) +non_redundant_tid = set(ds["transcript_id"]) +output = [] +for transcript in transcripts: + if (transcript in non_redundant_tid): + output.append(">" + transcript.strip()) + output.append(str(transcripts[transcript].seq).strip()) +output_filename = snakemake.output[0] +output_file = open(output_filename, "w+") +output_file.writelines("\n".join(output)) +output_file.close() diff --git a/scripts/reads_to_transcripts.py b/scripts/reads_to_transcripts.py new file mode 100644 index 0000000..2415780 --- /dev/null +++ b/scripts/reads_to_transcripts.py @@ -0,0 +1,35 @@ +#!/usr/bin/env python3 + +# Converts transcriptome file from reads (oID names) to transcripts name (TCONS) + +from Bio import SeqIO +import pandas as pd + +transcripts_filename = snakemake.input[0] +transcripts = SeqIO.index(transcripts_filename, "fasta") + +annotation_filename = snakemake.input[1] + +ds=pd.read_csv(annotation_filename,delimiter='\t',usecols=[2,8],header=None, names=['type','info']) +ds=ds[ds["type"]=='transcript'] +ds['transcript']=ds.apply(lambda x: x['info'].split('oId "')[1].split('";')[0],1) +ds['transcript_id']=ds.apply(lambda x: x['info'].split('transcript_id "')[1].split('";')[0],1) +test = dict() +index = list(ds['transcript']) +refs = list(ds["transcript_id"]) +for i in range(len(index)): + test[index[i]] = refs[i] + + +output = [] +for transcript in transcripts: + try: + output.append(">" + str(test[transcripts[transcript].id])) + output.append(str(transcripts[transcript].seq)) + except: + pass + +output_filename = snakemake.output[0] +output_file = open(output_filename,"w+") +output_file.write("\n".join(output)) +output_file.close() diff --git a/scripts/translation_to_protein.py b/scripts/translation_to_protein.py new file mode 100644 index 0000000..8d47e42 --- /dev/null +++ b/scripts/translation_to_protein.py @@ -0,0 +1,134 @@ +# Translation of transcript sequences to amino acid sequences + +from Bio import SeqIO +import os +import pandas as pd + +codon_table = { + 'ATA':'I', 'ATC':'I', 'ATT':'I', 'ATG':'M', + 'ACA':'T', 'ACC':'T', 'ACG':'T', 'ACT':'T', + 'AAC':'N', 'AAT':'N', 'AAA':'K', 'AAG':'K', + 'AGC':'S', 'AGT':'S', 'AGA':'R', 'AGG':'R', + 'CTA':'L', 'CTC':'L', 'CTG':'L', 'CTT':'L', + 'CCA':'P', 'CCC':'P', 'CCG':'P', 'CCT':'P', + 'CAC':'H', 'CAT':'H', 'CAA':'Q', 'CAG':'Q', + 'CGA':'R', 'CGC':'R', 'CGG':'R', 'CGT':'R', + 'GTA':'V', 'GTC':'V', 'GTG':'V', 'GTT':'V', + 'GCA':'A', 'GCC':'A', 'GCG':'A', 'GCT':'A', + 'GAC':'D', 'GAT':'D', 'GAA':'E', 'GAG':'E', + 'GGA':'G', 'GGC':'G', 'GGG':'G', 'GGT':'G', + 'TCA':'S', 'TCC':'S', 'TCG':'S', 'TCT':'S', + 'TTC':'F', 'TTT':'F', 'TTA':'L', 'TTG':'L', + 'TAC':'Y', 'TAT':'Y', 'TAA':'_', 'TAG':'_', + 'TGC':'C', 'TGT':'C', 'TGA':'_', 'TGG':'W', + } + +# Kozak scoring +def score(seq,start): + kozak = { + "A":[0.25,0.61,0.27,0.15,1.00,0.00,0.00,0.23], + "C":[0.53,0.02,0.49,0.55,0.00,0.00,0.00,0.16], + "G":[0.15,0.36,0.13,0.21,0.00,0.00,1.00,0.46], + "T":[0.07,0.01,0.11,0.09,0.00,1.00,0.00,0.15] + } + + score = 1.0 + for i in range(start,len(seq)): + score *= kozak[seq[i]][i] + return score + +# Translation +def translate(seq, i): + translating = True + aa = "" + + while(translating): + if (len(seq) < 3): + translating = False + aa += "*" + else: + codon = seq[0:3] + if (codon_table[codon] == "_"): + translating = False + else: + aa += codon_table[codon] + seq = seq[3:] + i += 3 + return aa,i + +# Translation scoring function +def translate_aa_seq_length(seq): + i = 0 + uorf_seq = "M" + uorf_sc = 0 + maybe = False + uorf_st = 0 + uorf_end = 0 + translating = True + uorfs_seq_frames = {0:"M",1:"M",2:"M"} + + while (translating): + if (i > len(seq)): + break + elif (seq[i:i+3] == "ATG"): + aa,end = translate(seq[i:], i) + sc = score(seq[i-4:i+4],0) + + if (aa in uorfs_seq_frames[i % 3]): + pass + elif (len(aa) < 15 * 3): + pass + elif (len(aa) > 15 * 10): + if (sc > uorf_sc): + return aa,i,end + if (len(aa) > 15 * 20): + return aa,i,end + if (maybe): + return uorf_seq,uorf_st,uorf_end + else: + return aa,i,end + elif (sc > uorf_sc): + uorf_seq = aa + uorf_sc = sc + uorf_st = i + uorf_end = end + maybe = True + + if (aa not in uorfs_seq_frames[i % 3]): + uorfs_seq_frames[i % 3] = aa + + i += 1 + return uorf_seq,uorf_st,uorf_end + +def find_all_aa_seqs(seq): + longest_aa_seq,start,end = translate_aa_seq_length(seq) + return longest_aa_seq,start,end + + +transcripts_filename = snakemake.input[0] +transcripts = SeqIO.index(transcripts_filename, "fasta") +output = [] + +gene = snakemake.params[0] +if(not os.path.isdir(snakemake.output[0])): + os.mkdir(snakemake.output[0]) + +# Translate each transcript seperatly and place in their own fasta file +transcript_stats = [] +for transcript in transcripts: + seq = str(transcripts[transcript].seq).strip() + enst = str(transcripts[transcript].id).split("|")[-1].strip() + protein,start,end = find_all_aa_seqs(seq) + + transcript_name = str(transcripts[transcript].id) + transcript_filename = transcript_name.split("|")[0] + transcript_filename = transcript_filename.replace("|","_") + transcript_filename = transcript_filename.replace("_","") + transcript_filename_path = str(snakemake.output[0] + "/%s_protein.fa" %(transcript_filename)) + transcript_file = open(transcript_filename_path, "w+") + transcript_file.write(">" + transcript_name + "\n" + protein + "\n") + transcript_file.close() + + transcript_stats.append([transcript_name.split("|")[0],start,end]) +transcript_stats_df = pd.DataFrame(transcript_stats, columns = ["transcript_id","start","end"]) +transcript_stats_df.to_csv(snakemake.output[1], index = False) diff --git a/scripts/visualization.py b/scripts/visualization.py new file mode 100644 index 0000000..ede3ea6 --- /dev/null +++ b/scripts/visualization.py @@ -0,0 +1,572 @@ +import copy +import string +import argparse +import pandas as pd +import numpy as np +import seaborn as sns +import matplotlib +import matplotlib.pyplot as plt +from matplotlib.patches import Rectangle +from matplotlib.patches import Ellipse +from matplotlib.backends.backend_pdf import PdfPages +from matplotlib.transforms import Bbox +from matplotlib.collections import PatchCollection +import matplotlib.gridspec as gridspec +from visualization_helpers import * + + +gene = snakemake.params[0] +alph = string.ascii_uppercase +alph += "0123456789!@#$%^&*?;'{}-=+,.~_[]:<>`qwertyuiopasdfghjklzxcvbnm1234567890qwertyuiopasdfghjklzxcvbnmqwertyuiopasdfghjklzxcvbnm" + +#grey, orange, turquoise blue, blue green, yellow, royal blue, vermillion, red purple, +#pastel yellow, brown orange, burgundy, cerulean blue, dark purple, light purple, dark blue green +#pastel green, harvest brown, brown, sky blue, pink, dark pink +colors=["#999999", "#E69F00", "#56B4E9", "#009E73", "#0072B2", "#D55E00", "#CC79A7","#F0D648", + '#db6d00','#920000', '#6db6ff', '#b66dff', '#490092', '#ffff6d', '#b66dff', '#004949', + '#24C324', '#924900', '#9a6324', 'ff6db6', 'CC79A7'] +colors_ptm = ["#DB6D00","#920000","#B66DFF","#0072B2","#24C324"] +colors_ss3 = ['#FFC20A','#0C7BDC',"white"] +ss3_abbvs = ["H","E","C"] +colors_aa = ['#99BFFF', '#F56345', '#E0C830', '#8700F5', 'white'] +aa_abbvs = {"A":"H","C":"S","D":"N","E":"N","F":"H","G":"S","H":"P","I":"H","K":"P","L":"H","M":"H","N":"L","P":"H","Q":"L","R":"P","S":"L","T":"L","V":"H","W":"H","Y":"H"} +aa_groups = ["P","N","L","S","H"] + +# VISUALIZATION FUNCTIONS + +# change font +matplotlib.rcParams['font.sans-serif'] = "Arial" +matplotlib.rcParams['font.family'] = "sans-serif" +plt.rc('font', size=14) + +def plotAnnotation(annotation, transcripts_plot, colors, longest_length_protein, fig, gs, start_row_panel): + curr_row_panel = start_row_panel + + exon_dict = dict() + for tcons_curr in transcripts_plot: + transcript_annotation = annotation[annotation['transcript_id'] == tcons_curr] + + transcript_exon_start = 0 + transcript_exon_end = -1 + if (transcript_annotation.shape[0] > 0): + if (transcript_annotation.iloc[0]["strand"] == "-"): + transcript_annotation = transcript_annotation.iloc[::-1] + plot_start = copy.deepcopy(transcript_annotation["plot_start"]) + transcript_annotation["plot_start"] = transcript_annotation["plot_stop"] + transcript_annotation["plot_stop"] = plot_start + for idx,row in transcript_annotation.iterrows(): + if (row['type'] != 'transcript'): + exon_old_id = row["exon_number"] + start,stop = row['plot_start'],row['plot_stop'] + transcript_exon_start = transcript_exon_end + 1 + transcript_exon_end += abs(start - stop) + 1 + exon = Exon(tcons_curr, exon_old_id, start, stop) + + if (start in exon_dict): + exon_dict[start].append(exon) + else: + exon_dict[start] = [exon] + + if (stop in exon_dict): + exon_dict[stop].append(exon) + else: + exon_dict[stop] = [exon] + + all_exons = [] + curr_exons = [] + count = 0 + master = dict() + for key,value in sorted(exon_dict.items()): + end_exons = [] + for exon in value: + curr_exons.append(exon) if (exon not in curr_exons) else end_exons.append(exon) + end_exons.sort() + prev_start,prev_end = -1,-1 + for exon in end_exons: + if (prev_start != exon.start or prev_end != exon.stop): + count += 1 + exon.add_new_id(count) + all_exons.append(exon) + else: + exon.add_new_id(count) + prev_start,prev_end = exon.start,exon.stop + if (exon.tcons not in master): + master[exon.tcons] = dict() + master[exon.tcons][exon.old_id] = exon.new_id + + count = -1 + previous_start,previous_stop = -1,-1 + exon_group = [] + exon_distances = [] + for exon in all_exons: + if (exon.start > previous_stop): + exon_group.append([exon]) + exon_distances.append([exon.start,exon.stop]) + previous_start,previous_stop = exon.start,exon.stop + count += 1 + else: + exon_group[count].append(exon) + if (exon.stop > previous_stop): + previous_stop = exon.stop + exon_distances[count][1] = exon.stop + if (exon.start < previous_start): + previous_start = exon.start + exon_distances[count][0] = exon.start + + exon_distances_original = copy.deepcopy(exon_distances) + previous_stop = -1 + for exon_dist in exon_distances: + start,stop = exon_dist[0],exon_dist[1] + if ((start - previous_stop) > 200): + exon_dist[0] = previous_stop + 200 + exon_dist[1] = exon_dist[0] + stop - start + previous_stop = exon_dist[1] + + for exon_similar_group in range(len(exon_group)): + for exon in exon_group[exon_similar_group]: + start,stop = exon.start,exon.stop + if (exon.start != exon_distances[exon_similar_group][0]): + exon.start = exon_distances[exon_similar_group][0] + (exon.start - exon_distances_original[exon_similar_group][0]) + exon.stop = exon.start + (stop - start) + + for tcons_curr in transcripts_plot: + transcript_annotation = annotation[annotation['transcript_id'] == tcons_curr] + transcript_annotation["exon_number"] = transcript_annotation.apply(lambda x: master[tcons_curr][str(x["exon_number"])] if (x["type"] != "transcript") else 0, axis = 1) + transcript_exons = transcript_annotation.iloc[1:] + start = all_exons[transcript_exons.loc[transcript_exons['exon_number'].idxmin()]["exon_number"] - 1].start + stop = all_exons[transcript_exons.loc[transcript_exons['exon_number'].idxmax()]["exon_number"] - 1].stop + transcript_annotation["plot_start"] = transcript_annotation.apply(lambda x: all_exons[x["exon_number"] - 1].start if (x["type"] != "transcript") else start, axis = 1) + transcript_annotation["plot_stop"] = transcript_annotation.apply(lambda x: all_exons[x["exon_number"] - 1].stop if (x["type"] != "transcript") else stop, axis = 1) + annotation[annotation['transcript_id'] == tcons_curr] = transcript_annotation + + # Plot Transcript + longest = max(annotation.loc[annotation['plot_start'].idxmax()]["plot_start"],annotation.loc[annotation['plot_stop'].idxmax()]["plot_stop"]) + for tcons_curr in transcripts_plot: + transcript_color = colors[curr_row_panel - start_row_panel] + ax = fig.add_subplot(gs[curr_row_panel, :]) + ax.set_xlim((-10, longest+1)) + ax.set_ylim((0, 1.5)) + ax.text(0,1,tcons_curr,horizontalalignment = "left",verticalalignment = "top", fontsize = 14, color = "black") + + transcript_annotation = annotation[annotation['transcript_id'] == tcons_curr] + + transcript_info = transcript_annotation.iloc[0] + plt.plot([transcript_info["plot_start"],transcript_info["plot_stop"]],[0.5,0.5],color=transcript_color,linewidth=2) + + transcript_annotation = transcript_annotation.iloc[1:] + if (transcript_annotation.iloc[0]["strand"] == "-"): + transcript_annotation = transcript_annotation.iloc[::-1] + prev_intron_start = -1 + for idx,row in transcript_annotation.iterrows(): + exon_start,exon_stop = row['plot_start'],row['plot_stop'] + exon = alph[int(row["exon_number"]) - 1] + plt.plot([exon_start,exon_stop],[0.5,0.5],color = transcript_color,linewidth = 15) + ax.annotate(exon,(exon_start/2 + exon_stop/2,0.49), fontsize = 14, color = "white", weight='bold', ha = "center", va = "center") + + if (prev_intron_start > 0 and exon_start - prev_intron_start >= 200): + intron_shortening = [exon_distances[group - 1][1]/2 + exon_distances[group][0]/2 for group in range(len(exon_distances)) if (prev_intron_start <= exon_distances[group][0] and exon_start >= exon_distances[group][0])] + for intron in intron_shortening: + plt.plot([intron - 10,intron + 10],[0.5,0.5],color = "white",linewidth = 3) + plt.plot([intron - 20,intron - 10],[0.3,0.7],color = transcript_color,linewidth = 2) + plt.plot([intron + 10,intron + 20],[0.3,0.7],color = transcript_color,linewidth = 2) + prev_intron_start = exon_stop + + ax.spines['top'].set_visible(False) + ax.spines['right'].set_visible(False) + ax.spines['left'].set_visible(False) + ax.spines['bottom'].set_visible(False) + ax.set_yticks([], []) + ax.set_xticks([], []) + + curr_row_panel += 1 + +def plotExonLines(protein_exon_start,protein_exon_end,bottom,buffer_exon,exon_draw): + for feature_start,feature_stop in exon_draw: + plt.plot((protein_exon_start - 1,protein_exon_start - 1),(feature_start,feature_stop),color = "grey", linewidth = 2) + plt.plot((protein_exon_end - 1,protein_exon_end - 1),(feature_start,feature_stop),color = "grey", linewidth = 2) + plt.plot((protein_exon_start - 1,protein_exon_start - 1),(bottom,bottom + buffer_exon),color = "grey", linewidth = 2) + plt.plot((protein_exon_end - 1,protein_exon_end - 1),(bottom,bottom + buffer_exon),color = "grey", linewidth = 2) + +def plotSequenceAnalysis(transcripts_plot, colors, longest_length_protein, fig, gs, start_row_panel, annotation = None): + curr_row_panel = start_row_panel + + for isoform in range(len(transcripts_plot)): + tcons_curr = transcripts[transcripts_plot[isoform]] + + transcript_color = colors[isoform] + + buffer_prion = 1.0 + buffer_idr = 1.0 + buffer_domain = 0.4 + buffer_exon = 0.25 + buffer_aa = 0.3 + buffer_ss = 0.25 + buffer_phos = 0.075 + buffer_den = 0.5 + + lg = fig.add_subplot(gs[curr_row_panel:curr_row_panel+5, 0]) + lg.set_xlim((-0.11,0.11)) + + ax = fig.add_subplot(gs[curr_row_panel:curr_row_panel+5, 1:]) + ax.set_xlim((-0.5,longest_length_protein + 1)) + ax.patch.set_facecolor(transcript_color) + ax.patch.set_alpha(0.15) + + lg.set_xlabel('AA Position', fontsize = 14) + plt.plot((0,len(tcons_curr.aa)), (0,0),color = "black") + bottom = 0 + exon_draw = [] + + # Prion score + if (snakemake.config["prion"]): + plc_df = pd.DataFrame(tcons_curr.plc[1:], columns = tcons_curr.plc[0]) + plc_df = plc_df.astype({'AANUM': 'int32',"HMM.PrD-like":"float"}) + plc_df["AANUM"] = plc_df["AANUM"] - 0.5 + plt.plot(plc_df["AANUM"], bottom + plc_df["HMM.PrD-like"], color = transcript_color) + lg.annotate("Prion-Like", (0,bottom + buffer_prion/2), fontsize = 14, color = "black", ha = "center", va = "center") + plt.plot((0,len(tcons_curr.aa)), (bottom + 1,bottom + 1),color = "black") + bottom += 1.05 + exon_draw.append([bottom - 1.05,bottom]) + + # IDR region + if (snakemake.config["iupred2a"]): + idr_df = pd.DataFrame(tcons_curr.idr[1:], columns = tcons_curr.idr[0]) + idr_df = idr_df.astype({'# POS': 'int32',"IUPRED2":"float"}) + idr_df["# POS"] = idr_df["# POS"] - 0.5 + plt.plot(idr_df["# POS"], bottom + idr_df["IUPRED2"], color = transcript_color) + lg.annotate("Disordered",(0,bottom + buffer_idr * 0.65), fontsize = 14, color = "black", ha = "center", va = "center", weight='bold') + lg.annotate("Region",(0,bottom + buffer_idr * 0.35), fontsize = 14, color = "black", ha = "center", va = "center", weight='bold') + lg.plot((-0.1,0.1), (bottom + 1,bottom + 1),color = "black") + bottom += 1.05 + exon_draw.append([bottom - 1.05,bottom]) + + # Protein domain + r = fig.canvas.get_renderer() + if (snakemake.config["pfam"]): + for i in tcons_curr.dom: + start,stop,name = (int(i[0]) - 1),(int(i[1]) - 1),i[2].strip() + name = name.replace(" domain","") + plt.gca().add_patch(Rectangle((start, bottom), stop - start, buffer_domain, edgecolor = transcript_color, facecolor = transcript_color, alpha = 0.5)) + text = ax.annotate(name, (start/2.0 + stop/2.0, bottom + buffer_domain/2), fontsize = 14, color = "black", weight='bold', ha = "center", va = "center") + bbtext = Bbox(ax.transData.inverted().transform(text.get_window_extent(renderer = r))) + if (bbtext.x0 < start and bbtext.x1 > stop): + text.set_visible(False) + name_split = name.split(" ") + name_top = " ".join(name_split[:int((len(name_split)/2))]) + name_bottom = " ".join(name_split[int((len(name_split)/2)):]) + ax.annotate(name_top, (start/2.0 + stop/2.0, bottom + buffer_domain * 3/4), fontsize = 14, color = "black", weight='bold', ha = "center", va = "center") + ax.annotate(name_bottom, (start/2.0 + stop/2.0, bottom + buffer_domain * 1/4), fontsize = 14, color = "black", weight='bold', ha = "center", va = "center") + lg.annotate("Domain", (0, bottom + buffer_domain/2), fontsize = 14, color = "black", ha = "center", va = "center", weight='bold') + bottom += 0.5 + + # Secondary structure prediction + if (snakemake.config["porter"]): + ss3_df = pd.DataFrame(tcons_curr.ss3[1:], columns = tcons_curr.ss3[0]) + ss = list(ss3_df["SS"]) + for i in range(len(tcons_curr.aa)): + plt.gca().add_patch(Rectangle((i,bottom),1,buffer_ss,edgecolor=colors_ss3[ss3_abbvs.index(ss[i])],facecolor=colors_ss3[ss3_abbvs.index(ss[i])])) + lg.annotate("Structure",(0,bottom + buffer_ss/2), fontsize = 14, color = "black", ha = "center", va = "center", weight='bold') + bottom += 0.4 + + # Amino acid sequence + if (snakemake.config["aa"]): + for i in range(len(tcons_curr.aa)): + plt.gca().add_patch(Rectangle((i,bottom),1,buffer_aa, edgecolor=colors_aa[aa_groups.index(aa_abbvs[tcons_curr.aa[i]])],facecolor=colors_aa[aa_groups.index(aa_abbvs[tcons_curr.aa[i]])])) + lg.annotate("Amino Acid",(0,bottom + buffer_aa/2), fontsize = 14, color = "black", ha = "center", va = "center", weight='bold') + bottom += 0.4 + + # Phosphorlyation Site + if (snakemake.config["pfScan"]): + row_phos = 0 + sites_focus = ["AMIDATION","MYRISTYL","GLYCOSYLATION","HYDROXYL","PHOSPHO_SITE"] + + plt.plot([0,len(tcons_curr.aa)],[bottom + 0.1,bottom + 0.1], linewidth = 4, color = "#919191") + for site_type in sites_focus: + if (site_type in tcons_curr.pss): + for i in tcons_curr.pss[site_type]: + start,stop = (int(i[0]) - 1),(int(i[1]) - 1) + mid = start/2 + stop/2 + radius_x = longest_length_protein/150 + 1 + radius_y = -longest_length_protein/8000 + 0.22 + patches = [Rectangle((mid - 0.5,bottom + 0.1),1,0.2,facecolor=colors_ptm[row_phos])] + patches.append(Ellipse((mid,bottom + 0.3),radius_x,radius_y,facecolor=colors_ptm[row_phos])) + p = PatchCollection(patches) + p.set_color(colors_ptm[row_phos]) + ax.add_collection(p) + row_phos += 1 + + lg.annotate("PTM",(0,bottom + row_phos * buffer_phos/2), fontsize = 14, color = "black", ha = "center", va = "center", weight='bold') + bottom += 0.45 + + # Phosphorlyation Site Density + density = [0.0] * len(tcons_curr.aa) + for site_type in tcons_curr.pss: + for i in tcons_curr.pss[site_type]: + start,stop = (int(i[0]) - 1),(int(i[1]) - 1) + hamming_values = list(np.hamming(stop - start + 1 + 10)) + for j in range(max(0,start - 5),min(stop + 1 + 5,len(tcons_curr.aa))): + density[j] += hamming_values[j - max(0,start - 5)] + density = pd.DataFrame(density) + density = (density/density.max()) * 0.5 + density_x = [i + 0.5 for i in range(len(tcons_curr.aa))] + plt.plot(density_x, density + bottom, color = transcript_color) + lg.annotate("PTM Density",(0,bottom + buffer_den/2), fontsize = 13, color = "black", ha = "center", va = "center", weight='bold') + bottom += 0.55 + exon_draw.append([bottom - 0.6,bottom]) + + # Exons + if (snakemake.config["annotation"]): + transcript_annotation = annotation[annotation['transcript_id'] == transcripts_plot[isoform]] + transcript_stats = transcript_stats_df[transcript_stats_df["transcript_id"] == transcripts_plot[isoform]] + translate = False + protein_start,protein_end = int(transcript_stats["start"]),int(transcript_stats["end"]) + transcript_exon_start = 0 + transcript_exon_end = -1 + protein_exon_start = 0 + protein_exon_end = 0 + if (transcript_annotation.shape[0] > 0): + if (transcript_annotation.iloc[0]["strand"] == "-"): + transcript_annotation = transcript_annotation.iloc[::-1] + for idx,row in transcript_annotation.iterrows(): + if (row['type'] != 'transcript'): + start = row['plot_start'] + stop = row['plot_stop'] + transcript_exon_start = transcript_exon_end + 1 + transcript_exon_end += abs(start - stop) + 1 + exon = alph[int(row["exon_number"]) - 1] + + if ((transcript_exon_start <= protein_end) and (protein_end <= transcript_exon_end)): + translate = False + protein_exon_start = protein_exon_end + protein_exon_end = (protein_end - protein_start)/3 + plotExonLines(protein_exon_start,protein_exon_end,bottom,buffer_exon,exon_draw) + ax.annotate(exon,(protein_exon_start/2 + protein_exon_end/2 - 1,bottom + buffer_exon/2), fontsize = 14, color = "black", weight='bold', ha = "center", va = "center") + break + + if (translate): + protein_exon_start = protein_exon_end + protein_exon_end += abs(start - stop)/3 + plotExonLines(protein_exon_start,protein_exon_end,bottom,buffer_exon,exon_draw) + ax.annotate(exon,(protein_exon_start/2 + protein_exon_end/2 - 1,bottom + buffer_exon/2), fontsize = 14, color = "black", weight='bold', ha = "center", va = "center") + protein_exon_start = protein_exon_end + + if ((transcript_exon_start <= protein_start) and (protein_start <= transcript_exon_end)): + translate = True + protein_exon_start = 0 + protein_exon_end = abs(transcript_exon_end - protein_start)/3 + plotExonLines(protein_exon_start,protein_exon_end,bottom,buffer_exon,exon_draw) + ax.annotate(exon,(protein_exon_start/2 + protein_exon_end/2 - 1,bottom + buffer_exon/2), fontsize = 14, color = "black", weight='bold', ha = "center", va = "center") + lg.annotate("Exon",(0,bottom + buffer_exon/2), fontsize = 14, color = "black", ha = "center", va = "center", weight='bold') + bottom += 0.25 + + plt.gca().add_patch(Rectangle((len(tcons_curr.aa), 0), longest_length_protein - len(tcons_curr.aa) + 1, bottom, edgecolor = "white", facecolor = "white")) + + plt.plot((0,len(tcons_curr.aa)), (bottom,bottom),color = "black") + ax.set_title(tcons_curr.tcons + str("***" if tcons_curr.ns_decay else ""), loc = "left", fontsize = 14) + # Finish Formatting + lg.set_ylim((0, bottom)) + lg.spines['top'].set_visible(False) + lg.spines['right'].set_visible(False) + lg.spines['left'].set_visible(False) + #lg.spines['bottom'].set_visible(False) + lg.set_xticks([], []) + lg.set_yticks([], []) + + ax.set_ylim((0, bottom)) + plt.plot((0,0),(0,bottom),color = "black") + plt.plot((len(tcons_curr.aa),len(tcons_curr.aa)),(0,bottom),color = "black") + ax.spines['top'].set_visible(False) + ax.spines['right'].set_visible(False) + ax.spines['left'].set_visible(False) + #ax.spines['bottom'].set_visible(False) + ax.set_yticks([], []) + curr_row_panel += total_features + +def plotLegend(plt,fig,gs,total_gs): + ax = fig.add_subplot(gs[total_gs-2:total_gs, 0:5]) + ax.set_xlim((0, 13)) + ax.set_ylim((0, 2)) + mods_abbvs = ["Amidation","Myristoylation","Glycoslyation","Hydroxylation","Phosphorlyation"] + for i in range(0,len(mods_abbvs)): + plt.gca().add_patch(Rectangle((3,float(i*2)/len(mods_abbvs)),3,1.0/len(mods_abbvs),edgecolor=colors_ptm[i],facecolor=colors_ptm[i])) + ax.text(7,float(i*2)/len(mods_abbvs) + 0.2,mods_abbvs[i],horizontalalignment = "left",verticalalignment = "top") + ax.annotate("Post-translation Modification (PTM)",(6.75,2), fontsize = 14, color = "black", ha = "center", va = "center", weight='bold') + ax.axis("off") + + ax = fig.add_subplot(gs[total_gs-2:total_gs, 5:10]) + ax.set_xlim((0, 13)) + ax.set_ylim((0, 2)) + ss_abbvs = ["Alpha Helix","Beta Sheet","Other"] + for i in range(0,len(ss_abbvs)): + if (ss_abbvs[i] == "Other"): + plt.gca().add_patch(Rectangle((3,float(i*2)/5),3,1.0/5,edgecolor="black",facecolor=colors_ss3[i])) + else: + plt.gca().add_patch(Rectangle((3,float(i*2)/5),3,1.0/5,edgecolor=colors_ss3[i],facecolor=colors_ss3[i])) + ax.text(7,float(i*2)/5 + 0.25,ss_abbvs[i],horizontalalignment = "left",verticalalignment = "top") + ax.annotate("Secondary Structure",(6.75,2), fontsize = 14, color = "black", ha = "center", va = "center", weight='bold') + ax.axis("off") + + ax = fig.add_subplot(gs[total_gs-2:total_gs, 10:15]) + ax.set_xlim((0, 13)) + ax.set_ylim((0, 2)) + aa_abbvs = ["Positively Charged","Negatively Charged","Polar","Special","Hydrophobic"] + for i in range(0,len(aa_abbvs)): + if (aa_abbvs[i] == "Hydrophobic"): + plt.gca().add_patch(Rectangle((3,float(i*2)/len(aa_abbvs)),3,1.0/len(aa_abbvs),edgecolor="black",facecolor=colors_aa[i])) + else: + plt.gca().add_patch(Rectangle((3,float(i*2)/len(aa_abbvs)),3,1.0/len(aa_abbvs),edgecolor=colors_aa[i],facecolor=colors_aa[i])) + ax.text(7,float(i*2)/len(aa_abbvs) + 0.2,aa_abbvs[i],horizontalalignment = "left",verticalalignment = "top") + ax.annotate("Amino Acid",(6.75,2), fontsize = 14, color = "black", ha = "center", va = "center", weight='bold') + ax.axis("off") + +######## Actual Code + +gene_functional_analysis_file = open(snakemake.input[0], "r") +lines = gene_functional_analysis_file.readlines() +gene_functional_analysis_file.close() +transcript_stats_df = pd.read_csv(snakemake.input[1]) +transcript_stats_df["transcript_id"] = transcript_stats_df["transcript_id"].apply(lambda x: x.replace("_","")) + +transcripts = dict() +analysis = {"aas":False,"idr":False,"dom":False,"ss3":False,"plc":False,"pss":False} +aa_lines = "" + +for line in lines: + if (line.startswith(">")): + if (len(aa_lines) > 0): + transcripts[tcons] = transcript(tcons, ns_decay, aa_lines, idr_lines, dom_lines, ss3_lines, plc_lines, pss_lines) + tcons = line.strip()[1:].replace("_","") + aa_lines = [] + idr_lines = [] + dom_lines = [] + ss3_lines = [] + plc_lines = [] + ns_decay = False + pss_lines = {"PHOSPHO_SITE":[],"GLYCOSYLATION":[],"MYRISTYL":[],"HYDROXYL":[],"AMIDATION":[],"OTHER":[]} + analysis = {"aas":False,"idr":False,"dom":False,"ss3":False,"plc":False,"pss":False} + + elif(line.startswith("#####Prion Analysis")): + analysis["plc"] = True + elif(line.startswith("#####Prosite Analysis")): + analysis["pss"] = True + elif(line.startswith("#####SS Analysis")): + analysis["ss3"] = True + elif(line.startswith("#####Domain Analysis")): + analysis["dom"] = True + elif(line.startswith("#####IUPred2A Analysis")): + analysis["idr"] = True + elif(line.startswith("#####AA Sequence")): + analysis["aas"] = True + + elif(analysis["plc"]): + plc_lines.append(line.strip().split("\t")) + elif(analysis["pss"]): + pss_id = line.strip().split("\t")[2] + if ("GLYCOSYLATION" in pss_id): pss_lines["GLYCOSYLATION"].append(line.strip().split("\t")) + elif ("PHOSPHO_SITE" in pss_id): pss_lines["PHOSPHO_SITE"].append(line.strip().split("\t")) + elif ("MYRISTYL" in pss_id): pss_lines["MYRISTYL"].append(line.strip().split("\t")) + elif ("HYDROXYL" in pss_id): pss_lines["HYDROXYL"].append(line.strip().split("\t")) + elif ("AMIDATION" in pss_id): pss_lines["AMIDATION"].append(line.strip().split("\t")) + else: pss_lines["OTHER"].append(line.strip().split("\t")) + elif(analysis["ss3"]): + ss3_lines.append(line.strip().split("\t")) + elif(analysis["dom"]): + dom_lines.append(line.strip().split("\t")) + elif(analysis["idr"]): + idr_lines.append(line.strip().split("\t")) + elif(analysis["aas"]): + aa_lines = line.strip() + if (aa_lines[-1] == "*"): + ns_decay = True + aa_lines = aa_lines[:-1] + +transcripts[tcons] = transcript(tcons, ns_decay, aa_lines, idr_lines, dom_lines, ss3_lines, plc_lines, pss_lines) +total_features = 0 +for feature in analysis: + if analysis[feature]: + total_features += 1 +total_features += 1 + +longest_length = 0 +longest_tcon = "" +for ids in transcripts: + if (len(transcripts[ids].aa) > longest_length): + longest_length = len(transcripts[ids].aa) + longest_tcon = ids + +counts_data_filename = snakemake.input[2] if snakemake.config["quantification"] else None +annotation_filename = snakemake.input[3] if snakemake.config["annotation"] else None +args = Parser(counts_data_filename,annotation_filename,snakemake.config["samples"]) +continuous = snakemake.config["continuous"] +minimumTPM = snakemake.config["minIsoTPM"] +maximumIso = snakemake.config["maxIsoNum"] +minimumPct = snakemake.config["minIsoPct"] +(annotation, data, samples, conditions, number_replicates) = preprocessArguments(args) + + +total_gs = 1 +transcripts_plot = sorted(list(transcripts.keys())) +if (snakemake.config["quantification"]): + df_temp = data[data["gene_name"] == gene] + data_gene = df_temp[samples].sum().to_frame().transpose() + data_gene = calculateStatistics(data_gene,conditions,number_replicates) + df_temp = calculateStatistics(df_temp,conditions,number_replicates) + df_temp = (df_temp.filter(like='mean').div(data_gene.filter(like='mean').values[0],1)*100).add_prefix('Pct_').join(df_temp) + df_temp = chooseIsoforms2Plot(df_temp,minimumTPM,minimumPct,maximumIso) + df_temp["transcript_id"] = df_temp["transcript_id"].apply(lambda x: x.replace("_","")) + transcripts_plot = list(pd.unique(df_temp["transcript_id"])) + total_gs += 4 + +if (snakemake.config["annotation"]): + transcripts_delete = [] + for transcript_id in transcripts_plot: + transcript_annotation = annotation[annotation['transcript_id']==transcript_id] + if ((transcript_annotation.shape[0] < 3)): + transcripts_delete.append(transcript_id.replace("_","")) + for transcript_id in transcripts_delete: + transcripts_plot.remove(transcript_id) + total_gs += len(transcripts_plot) + 1 +total_gs += total_features * len(transcripts_plot) + 2 +fig = plt.figure(figsize = (21,total_gs)) +gs = fig.add_gridspec(total_gs, 15) + +fig.tight_layout(pad = 5.0) +fig.subplots_adjust(top = 0.95,bottom = 0.05,left = 0.05, right = 0.95) +fig.suptitle(gene,fontsize=30) + +total_gs = 1 +if (snakemake.config["quantification"]): + #plot total expression + axg = fig.add_subplot(gs[0:4, 0:4]) + plotTotal(conditions, data_gene, axg, colors, number_replicates, continuous) + total_gs += 4 + +if (len(transcripts_plot) > 0): + if (snakemake.config["quantification"]): + df_temp = df_temp[df_temp["transcript_id"].isin(transcripts_plot)] + + #plot isoform expression + axg = fig.add_subplot(gs[0:4, 5:10]) + plotIsoformProfiles(conditions, df_temp, axg, colors, continuous) + + #plot isoform expression percentage + axg = fig.add_subplot(gs[0:4, 11:15]) + plotIsoformStacked(conditions, df_temp, axg, colors, continuous) + + if (snakemake.config["annotation"]): + #prepare annotation + annotation_cut = prepareAnnotation(annotation,transcripts_plot) + + #plot annotation + plotAnnotation(annotation_cut, transcripts_plot, colors, longest_length, fig, gs, total_gs) + total_gs += len(transcripts_plot) + 1 + + plotSequenceAnalysis(transcripts_plot, colors, longest_length, fig, gs, total_gs, annotation_cut) + else: + plotSequenceAnalysis(transcripts_plot, colors, longest_length, fig, gs, total_gs) + + total_gs += total_features * len(transcripts_plot) + 2 + plotLegend(plt,fig,gs,total_gs) + +plt.subplots_adjust(wspace=0.05, hspace=0.05) +fig.savefig(snakemake.output[0]) diff --git a/scripts/visualization_helpers.py b/scripts/visualization_helpers.py new file mode 100644 index 0000000..d496493 --- /dev/null +++ b/scripts/visualization_helpers.py @@ -0,0 +1,195 @@ +#!/usr/bin/env python3 + +import string +import pandas as pd +import numpy as np +import matplotlib +import matplotlib.pyplot as plt + +# change font +matplotlib.rcParams['font.sans-serif'] = "Arial" +matplotlib.rcParams['font.family'] = "sans-serif" +plt.rc('font', size=14) + +alph = string.ascii_lowercase +alph += "0123456789!@#$%^&*?;'{}-=+,.~_[]:<>`qwertyuiopasdfghjklzxcvbnm1234567890qwertyuiopasdfghjklzxcvbnmqwertyuiopasdfghjklzxcvbnm" + +class transcript: + def __init__(self, tcons, ns_decay, aa, idr = [], dom = [], ss3 = [], plc = [], pss = []): + self.tcons = tcons + self.ns_decay = ns_decay + self.aa = aa + self.idr = idr + self.dom = dom + self.ss3 = ss3 + self.plc = plc + self.pss = pss + +class Exon: + def __init__(self, tcons, old_id, start, stop, new_id = ""): + self.tcons = tcons + self.old_id = old_id + self.new_id = new_id + self.start = start + self.stop = stop + + def __lt__(self, other): + return self.start < other.start + + def add_new_id(self, id): + self.new_id = id + + def __str__(self): + return str(self.new_id) + " " + str(self.start) + " " + str(self.stop) + +class Parser(object): + def __init__(self, csv, gtf, samples): + self.csv = csv + self.gtf = gtf + self.samples = samples + +def preprocessArguments(args): + if (args.gtf == None): + annotation = None + else: + annotation = pd.read_csv(args.gtf, delimiter='\t', header=None, usecols=[0,2,3,4,6,8],names=['chrm','type','start','stop','strand','more']) + annotation['transcript_id'] = annotation.apply(lambda x: x['more'].split('transcript_id "')[1].split('"')[0],1) + annotation["transcript_id"] = annotation["transcript_id"].apply(lambda x: x.replace("_","")) + annotation["exon_number"] = annotation.apply(lambda x: x["more"].split('exon_number "')[1].split('"')[0] if "exon_number" in x["more"] else 0,1) + annotation = annotation.drop(columns='more') + + if (args.csv == None): + data = None + else: + data = pd.read_csv(args.csv) + + samples = [sample for sample in args.samples] + conditions = sorted(list(set([sample.split("_")[0] for sample in samples]))) + number_replicates={} + for cond in conditions: + number_replicates[cond] = len([sample for sample in samples if sample.startswith(cond)]) + return annotation, data, samples, conditions, number_replicates + +def calculateStatistics(df,conds,nreps): + for cond in conds: + df['mean'+cond]=df.filter(like=cond+'_').mean(1) + df['stdn'+cond]=df.filter(like=cond+'_').std(1)/np.sqrt(nreps[cond]) + df=df.sort_index() + return df + +def chooseIsoforms2Plot(df,minTPM,minPct,maxIso): + df['minimum']=df.filter(regex='^mean').min(axis=1) + df['maximumPct']=df.filter(regex='^Pct').max(axis=1) + df=df[df['maximumPct']>minPct] + df['maximum']=df.filter(regex='^mean').max(axis=1) + df=df[df['maximum']>minTPM] + df['avg'] = df.filter(regex='^Pct').mean(axis=1) + df=df.sort_values('avg',ascending=False) + df=df.head(maxIso) + return df + +def plotTotal(conditions, df_gene, ax, colors, number_replicates, continuous): + conditions_x = [i + 1 for i in range(len(conditions))] + if (continuous): + plt.errorbar(conditions_x, df_gene.filter(like = 'mean').iloc[0], yerr = df_gene.filter(like='stdn').iloc[0], color = '#FA3D36', linewidth = 3, label = "Total Expression", markersize=6) + else: + x = [] + boxes = [] + count = 0 + for i in range(len(conditions)): + x += [i + 1] * number_replicates[conditions[i]] + boxes.append(df_gene.filter(like = '_').values[0][count:count+number_replicates[conditions[i]]]) + count += number_replicates[conditions[i]] + bp = plt.boxplot(boxes, patch_artist=True, showfliers=False, zorder=1) + for patch in bp['boxes']: + patch.set_facecolor("#F7F7F7") + for median in bp['medians']: + median.set(color ='black',linewidth = 1) + + plt.scatter(x + np.random.normal(0, 0.075, len(x)), df_gene.filter(like = '_'), color = "black", marker = '.', s = 200, alpha = 0.5, linewidths = 0, zorder=2) + + ax.set_ylim(bottom = -0.5) + ax.set_xticks(conditions_x) + ax.set_xticklabels(conditions) + ax.spines['top'].set_visible(False) + ax.spines['right'].set_visible(False) + #ax.spines['left'].set_bounds(0,ax.get_yticks()[-2]) + ax.spines['bottom'].set_bounds(min(conditions_x),max(conditions_x)) + ax.tick_params(axis = "both", which = "major", labelsize = 14) + plt.xlabel("Condition") + plt.ylabel("Expression [TPM]") + #plt.legend(loc = "upper center", bbox_to_anchor=(0.5,1), frameon=False) + ax.set_title("Total Gene Expression", loc = "center", fontsize = 14, weight='bold') + +def plotIsoformProfiles(conditions, df, ax, colors, continuous): + x = [i + 1 for i in range(len(conditions))] + last_bottom = [0] * len(conditions) + for j in range(df.shape[0]): + row = df.iloc[j] + if (continuous): + plt.errorbar(x + np.random.normal(0, 0.03, len(x)), row.filter(regex='^mean'),yerr=row.filter(like='stdn'),color=colors[j],linewidth=2, label = "") + else: + mean = row.filter(regex = '^mean') + err = row.filter(like='stdn') + jitter = 0.5 * (j + 1)/(df.shape[0] + 1) - 0.25 + plt.bar(x, mean, width = 0.5, bottom = last_bottom, color = colors[j]) + for i in x: + plt.plot([i + jitter,i + jitter],[last_bottom[i - 1] + mean.iloc[i - 1] - err.iloc[i - 1],last_bottom[i - 1] + mean.iloc[i - 1] + err.iloc[i - 1]],linewidth = 2,color = "black") + last_bottom += mean + + ax.set_ylim(bottom=-0.5) + ax.set_xticks(x) + ax.set_xticklabels(conditions) + ax.tick_params(axis = "both", which = "major", labelsize = 14) + ax.spines['top'].set_visible(False) + ax.spines['right'].set_visible(False) + #ax.spines['left'].set_bounds(0,ax.get_yticks()[-2]) + ax.spines['bottom'].set_bounds(min(x),max(x)) + plt.xlabel("Condition", fontsize = 14) + plt.ylabel("Expression [TPM]", fontsize = 14) + plt.legend(loc = "upper center", bbox_to_anchor=(0.5,1), frameon=False) + ax.set_title("Individual Isoform Expression", loc = "center", fontsize = 14, weight='bold') + +def plotIsoformStacked(conditions, df, ax, colors, continuous): + x = [i + 1 for i in range(len(conditions))] + if (continuous): + plt.stackplot(x,df.filter(regex='^Pct').values,colors=colors[:df.shape[0]]) + else: + x = [i + 1 for i in range(len(conditions))] + last_bottom = [0] * len(conditions) + for j in range(df.shape[0]): + row = df.iloc[j] + pct = row.filter(regex = '^Pct') + #err = row.filter(like='stdn') + err = np.divide(np.multiply(row.filter(like='stdn').values,row.filter(regex = '^Pct').values),row.filter(regex = '^mean').values) + jitter = (j - df.shape[0]//2)/10 + plt.bar(x, pct, width = 0.5, bottom = last_bottom, color=colors[j]) + #for i in x: + # plt.plot([i + jitter,i + jitter],[last_bottom[i - 1] + pct.iloc[i - 1] - err[i - 1],last_bottom[i - 1] + pct.iloc[i - 1] + err[i - 1]],linewidth = 2,color = "black") + last_bottom += pct + + ax.set_xticks(x) + ax.set_xticklabels(conditions) + ax.spines['top'].set_visible(False) + ax.spines['right'].set_visible(False) + ax.spines['left'].set_bounds(0,100) + ax.spines['bottom'].set_bounds(min(x),max(x)) + ax.axis(top = [-0.5,100]) + ax.set_yticks([i for i in range(0,101,20)]) + ax.tick_params(axis = "both", which = "major", labelsize = 14) + plt.xlabel("Condition") + plt.ylabel("Expression [%]") + ax.set_title("Individual Isoform Usage", loc = "center", fontsize = 14, weight='bold') + +def prepareAnnotation(annotation,transcripts_plot): + cut = annotation[annotation['transcript_id'].isin(transcripts_plot)] + strand = cut.iloc[0]['strand'] + if (strand == '+'): + start = cut['start'].min() + cut['plot_start'] = cut['start'] - start + cut['plot_stop'] = cut['stop'] - start + else: + start = cut['stop'].max() + cut['plot_start'] = (cut['start'] - start) * (-1) + cut['plot_stop'] = (cut['stop'] - start) * (-1) + return cut