Permalink
Cannot retrieve contributors at this time
Name already in use
A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
isoform_differentiation/Snakefile
Go to fileThis commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
211 lines (186 sloc)
8.68 KB
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pandas as pd | |
configfile: "config.yaml" | |
gene_file = config["gene_file"] | |
GENES = pd.read_table(gene_file)["gene_symbol"] | |
NanoporeGTF = config["nanopore_gtf"] | |
Transcripts = config["polished_reads"] | |
rule all: | |
input: | |
#expand("/project/owlmayerTemporary/Sid/isoform_analysis/result/{gene}/{gene}_transcripts_filtered_coding_potential_analysis.pdf", gene = GENES), | |
#expand("//project/owlmayerTemporary/Sid/isoform_analysis//result/{gene}/{gene}_blastx_protein_analysis.txt", gene = GENES) | |
expand("/project/owlmayerTemporary/Sid/isoform_analysis/result/all/{gene}_protein_sequences.txt", gene = GENES) | |
rule gene_transcript: | |
input: | |
NanoporeGTF, | |
Transcripts | |
output: | |
"/project/owlmayerTemporary/Sid/isoform_analysis/result/{gene}/{gene}_seq.fa" | |
params: | |
gene = "{gene}" | |
script: | |
"gene_transcripts.py" | |
rule blastx: | |
input: | |
gene_fa = "/project/owlmayerTemporary/Sid/isoform_analysis/result/{gene}/{gene}_seq.fa", | |
db = config["human_protein"] | |
output: | |
"/project/owlmayerTemporary/Sid/isoform_analysis/result/{gene}/{gene}_blastx.out" | |
threads: | |
4 | |
shell: | |
"/home/annaldas/ncbi-blast-2.9.0+/bin/blastx -query {input.gene_fa} -db {input.db} -out {output} -evalue 1e-5 -max_target_seqs 1 -max_hsps 1 -outfmt '6 qseqid sseqid evalue' -num_threads {threads} -soft_masking false" | |
rule protein_sequence: | |
input: | |
blastx = "/project/owlmayerTemporary/Sid/isoform_analysis/result/{gene}/{gene}_blastx.out", | |
db = config["human_protein"] | |
output: | |
"/project/owlmayerTemporary/Sid/isoform_analysis/result/{gene}/{gene}_blastx_protein.fa" | |
shell: | |
"sh protein_transcript_sequences.sh {input.blastx} {input.db} {output}" | |
rule utr_regions: | |
input: | |
config["utr_regions"] | |
params: | |
gene = "{gene}" | |
output: | |
"/project/owlmayerTemporary/Sid/isoform_analysis/result/{gene}/{gene}_utr_regions.bed" | |
script: | |
"utr_regions.py" | |
rule utr_sequences: | |
input: | |
hg = config["human_genome"], | |
utr = "/project/owlmayerTemporary/Sid/isoform_analysis/result/{gene}/{gene}_utr_regions.bed" | |
output: | |
"/project/owlmayerTemporary/Sid/isoform_analysis/result/{gene}/{gene}_utr_regions.fa" | |
shell: | |
"bedtools getfasta -fi {input.hg} -bed {input.utr} -fo {output} -name" | |
#rule transcript_filter_utr: | |
# input: | |
# utr = "/project/owlmayerTemporary/Sid/isoform_analysis/result/{gene}/{gene}_utr_regions.fa", | |
# seq = "/project/owlmayerTemporary/Sid/isoform_analysis/result/{gene}/{gene}_seq.fa" | |
# output: | |
# "/project/owlmayerTemporary/Sid/isoform_analysis/result/{gene}/{gene}_seq_filt.fa" | |
# script: | |
# "filter_utr.py" | |
checkpoint mapping: | |
input: | |
utr = "/project/owlmayerTemporary/Sid/isoform_analysis/result/{gene}/{gene}_utr_regions.fa", | |
seq = "/project/owlmayerTemporary/Sid/isoform_analysis/result/{gene}/{gene}_seq.fa" | |
output: | |
directory("/project/owlmayerTemporary/Sid/isoform_analysis/result/{gene}/transcripts") #/{transcript}/{transcript}_map_protein.fa" | |
params: | |
gene = "{gene}" | |
script: | |
"translation_protein.py" | |
def aggregate_mapping(wildcards): | |
checkpoint_output = checkpoints.mapping.get(**wildcards).output[0] | |
return expand("/project/owlmayerTemporary/Sid/isoform_analysis/result/{gene}/transcripts/{transcript}_map_protein.fa", | |
gene=wildcards.gene, | |
transcript=glob_wildcards(os.path.join(checkpoint_output,"{transcript}_map_protein.fa")).transcript) | |
rule aggregate_mapping: | |
input: | |
aggregate_mapping | |
output: | |
"/project/owlmayerTemporary/Sid/isoform_analysis/result/all/{gene}_protein_sequences.txt" | |
params: | |
gene = "{gene}" | |
shell: | |
"cat {input} > {output}" | |
rule iupred2a_analysis: | |
input: | |
"/project/owlmayerTemporary/Sid/isoform_analysis/result/{gene}/transcripts/{transcript}_map_protein.fa", | |
config["iupred2a"] | |
output: | |
"/project/owlmayerTemporary/Sid/isoform_analysis/result/{gene}/transcripts/{transcript}_transcript_sequence.txt", | |
"/project/owlmayerTemporary/Sid/isoform_analysis/result/{gene}/transcripts/{transcript}_map_protein_iupred2a.txt" | |
script: | |
"iupred2a_analysis.py" | |
rule interpro_scan: | |
input: | |
"/project/owlmayerTemporary/Sid/isoform_analysis/result/{gene}/transcripts/{transcript}_map_protein.fa" | |
output: | |
"/project/owlmayerTemporary/Sid/isoform_analysis/result/{gene}/transcripts/{transcript}_map.gff3" | |
params: | |
db = "Pfam,ProDom,Gene3D,CDD,Coils,MobiDBLite,SMART" | |
shell: | |
"sh /home/annaldas/my_interproscan/interproscan-5.38-76.0/interproscan.sh -i {input} -o {output} -f GFF3 -appl {params.db} -dra" | |
rule brewery_analysis: | |
input: | |
"/project/owlmayerTemporary/Sid/isoform_analysis/result/{gene}/transcripts/{transcript}_map_protein.fa" | |
output: | |
"/project/owlmayerTemporary/Sid/isoform_analysis/result/{gene}/transcripts/{transcript}_map_protein.fa.ss3" | |
#"/project/owlmayerTemporary/Sid/isoform_analysis/result/{gene}/transcripts/{transcript}_map_protein.fa.ss8" | |
params: | |
brewery = config["brewery"] | |
shell: | |
"python3 {params.brewery} -i {input} --cpu 32 --noTA --noSA --noCD" | |
rule functional_site_analysis: | |
input: | |
"/project/owlmayerTemporary/Sid/isoform_analysis/result/{gene}/transcripts/{transcript}_map_protein.fa" | |
output: | |
"/project/owlmayerTemporary/Sid/isoform_analysis/result/{gene}/transcripts/{transcript}_map_protein_sites.txt" | |
params: | |
ps_scan = config["prosite_scan"], | |
prosite_dat = config["prosite_dat"] | |
shell: | |
"perl {params.ps_scan} -d {params.prosite_dat} {input} > {output}" | |
rule individual_transcript_analysis: | |
input: | |
"/project/owlmayerTemporary/Sid/isoform_analysis/result/{gene}/transcripts/{transcript}_map_protein_iupred2a.txt", | |
"/project/owlmayerTemporary/Sid/isoform_analysis/result/{gene}/transcripts/{transcript}_map.gff3", | |
"/project/owlmayerTemporary/Sid/isoform_analysis/result/{gene}/transcripts/{transcript}_map_protein.fa.ss3", | |
"/project/owlmayerTemporary/Sid/isoform_analysis/result/{gene}/transcripts/{transcript}_map_protein_sites.txt" | |
output: | |
"/project/owlmayerTemporary/Sid/isoform_analysis/result/{gene}/transcripts/{transcript}_map_protein_analysis.txt" | |
script: | |
"transcript_analysis.py" | |
def aggregate_input(wildcards): | |
checkpoint_output = checkpoints.mapping.get(**wildcards).output[0] | |
return expand("/project/owlmayerTemporary/Sid/isoform_analysis/result/{gene}/transcripts/{transcript}_map_protein_analysis.txt", | |
gene=wildcards.gene, | |
transcript=glob_wildcards(os.path.join(checkpoint_output,"{transcript}_map_protein.fa")).transcript) | |
rule aggregate: | |
input: | |
aggregate_input | |
output: | |
"/project/owlmayerTemporary/Sid/isoform_analysis/result/{gene}/{gene}_transcripts_filtered_analysis.txt" | |
params: | |
gene = "{gene}" | |
shell: | |
"cat {input} > {output}" | |
#rule filter_transcripts: | |
# input: | |
# "/project/owlmayerTemporary/Sid/isoform_analysis/result/{gene}/{gene}_transcripts_analysis.txt" | |
# output: | |
# "/project/owlmayerTemporary/Sid/isoform_analysis/result/{gene}/{gene}_transcripts_filtered_analysis.txt" | |
# script: | |
# "filter_transcripts.py" | |
rule protein_coding_potential_analysis: | |
input: | |
"/project/owlmayerTemporary/Sid/isoform_analysis/result/{gene}/{gene}_transcripts_filtered_analysis.txt" | |
output: | |
"/project/owlmayerTemporary/Sid/isoform_analysis/result/{gene}/{gene}_transcripts_filtered_coding_potential_analysis.pdf" | |
params: | |
gene = "{gene}" | |
script: | |
"interproscan_analysis.py" | |
#rule protein_domain_analysis: | |
# input: | |
# "/project/owlmayerTemporary/Sid/isoform_analysis/result/{gene}/transcripts/{transcript}/{transcript}_map.gff3", | |
# "/project/owlmayerTemporary/Sid/isoform_analysis/result/{gene}/transcripts/{transcript}/{transcript}_map_protein_iupred2a.txt" | |
# output: | |
# "/project/owlmayerTemporary/Sid/isoform_analysis/result/{gene}/{gene}_map_protein_analysis.txt" | |
# params: | |
# gene = "{gene}" | |
# script: | |
# "interproscan_analysis.py" | |
#rule sashimi_plot: | |
# input: | |
# sashimi_sh = "/project/owlmayerTemporary/Sid/isoform_analysis/result/{gene}/{gene}_sashimi.sh", | |
# sashimi_py = config["sashimi"], | |
# bams = config["input_bams"], | |
# gtf = NanoporeGTF | |
# output: | |
# "/project/owlmayerTemporary/Sid/isoform_analysis/result/{gene}/{gene}_sashimi.pdf" | |
# shell: | |
# "sh {input.sashimi_sh} {input.sashimi_py} {input.bams} {input.gtf} {output}" |