Skip to content
Permalink
master
Switch branches/tags

Name already in use

A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
Go to file
 
 
Cannot retrieve contributors at this time
import pandas as pd
configfile: "config.yaml"
gene_file = config["gene_file"]
GENES = pd.read_table(gene_file)["gene_symbol"]
NanoporeGTF = config["nanopore_gtf"]
Transcripts = config["polished_reads"]
rule all:
input:
#expand("/project/owlmayerTemporary/Sid/isoform_analysis/result/{gene}/{gene}_transcripts_filtered_coding_potential_analysis.pdf", gene = GENES),
#expand("//project/owlmayerTemporary/Sid/isoform_analysis//result/{gene}/{gene}_blastx_protein_analysis.txt", gene = GENES)
expand("/project/owlmayerTemporary/Sid/isoform_analysis/result/all/{gene}_protein_sequences.txt", gene = GENES)
rule gene_transcript:
input:
NanoporeGTF,
Transcripts
output:
"/project/owlmayerTemporary/Sid/isoform_analysis/result/{gene}/{gene}_seq.fa"
params:
gene = "{gene}"
script:
"gene_transcripts.py"
rule blastx:
input:
gene_fa = "/project/owlmayerTemporary/Sid/isoform_analysis/result/{gene}/{gene}_seq.fa",
db = config["human_protein"]
output:
"/project/owlmayerTemporary/Sid/isoform_analysis/result/{gene}/{gene}_blastx.out"
threads:
4
shell:
"/home/annaldas/ncbi-blast-2.9.0+/bin/blastx -query {input.gene_fa} -db {input.db} -out {output} -evalue 1e-5 -max_target_seqs 1 -max_hsps 1 -outfmt '6 qseqid sseqid evalue' -num_threads {threads} -soft_masking false"
rule protein_sequence:
input:
blastx = "/project/owlmayerTemporary/Sid/isoform_analysis/result/{gene}/{gene}_blastx.out",
db = config["human_protein"]
output:
"/project/owlmayerTemporary/Sid/isoform_analysis/result/{gene}/{gene}_blastx_protein.fa"
shell:
"sh protein_transcript_sequences.sh {input.blastx} {input.db} {output}"
rule utr_regions:
input:
config["utr_regions"]
params:
gene = "{gene}"
output:
"/project/owlmayerTemporary/Sid/isoform_analysis/result/{gene}/{gene}_utr_regions.bed"
script:
"utr_regions.py"
rule utr_sequences:
input:
hg = config["human_genome"],
utr = "/project/owlmayerTemporary/Sid/isoform_analysis/result/{gene}/{gene}_utr_regions.bed"
output:
"/project/owlmayerTemporary/Sid/isoform_analysis/result/{gene}/{gene}_utr_regions.fa"
shell:
"bedtools getfasta -fi {input.hg} -bed {input.utr} -fo {output} -name"
#rule transcript_filter_utr:
# input:
# utr = "/project/owlmayerTemporary/Sid/isoform_analysis/result/{gene}/{gene}_utr_regions.fa",
# seq = "/project/owlmayerTemporary/Sid/isoform_analysis/result/{gene}/{gene}_seq.fa"
# output:
# "/project/owlmayerTemporary/Sid/isoform_analysis/result/{gene}/{gene}_seq_filt.fa"
# script:
# "filter_utr.py"
checkpoint mapping:
input:
utr = "/project/owlmayerTemporary/Sid/isoform_analysis/result/{gene}/{gene}_utr_regions.fa",
seq = "/project/owlmayerTemporary/Sid/isoform_analysis/result/{gene}/{gene}_seq.fa"
output:
directory("/project/owlmayerTemporary/Sid/isoform_analysis/result/{gene}/transcripts") #/{transcript}/{transcript}_map_protein.fa"
params:
gene = "{gene}"
script:
"translation_protein.py"
def aggregate_mapping(wildcards):
checkpoint_output = checkpoints.mapping.get(**wildcards).output[0]
return expand("/project/owlmayerTemporary/Sid/isoform_analysis/result/{gene}/transcripts/{transcript}_map_protein.fa",
gene=wildcards.gene,
transcript=glob_wildcards(os.path.join(checkpoint_output,"{transcript}_map_protein.fa")).transcript)
rule aggregate_mapping:
input:
aggregate_mapping
output:
"/project/owlmayerTemporary/Sid/isoform_analysis/result/all/{gene}_protein_sequences.txt"
params:
gene = "{gene}"
shell:
"cat {input} > {output}"
rule iupred2a_analysis:
input:
"/project/owlmayerTemporary/Sid/isoform_analysis/result/{gene}/transcripts/{transcript}_map_protein.fa",
config["iupred2a"]
output:
"/project/owlmayerTemporary/Sid/isoform_analysis/result/{gene}/transcripts/{transcript}_transcript_sequence.txt",
"/project/owlmayerTemporary/Sid/isoform_analysis/result/{gene}/transcripts/{transcript}_map_protein_iupred2a.txt"
script:
"iupred2a_analysis.py"
rule interpro_scan:
input:
"/project/owlmayerTemporary/Sid/isoform_analysis/result/{gene}/transcripts/{transcript}_map_protein.fa"
output:
"/project/owlmayerTemporary/Sid/isoform_analysis/result/{gene}/transcripts/{transcript}_map.gff3"
params:
db = "Pfam,ProDom,Gene3D,CDD,Coils,MobiDBLite,SMART"
shell:
"sh /home/annaldas/my_interproscan/interproscan-5.38-76.0/interproscan.sh -i {input} -o {output} -f GFF3 -appl {params.db} -dra"
rule brewery_analysis:
input:
"/project/owlmayerTemporary/Sid/isoform_analysis/result/{gene}/transcripts/{transcript}_map_protein.fa"
output:
"/project/owlmayerTemporary/Sid/isoform_analysis/result/{gene}/transcripts/{transcript}_map_protein.fa.ss3"
#"/project/owlmayerTemporary/Sid/isoform_analysis/result/{gene}/transcripts/{transcript}_map_protein.fa.ss8"
params:
brewery = config["brewery"]
shell:
"python3 {params.brewery} -i {input} --cpu 32 --noTA --noSA --noCD"
rule functional_site_analysis:
input:
"/project/owlmayerTemporary/Sid/isoform_analysis/result/{gene}/transcripts/{transcript}_map_protein.fa"
output:
"/project/owlmayerTemporary/Sid/isoform_analysis/result/{gene}/transcripts/{transcript}_map_protein_sites.txt"
params:
ps_scan = config["prosite_scan"],
prosite_dat = config["prosite_dat"]
shell:
"perl {params.ps_scan} -d {params.prosite_dat} {input} > {output}"
rule individual_transcript_analysis:
input:
"/project/owlmayerTemporary/Sid/isoform_analysis/result/{gene}/transcripts/{transcript}_map_protein_iupred2a.txt",
"/project/owlmayerTemporary/Sid/isoform_analysis/result/{gene}/transcripts/{transcript}_map.gff3",
"/project/owlmayerTemporary/Sid/isoform_analysis/result/{gene}/transcripts/{transcript}_map_protein.fa.ss3",
"/project/owlmayerTemporary/Sid/isoform_analysis/result/{gene}/transcripts/{transcript}_map_protein_sites.txt"
output:
"/project/owlmayerTemporary/Sid/isoform_analysis/result/{gene}/transcripts/{transcript}_map_protein_analysis.txt"
script:
"transcript_analysis.py"
def aggregate_input(wildcards):
checkpoint_output = checkpoints.mapping.get(**wildcards).output[0]
return expand("/project/owlmayerTemporary/Sid/isoform_analysis/result/{gene}/transcripts/{transcript}_map_protein_analysis.txt",
gene=wildcards.gene,
transcript=glob_wildcards(os.path.join(checkpoint_output,"{transcript}_map_protein.fa")).transcript)
rule aggregate:
input:
aggregate_input
output:
"/project/owlmayerTemporary/Sid/isoform_analysis/result/{gene}/{gene}_transcripts_filtered_analysis.txt"
params:
gene = "{gene}"
shell:
"cat {input} > {output}"
#rule filter_transcripts:
# input:
# "/project/owlmayerTemporary/Sid/isoform_analysis/result/{gene}/{gene}_transcripts_analysis.txt"
# output:
# "/project/owlmayerTemporary/Sid/isoform_analysis/result/{gene}/{gene}_transcripts_filtered_analysis.txt"
# script:
# "filter_transcripts.py"
rule protein_coding_potential_analysis:
input:
"/project/owlmayerTemporary/Sid/isoform_analysis/result/{gene}/{gene}_transcripts_filtered_analysis.txt"
output:
"/project/owlmayerTemporary/Sid/isoform_analysis/result/{gene}/{gene}_transcripts_filtered_coding_potential_analysis.pdf"
params:
gene = "{gene}"
script:
"interproscan_analysis.py"
#rule protein_domain_analysis:
# input:
# "/project/owlmayerTemporary/Sid/isoform_analysis/result/{gene}/transcripts/{transcript}/{transcript}_map.gff3",
# "/project/owlmayerTemporary/Sid/isoform_analysis/result/{gene}/transcripts/{transcript}/{transcript}_map_protein_iupred2a.txt"
# output:
# "/project/owlmayerTemporary/Sid/isoform_analysis/result/{gene}/{gene}_map_protein_analysis.txt"
# params:
# gene = "{gene}"
# script:
# "interproscan_analysis.py"
#rule sashimi_plot:
# input:
# sashimi_sh = "/project/owlmayerTemporary/Sid/isoform_analysis/result/{gene}/{gene}_sashimi.sh",
# sashimi_py = config["sashimi"],
# bams = config["input_bams"],
# gtf = NanoporeGTF
# output:
# "/project/owlmayerTemporary/Sid/isoform_analysis/result/{gene}/{gene}_sashimi.pdf"
# shell:
# "sh {input.sashimi_sh} {input.sashimi_py} {input.bams} {input.gtf} {output}"