Skip to content

Commit

Permalink
Updating scripts
Browse files Browse the repository at this point in the history
  • Loading branch information
Siddharth Annaldasula committed Dec 10, 2019
1 parent 010221e commit 3ac2192
Show file tree
Hide file tree
Showing 4 changed files with 606 additions and 227 deletions.
76 changes: 65 additions & 11 deletions Snakefile
Original file line number Diff line number Diff line change
@@ -1,40 +1,83 @@
import pandas as pd

configfile: "config.yaml"

gene_file = config["gene_file"]

GENES = pd.read_table(gene_file)["gene_symbol"]
NanoporeGTF = config["nanopore_gtf"]
Transcripts = config["polished_reads"]

rule all:
input:
db = "/project/owlmayerTemporary/Sid/blast/test/human.protein.fa"
expand("/home/annaldas/projects/result/{gene}/{gene}_map_protein_analysis.txt", gene = GENES),
expand("/home/annaldas/projects/result/{gene}/{gene}_blastx_protein_analysis.txt", gene = GENES)
#expand("/home/annaldas/projects/result/{gene}/{gene}_sashimi.pdf", gene = GENES)

rule gene_transcript:
input:
"/project/owlmayerTemporary/Sid/Nanopore_Results_Strict/Results/GffCompare/nanopore.combined.gtf",
"/project/owlmayerTemporary/Sid/Nanopore_Results_Strict/Results/Pinfish/corrected_transcriptome_polished_collapsed.fas"
NanoporeGTF,
Transcripts
output:
"/home/annaldas/projects/result/{gene}/{gene}_seq.fa"
"/home/annaldas/projects/result/{gene}/{gene}_seq.fa",
"/home/annaldas/projects/result/{gene}/{gene}_sashimi.sh"
params:
gene = "{gene}"
script:
"gene_transcripts.py"

rule blastx:
input:
"/home/annaldas/projects/result/{gene}/{gene}_seq.fa"
gene_fa = "/home/annaldas/projects/result/{gene}/{gene}_seq.fa",
db = config["human_protein"]
output:
"/home/annaldas/projects/result/{gene}/{gene}_blastx.out"
threads:
4
shell:
"/home/annaldas/ncbi-blast-2.9.0+/bin/blastx -query {input} -db {rules.all.input.db} -out {output} -evalue 1e-5 -max_target_seqs 1 -max_hsps 1 -outfmt '6 qseqid sseqid evalue' -num_threads {threads} -soft_masking false"
"/home/annaldas/ncbi-blast-2.9.0+/bin/blastx -query {input.gene_fa} -db {input.db} -out {output} -evalue 1e-5 -max_target_seqs 1 -max_hsps 1 -outfmt '6 qseqid sseqid evalue' -num_threads {threads} -soft_masking false"


rule protein_sequence:
input:
"/home/annaldas/projects/result/{gene}/{gene}_blastx.out"
blastx = "/home/annaldas/projects/result/{gene}/{gene}_blastx.out",
db = config["human_protein"]
output:
"/home/annaldas/projects/result/{gene}/{gene}_blastx_protein.fa"
shell:
"sh protein_transcript_sequences.sh {input} {rules.all.input.db} {output}"
"sh protein_transcript_sequences.sh {input.blastx} {input.db} {output}"

rule utr_regions:
input:
config["utr_regions"]
params:
gene = "{gene}"
output:
"/home/annaldas/projects/result/{gene}/{gene}_utr_regions.bed"
script:
"utr_regions.py"

rule utr_sequences:
input:
hg = config["human_genome"],
utr = "/home/annaldas/projects/result/{gene}/{gene}_utr_regions.bed"
output:
"/home/annaldas/projects/result/{gene}/{gene}_utr_regions.fa"
shell:
"bedtools getfasta -fi {input.hg} -bed {input.utr} -fo {output} -name"

rule transcript_filter_utr:
input:
utr = "/home/annaldas/projects/result/{gene}/{gene}_utr_regions.fa",
seq = "/home/annaldas/projects/result/{gene}/{gene}_seq.fa"
output:
"/home/annaldas/projects/result/{gene}/{gene}_seq_filt.fa"
script:
"filter_utr.py"

rule mapping:
input:
"/home/annaldas/projects/result/{gene}/{gene}_seq.fa"
"/home/annaldas/projects/result/{gene}/{gene}_seq_filt.fa"
output:
"/home/annaldas/projects/result/{gene}/{gene}_map_protein.fa"
script:
Expand All @@ -46,7 +89,7 @@ rule interpro_scan:
output:
"/home/annaldas/projects/result/{gene}/{gene}_{type}.gff3"
params:
db = "Pfam,ProDom,Gene3D"
db = "Pfam,ProDom,Gene3D,CDD,Coils,MobiDBLite"
shell:
"sh /home/annaldas/my_interproscan/interproscan-5.38-76.0/interproscan.sh -i {input} -o {output} -f GFF3 -appl {params.db} -dra"

Expand All @@ -58,4 +101,15 @@ rule protein_domain_analysis:
params:
gene = "{gene}"
script:
"interproscan_analysis.py"
"interproscan_analysis.py"

rule sashimi_plot:
input:
sashimi_sh = "/home/annaldas/projects/result/{gene}/{gene}_sashimi.sh",
sashimi_py = config["sashimi"],
bams = config["input_bams"],
gtf = NanoporeGTF
output:
"/home/annaldas/projects/result/{gene}/{gene}_sashimi.pdf"
shell:
"sh {input.sashimi_sh} {input.sashimi_py} {input.bams} {input.gtf} {output}"
34 changes: 30 additions & 4 deletions gene_transcripts.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,9 @@
annotate_df = pd.read_csv(annotation_filename,sep = "\t", header = None)
annotate_df = annotate_df[annotate_df[2] != "exon"]
annotate_lines = list(annotate_df[8])
chrms = list(annotate_df[0])
start = list(annotate_df[3])
stop = list(annotate_df[4])


# Mapping gene name to oID
Expand All @@ -19,20 +22,31 @@

gene_oID = dict()
oID_tID = dict()
gene_pos = dict()
tID_pos = dict()
#tID_exon = dict()

for ann in annotate_lines:
if "gene_name" in ann:
line = ann.split(";")
for ann in range(len(annotate_lines)):
if "gene_name" in annotate_lines[ann]:
line = annotate_lines[ann].split(";")
tID = line[0].split(" ")[-1][1:-1]
gene = line[2].split(" ")[-1][1:-1]
oID = line[3].split(" ")[-1][1:-1]
transID = line[4].split(" ")[-1][1:-1].split(".")[0]

if (gene not in gene_oID): gene_oID[gene] = [oID]
else: gene_oID[gene].append(oID)

if (gene not in gene_pos):
gene_pos[gene] = [chrms[ann],start[ann],stop[ann]]
else:
if (start[ann] < gene_pos[gene][1]): gene_pos[gene][1] = start[ann]
if (gene_pos[gene][2] < stop[ann]) : gene_pos[gene][2] = stop[ann]

if (oID not in oID_tID): oID_tID[oID] = tID

if (tID not in tID_pos): tID_pos[tID] = transID#,chrms[ann],start[ann],stop[ann]]


# Import transcript isoform sequences

Expand All @@ -45,7 +59,9 @@
output = []
gene = snakemake.params[0].upper()
for oID in gene_oID[gene]:
tID = ">" + oID_tID[transcripts[oID].id]
tID = oID_tID[transcripts[oID].id]
transID = tID_pos[tID]
tID = ">" + tID + "|" + transID #+ "," + chrm + "," + str(start) + "," + str(stop)
output.append(tID)
seq = str(transcripts[oID].seq)
output.append(seq)
Expand All @@ -55,4 +71,14 @@
output_file.write("\n".join(output))
output_file.close()

# Create sashmimi sh

output_filename = snakemake.output[1]
output_file = open(output_filename,"w+")

chrm,start,stop = gene_pos[gene][0],gene_pos[gene][1],gene_pos[gene][2]
binbash = "#!/bin/bash"
sashimi = "python $1 -b $2 -c %s:%d-%d -g $3 -M 10 -C 3 -O 3 --shrink --alpha 1 --base-size=20 --ann-height=5 --height=7 --width=18 -S both -o $4" %(chrm,start,stop)
output_file.write("\n".join([binbash,sashimi]))
output_file.close()

Loading

0 comments on commit 3ac2192

Please sign in to comment.