# gene_transcripts.py

import pandas as pd
import numpy as np
from Bio import SeqIO


# Import the nanopore annotation file

annotation_filename = snakemake.input[0]
# The annotation is read as a headerless, tab-separated table, so columns are
# addressed by integer position below.
annotate_df = pd.read_csv(annotation_filename, sep="\t", header=None)
# Keep only the rows whose third column (index 2) is not "exon".
annotate_df = annotate_df[annotate_df[2] != "exon"]
chrms = list(annotate_df[0])
start = list(annotate_df[3])
stop = list(annotate_df[4])
annotate_lines = list(annotate_df[8])


# Lookup tables filled from the attribute column (index 8):
#   gene_oID : gene name -> list of oIDs (one entry per matching row)
#   oID_tID  : oID -> transcript id (first occurrence wins)
#   gene_pos : gene name -> [chromosome, smallest start, largest stop]
#   tID_pos  : transcript id -> fifth attribute value, cut at its first "."

gene_oID = {}
oID_tID = {}
gene_pos = {}
tID_pos = {}
#tID_exon = dict()


def _attr_value(field):
    """Return the last space-separated token of *field* minus its first and last character."""
    # NOTE(review): the stripped characters are presumably the surrounding
    # double quotes of a GTF attribute value -- confirm against the input.
    return field.split(" ")[-1][1:-1]


for attributes, chrom, first, last in zip(annotate_lines, chrms, start, stop):
    # Rows without a gene name cannot be assigned to a gene; skip them.
    if "gene_name" not in attributes:
        continue

    # Attributes are read by position in the ";"-separated list, so the
    # order of the attributes in the annotation file matters.
    fields = attributes.split(";")
    tID = _attr_value(fields[0])
    gene = _attr_value(fields[2])
    oID = _attr_value(fields[3])
    transID = _attr_value(fields[4]).split(".")[0]

    gene_oID.setdefault(gene, []).append(oID)

    # Grow the gene interval so it covers every row seen for this gene.
    span = gene_pos.setdefault(gene, [chrom, first, last])
    if first < span[1]:
        span[1] = first
    if span[2] < last:
        span[2] = last

    # setdefault keeps the value from the first row that mentions the key.
    oID_tID.setdefault(oID, tID)
    tID_pos.setdefault(tID, transID)  # ,chrms[ann],start[ann],stop[ann]]


# Import transcript isoform sequences

transcripts_filename = snakemake.input[1]
# Index the FASTA file so records can be fetched by key, dictionary-style.
transcripts = SeqIO.index(transcripts_filename, "fasta")


# Extracting isoforms from related genes
#
# For every oID recorded for the requested gene, emit two lines: a FASTA
# header of the form ">tID|transID" followed by the record's sequence.
# A gene that is absent from gene_oID raises KeyError here.

gene = snakemake.params[0]
output = []
for oID in gene_oID[gene]:
    # Fetch the record once and reuse it for both the id and the sequence.
    record = transcripts[oID]
    tID = oID_tID[record.id]
    output.append(">" + tID + "|" + tID_pos[tID])  # + "," + chrm + "," + str(start) + "," + str(stop)
    output.append(str(record.seq))

output_filename = snakemake.output[0]
# The context manager closes the file even if the write raises.
# As before, lines are joined with "\n" and no trailing newline is written.
with open(output_filename, "w") as output_file:
    output_file.write("\n".join(output))

# Create the sashimi plot shell script (currently disabled)

#output_filename = snakemake.output[1]
#output_file = open(output_filename,"w+")

#chrm,start,stop = gene_pos[gene][0],gene_pos[gene][1],gene_pos[gene][2]
#binbash = "#!/bin/bash"
#sashimi = "python $1 -b $2 -c %s:%d-%d -g $3 -M 10 -C 3 -O 3 --shrink --alpha 1 --base-size=20 --ann-height=5 --height=7 --width=18 -S both -o $4" %(chrm,start,stop)
#output_file.write("\n".join([binbash,sashimi]))
#output_file.close()
	import pandas as pd
	import numpy as np
	from Bio import SeqIO


	# Import the nanopore annotation file

	annotation_filename = snakemake.input[0]
	# The annotation is read as a headerless, tab-separated table, so columns
	# are addressed by integer position below.
	annotate_df = pd.read_csv(annotation_filename, sep="\t", header=None)
	# Keep only the rows whose third column (index 2) is not "exon".
	annotate_df = annotate_df[annotate_df[2] != "exon"]
	annotate_lines = list(annotate_df[8])
	chrms = list(annotate_df[0])
	start = list(annotate_df[3])
	stop = list(annotate_df[4])


	# Lookup tables filled from the attribute column (index 8):
	#   gene_oID : gene name -> list of oIDs (one entry per matching row)
	#   oID_tID  : oID -> transcript id (first occurrence wins)
	#   gene_pos : gene name -> [chromosome, smallest start, largest stop]
	#   tID_pos  : transcript id -> fifth attribute value, cut at its first "."

	gene_oID = dict()
	oID_tID = dict()
	gene_pos = dict()
	tID_pos = dict()
	#tID_exon = dict()

	for attributes, chrom, first, last in zip(annotate_lines, chrms, start, stop):
		# Rows without a gene name cannot be assigned to a gene; skip them.
		if "gene_name" not in attributes:
			continue

		# Attributes are read by position in the ";"-separated list. Each
		# value is the last space-separated token minus its first and last
		# character (presumably the surrounding quotes -- confirm).
		line = attributes.split(";")
		tID = line[0].split(" ")[-1][1:-1]
		gene = line[2].split(" ")[-1][1:-1]
		oID = line[3].split(" ")[-1][1:-1]
		transID = line[4].split(" ")[-1][1:-1].split(".")[0]

		gene_oID.setdefault(gene, []).append(oID)

		# Grow the gene interval so it covers every row seen for this gene.
		if gene not in gene_pos:
			gene_pos[gene] = [chrom, first, last]
		else:
			if first < gene_pos[gene][1]:
				gene_pos[gene][1] = first
			if gene_pos[gene][2] < last:
				gene_pos[gene][2] = last

		# Keep the value from the first row that mentions each key.
		if oID not in oID_tID:
			oID_tID[oID] = tID

		if tID not in tID_pos:
			tID_pos[tID] = transID  # ,chrms[ann],start[ann],stop[ann]]


	# Import transcript isoform sequences

	transcripts_filename = snakemake.input[1]
	# Index the FASTA file so records can be fetched by key, dictionary-style.
	transcripts = SeqIO.index(transcripts_filename, "fasta")


	# Extracting isoforms from related genes
	#
	# For every oID recorded for the requested gene, emit two lines: a FASTA
	# header of the form ">tID|transID" followed by the record's sequence.
	# A gene that is absent from gene_oID raises KeyError here.

	output = []
	gene = snakemake.params[0]
	for oID in gene_oID[gene]:
		# Fetch the record once and reuse it for both the id and the sequence.
		record = transcripts[oID]
		tID = oID_tID[record.id]
		transID = tID_pos[tID]
		# Plain "|" separator: "\|" is not a valid escape and would put a
		# literal backslash into every header.
		tID = ">" + tID + "|" + transID  # + "," + chrm + "," + str(start) + "," + str(stop)
		output.append(tID)
		output.append(str(record.seq))

	output_filename = snakemake.output[0]
	# The context manager closes the file even if the write raises.
	# Lines are joined with "\n" and no trailing newline is written.
	with open(output_filename, "w") as output_file:
		output_file.write("\n".join(output))

	# Create the sashimi plot shell script (currently disabled)

	#output_filename = snakemake.output[1]
	#output_file = open(output_filename,"w+")

	#chrm,start,stop = gene_pos[gene][0],gene_pos[gene][1],gene_pos[gene][2]
	#binbash = "#!/bin/bash"
	#sashimi = "python $1 -b $2 -c %s:%d-%d -g $3 -M 10 -C 3 -O 3 --shrink --alpha 1 --base-size=20 --ann-height=5 --height=7 --width=18 -S both -o $4" %(chrm,start,stop)
	#output_file.write("\n".join([binbash,sashimi]))
	#output_file.close()