Skip to content
Permalink
c1e8495b8e
Switch branches/tags

Name already in use

A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
Go to file
 
 
Cannot retrieve contributors at this time
executable file 88 lines (73 sloc) 2.93 KB
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import pandas as pd
import pybedtools
import matplotlib.pylab as plt
# Strand label used for bookkeeping -> strand symbol found in column 6 of the
# masking file.
STRANDS = {"pos": "+", "neg": "-"}
# Names of the three interval sets tracked per strand; they become the columns
# of the statistics table.
STAGES = ["all", "noPCR", "noPCR_noSI"]
# Values of masking-file column 4 that are tallied. Order matters: intervals
# attributed to one class are subtracted before the next class is evaluated.
CLASSES = ['PolM', 'PolI', 'PolIII', 'snRNA', 'snoRNA', 'miRNA']


def count_fastq_reads(path):
    """Return the number of 4-line records in the text file at *path*.

    Integer division is used so the result can be stored in the integer
    statistics table without a dtype change; a trailing incomplete record
    is ignored.
    """
    with open(path, 'r') as handle:
        n_lines = sum(1 for _ in handle)
    return n_lines // 4


def weighted_length(df):
    """Return the sum of ``name * (end - start)`` over the rows of *df*.

    *df* is expected to carry the ``start``, ``end`` and ``name`` columns
    produced by ``BedTool.to_dataframe()``.
    NOTE(review): ``name`` (BED column 4) is presumably a per-interval read
    count -- confirm against the rule that produces the BED files.

    An empty frame (which may lack those columns altogether) contributes 0.
    """
    if df.empty:
        return 0
    return (df['name'] * (df['end'] - df['start'])).sum()


def extend_masking(masking, downstream):
    """Return a copy of *masking* with intervals extended by *downstream*.

    *masking* is a header-less table whose integer column labels are used
    here as: 0 = chromosome, 1 = start, 2 = end, 5 = strand.

    * '-' strand rows not on 'chrM' get ``start - downstream``, clamped at 0
      because a negative start is not a valid BED coordinate.
    * '+' strand rows get ``end + downstream``.

    NOTE(review): 'chrM' is excluded only for the '-' strand, exactly as in
    the previous version of this script -- confirm the asymmetry is intended.
    """
    out = masking.copy()
    minus = (out[5] == '-') & (out[0] != 'chrM')
    plus = out[5] == '+'
    out[1] = out[1].where(~minus, (out[1] - downstream).clip(lower=0))
    out[2] = out[2].where(~plus, out[2] + downstream)
    return out


def summarise(stats, all_reads):
    """Derive the plotted tables from the per-stage statistics.

    Parameters
    ----------
    stats : DataFrame with columns "all", "noPCR", "noPCR_noSI" and at least
        the rows "unmapped" and "allMapped". It is not modified.
    all_reads : total number of reads, used to fill the "unmapped" row and
        as the denominator for percentages.

    Returns
    -------
    (totals, percents) : two DataFrames with columns "Unique",
        "Splicing Intermediates", "PCR duplicates" and every row of *stats*
        except "allMapped"; *percents* is ``totals * 100 / all_reads``.
    """
    stats = stats.copy()
    stats['PCR duplicates'] = stats['all'] - stats["noPCR"]
    stats['Splicing Intermediates'] = stats['noPCR'] - stats["noPCR_noSI"]
    stats['Unique'] = stats["noPCR_noSI"]
    # Single-step .loc[row, col] writes into the frame itself; the chained
    # form .loc[row][col] may write into a temporary copy instead.
    stats.loc["unmapped", "Unique"] = all_reads - stats.loc["allMapped", "all"]
    totals = stats[["Unique", "Splicing Intermediates", "PCR duplicates"]]
    totals = totals[totals.index != "allMapped"]
    percents = totals * 100 / all_reads
    return totals, percents


def main():
    """Tally mapped signal per masking class and write a table and two plots.

    Reads from the ``snakemake`` object injected by Snakemake:
      input[0]   FASTQ file (used only for the total read count)
      input[1-6] BED files: pos/neg "all", pos/neg "noPCR", pos/neg "noPCR_noSI"
      params[0]  masking file (tab-separated, no header)
      params[1]  chromosome-name prefix identifying spike-ins
      params[2]  number of bases by which masking intervals are extended
      output[0]  totals table (TSV)
      output[1]  stacked bar plot of totals
      output[2]  stacked bar plot of percentages
    """
    fq = snakemake.input[0]
    paths = {
        "pos": {
            "all": snakemake.input[1],
            "noPCR": snakemake.input[3],
            "noPCR_noSI": snakemake.input[5],
        },
        "neg": {
            "all": snakemake.input[2],
            "noPCR": snakemake.input[4],
            "noPCR_noSI": snakemake.input[6],
        },
    }
    mfile = snakemake.params[0]
    prefix = snakemake.params[1]
    downstream = snakemake.params[2]

    beds = {
        strand: {stage: pybedtools.BedTool(path) for stage, path in by_stage.items()}
        for strand, by_stage in paths.items()
    }
    masking = extend_masking(pd.read_csv(mfile, sep='\t', header=None), downstream)
    stats = pd.DataFrame(
        0,
        index=["unmapped", "allMapped"] + CLASSES + ["spikeIns", "informative\nReads"],
        columns=STAGES,
    )
    all_reads = count_fastq_reads(fq)

    # Everything that mapped, before any masking.
    for strand in beds:
        for stage in beds[strand]:
            stats.loc["allMapped", stage] += weighted_length(beds[strand][stage].to_dataframe())

    # Per-class signal. Each class is removed from the interval sets after it
    # has been counted, so later classes only see what is left.
    for cls in CLASSES:
        for strand in beds:
            selected = masking[(masking[3] == cls) & (masking[5] == STRANDS[strand])]
            if selected.empty:
                # Nothing to mask for this class/strand: contributes 0 and
                # leaves the interval sets untouched.
                continue
            # The merged mask depends only on class and strand, so build it
            # once here rather than once per stage.
            masking_bed = pybedtools.BedTool.from_dataframe(selected).merge(
                s=True, c=[4, 5, 6], o='distinct'
            )
            for stage in beds[strand]:
                hits = beds[strand][stage].intersect(masking_bed)
                stats.loc[cls, stage] += weighted_length(hits.to_dataframe())
                beds[strand][stage] = beds[strand][stage].subtract(masking_bed)

    # Whatever survived masking is split into spike-ins (chromosome name
    # starts with the prefix) and the remainder.
    for strand in beds:
        for stage in beds[strand]:
            remaining = beds[strand][stage].to_dataframe()
            if remaining.empty:
                continue
            is_spikein = remaining['chrom'].str.startswith(prefix)
            stats.loc["spikeIns", stage] += weighted_length(remaining[is_spikein])
            stats.loc["informative\nReads", stage] += weighted_length(remaining[~is_spikein])

    totals, percents = summarise(stats, all_reads)
    totals.to_csv(snakemake.output[0], sep='\t')

    totals.plot.bar(stacked=True)
    plt.ylabel("Number of reads")
    plt.tight_layout()
    plt.savefig(snakemake.output[1])

    percents.plot.bar(stacked=True)
    plt.ylabel("% of reads")
    plt.tight_layout()
    plt.savefig(snakemake.output[2])


# Snakemake executes script files as the main module, with the `snakemake`
# object already defined as a global.
if __name__ == "__main__":
    main()