
Commit

first commit
Martyna Gajos committed Apr 6, 2021
0 parents commit 79906d9
Showing 13 changed files with 468,401 additions and 0 deletions.
454,502 changes: 454,502 additions & 0 deletions Data/Regions.bed

Large diffs are not rendered by default.

13,457 changes: 13,457 additions & 0 deletions Data/masked.bed

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions README.md
@@ -0,0 +1 @@
# Pervasive_PolII_pausing
46 changes: 46 additions & 0 deletions Scripts/assignRegion.py
@@ -0,0 +1,46 @@
#!/usr/bin/env python2
# -*- coding: utf-8 -*-
import pandas as pd

def map2Regions(row,df,bed_type=6):
    # keep annotated regions on the same contig (and strand, for 6-column input) that fully contain the peak
    data=df[df[0]==row[0]]
    data=data[data[1]<=row[1]]
    data=data[data[2]>=row[2]]
    if bed_type==6:
        data=data[data[5]==row[5]]
    if data.shape[0]==0:
        return 'intergenic_intergenic'
    elif data.shape[0]==1:
        return data[3].iloc[0]
    else:
        if len(set(data[3].values))==1:
            return 'multiple_'+data[3].iloc[0]
        else:
            return 'multiple_multiple'

df=pd.read_csv(snakemake.input[0],sep='\t',header=None)
bed_type=df.shape[1]
df['seqname']=df[0]
df['source']='peakCalling'
df['feature']='peak'
df['start']=df[1]+1
df['end']=df[2]
df['frame']=0
df['score']=(df[4] if bed_type>=6 else '.')
df['strand']=(df[5] if bed_type>=6 else '.')

if snakemake.params[0]=='':
    df['attribute']='.'
else:
    afile=pd.read_csv(snakemake.params[0],sep='\t',header=None)
    df['attribute']=df.apply(lambda x: map2Regions(x,afile,bed_type),1)

bed=df[['seqname',1,'end','attribute','score','strand']].copy()
bed.to_csv(snakemake.output[1],header=None,index=None,sep='\t')
bed['region']=df.apply(lambda x: x['attribute'].split('_')[1],1)
bed=bed[['seqname',1,'end','region','score','strand']]
bed.to_csv(snakemake.output[2],header=None,index=None,sep='\t')

df=df[['seqname','source','feature','start','end','score','strand','frame','attribute']]
df.to_csv(snakemake.output[0],header=None,index=None,sep='\t')
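The containment test in map2Regions can be sketched on toy data; the peak coordinates and the GENE1_PP / GENE1_GB attributes below are invented for illustration and are not part of the repository:

import pandas as pd

# a single 6-column peak row and a 6-column regions BED (hypothetical values)
peak = pd.Series({0: 'chr1', 1: 1000, 2: 1001, 5: '+'})
regions = pd.DataFrame([['chr1', 900, 2000, 'GENE1_PP', 0, '+'],
                        ['chr1', 500, 5000, 'GENE1_GB', 0, '+']])

hits = regions[(regions[0] == peak[0]) &       # same contig
               (regions[1] <= peak[1]) &       # region starts before the peak
               (regions[2] >= peak[2]) &       # region ends after the peak
               (regions[5] == peak[5])]        # same strand (6-column input)
print(hits[3].tolist())   # ['GENE1_PP', 'GENE1_GB'] -> two region types, so the peak is labelled 'multiple_multiple'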

105 changes: 105 additions & 0 deletions Scripts/createPausingStats.py
@@ -0,0 +1,105 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import os
import argparse
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

def getarg():
    parser = argparse.ArgumentParser(description = "Plot the distribution of Pol II pausing sites across annotated region types")
    parser.add_argument("-f", "--File", help = "GFF file with pausing sites. Example: /path/*.gff", required = True)
    parser.add_argument("-o", "--Output", help = "Path to output plot. Example: /path/*.pdf", required = False, default = 'PausingDistribution.pdf')
    parser.add_argument("-a", "--Annotation", help = "GFF file with gene annotation. Example: /path/*.gff", required = True)
    parser.add_argument("-i", "--Isoforms", help = "Path to RSEM isoform file results (directory). Example: /path/", required = False, default = False)
    arg = parser.parse_args()
    return arg.File, arg.Output, arg.Annotation, arg.Isoforms

def returnExons(afile, activeTs=[]):
    # collect (start, end) exon coordinates per gene, optionally restricted to a set of active transcripts
    d={}
    with open(afile,'r') as af:
        for linia in af:
            if linia[0]!='#':
                ln=linia.strip().split()
                if ln[2]=='exon':
                    if (ln[8].split('ID=exon:')[1].split(':')[0] in activeTs) or (len(activeTs)==0):
                        gene_id=ln[8].split('gene_id=')[1].split(';')[0]
                        try:
                            d[gene_id].append((int(ln[3]),int(ln[4])))
                        except KeyError:
                            d[gene_id]=[(int(ln[3]),int(ln[4]))]
    return d

def isExon(row,dic):
    reg="intron"
    if row["gene"] in dic:
        for tup in dic[row["gene"]]:
            if tup[0]<=row["start"]<=tup[1]:
                reg="exon"
                break
    return reg

inputfile, outputfile, annotation, isoforms = getarg()
df=pd.read_csv(inputfile,names=['start','score','attribute'],usecols=[3,5,8],sep='\t')
df['gene']=df.apply(lambda x: x['attribute'].split('_')[0],1)
df['region']=df.apply(lambda x: x['attribute'].split('_')[1],1)
df=df[~(df["region"].isin(["OP","NC","RNA"]))]
df=df.replace({'GB':'gene\nbody','PP':'promoter\nproximal',
'CA':'convergent\nantisense','DA':'divergent\nantisense','AS':'antisense',
'multiple':'undetermined','TW':'termination\nwindow'})
order=['gene\nbody', 'intergenic', 'convergent\nantisense','promoter\nproximal',
'divergent\nantisense', 'antisense', 'undetermined', 'termination\nwindow']
colors={}
for i in range(len(order)):
    colors[order[i]]=sns.husl_palette(len(order), s=0.7)[i]

#intron/exon
if isoforms:
    pct=10
    l=[]
    for fn in os.listdir(isoforms+'data/'):
        if fn.endswith('.isoforms.results'):
            sample=fn.split('.')[0]
            data=pd.read_csv(isoforms+'data/'+sample+'.isoforms.results',sep='\t',usecols=[0,1,7])
            data=data.set_index(['transcript_id','gene_id'])
            data=data.rename({"IsoPct":"P_"+sample},axis='columns')
            l.append(data)
    data=l[0]
    for i in range(1,len(l)):
        data=data.join(l[i],how="outer")
    data=data.fillna(0)
    data=data[data.mean(axis=1)>pct]
    activeTranscripts=set([i[0] for i in data.index.values])
    del(data)
    exons=returnExons(annotation,activeTranscripts)
else:
    exons=returnExons(annotation)

gb=df[df["region"]=="gene\nbody"].copy()
gb["part"]=gb.apply(lambda x: isExon(x,exons),1)

cap='number of pausing sites per region type'
fig, ax = plt.subplots(figsize=(5,6))
cur_axes = plt.gca()
ax.spines['bottom'].set_visible(False)
ax.spines['right'].set_visible(False)
ax.xaxis.tick_top()
ax.xaxis.set_label_position('top')
percentage=(df['region'].value_counts()/df['region'].value_counts().sum()*100).to_frame().sort_values(by='region',ascending=False)
sns.countplot(y="region",data=df,order = percentage.index,palette = [colors[i] for i in percentage.index])
ax.set_xlabel(cap)
for i in range(0,percentage.shape[0]):
    p=ax.patches[i]
    h=percentage.iloc[i]['region']
    ax.annotate('{:.1f}%'.format(h), (p.get_width()+10, p.get_y()-0.5*p.get_height()-0.15+1))
plt.subplots_adjust(left=0.27, right=0.9, top=0.9, bottom=0.05)
gb_index=list(percentage.index).index('gene\nbody')
rect = plt.Rectangle((ax.patches[gb_index].get_x(),ax.patches[gb_index].get_y()), gb.groupby("part").count()["region"]["exon"], ax.patches[gb_index].get_height(), color='k', alpha=0.3)
ax.add_patch(rect)
ax.annotate('E', (gb.groupby("part").count()["region"]["exon"]*0.5, ax.patches[gb_index].get_y()-0.5*ax.patches[gb_index].get_height()-0.15+1),color='white')
ax.annotate('I', (gb.groupby("part").count()["region"]["exon"]+gb.groupby("part").count()["region"]["intron"]*0.5, ax.patches[gb_index].get_y()-0.5*ax.patches[gb_index].get_height()-0.15+1),color='white')
plt.savefig(outputfile)
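The Isoforms option filters the annotation down to transcripts that are actually expressed before gene-body sites are classified as exonic or intronic. A minimal sketch of that filter, assuming two hypothetical RSEM replicate tables (rep1/rep2 and the IsoPct values are invented):

import pandas as pd

# hypothetical RSEM isoform tables from two replicates (only the columns the script reads)
rep1 = pd.DataFrame({'transcript_id': ['T1', 'T2'], 'gene_id': ['G1', 'G1'],
                     'IsoPct': [80.0, 8.0]}).set_index(['transcript_id', 'gene_id'])
rep2 = pd.DataFrame({'transcript_id': ['T1', 'T2'], 'gene_id': ['G1', 'G1'],
                     'IsoPct': [70.0, 5.0]}).set_index(['transcript_id', 'gene_id'])

data = rep1.rename(columns={'IsoPct': 'P_rep1'}).join(
       rep2.rename(columns={'IsoPct': 'P_rep2'}), how='outer').fillna(0)
active = data[data.mean(axis=1) > 10]        # keep isoforms above 10% on average (pct=10)
print({i[0] for i in active.index})          # {'T1'} -- only T1 counts as an active transcript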
14 changes: 14 additions & 0 deletions Scripts/filterPausing.py
@@ -0,0 +1,14 @@
#!/usr/bin/env python2
# -*- coding: utf-8 -*-
import os
import pandas as pd

dfs=[]
for fn in snakemake.input:
    if os.stat(fn).st_size == 0:
        continue
    df=pd.read_csv(fn,delimiter='\t',header=None)
    dfs.append(df)
df=pd.concat(dfs,ignore_index=True).sort_values(by=[0,1])
df.to_csv(snakemake.output[1],sep='\t',index=False,header=False)
df[df[4]>=snakemake.params[0]].to_csv(snakemake.output[0],sep='\t',index=False,header=False)
46 changes: 46 additions & 0 deletions Scripts/getPausing.py
@@ -0,0 +1,46 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import os
import pandas as pd
import numpy as np

if os.stat(snakemake.input[0]).st_size == 0:
    # create an empty output file and stop if there is no signal on this contig/strand
    with open(snakemake.output[0],'w') as f:
        exit()

strands={'pos':'+',
'neg':'-'}
try:
    strand=strands[snakemake.params[4]]
except KeyError:
    strand=snakemake.params[4]

df=pd.read_csv(snakemake.input[0],sep='\t',header=None)

flanks=(snakemake.params[0]-1)//2
pos_0=df[1].min()
signal_len=df[2].max()-pos_0

signal=np.zeros(signal_len,dtype=int)
for row in df.iterrows():
    start=row[1][1]-pos_0
    stop=row[1][2]-pos_0
    value=row[1][3]
    for i in range(start,stop):
        signal[i]=value

# strict local maxima: greater than the following position ...
max_left=np.sign(signal-np.pad(signal,1,'constant')[2:])
# ... and greater than the preceding position
max_right=np.sign(signal-np.pad(signal,1,'constant')[:-2])

# ... and above the minimum-coverage cutoff
threshold=signal-snakemake.params[1]

# summed coverage and number of covered positions within +/- flanks of each position
window=np.ones(flanks*2+1)
surroundings=np.convolve(signal,window,'same')
nonzero=np.convolve((signal>0)*1,window,'same')

potential_max=max_left*max_right*threshold
potential_max_positions=np.where(potential_max>0)[0]
with open(snakemake.output[0],'w') as f:
    for i in potential_max_positions:
        if max_left[i]>0 and max_right[i]>0 and threshold[i]>0:
            f.write(snakemake.params[3]+'\t'+str(i+pos_0)+'\t'+str(i+pos_0+1)+'\t'+str(signal[i])+'\t'+str(int(surroundings[i]))+'\t'+strand+'\t'+str(int(nonzero[i]))+'\n')
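A small numeric sketch of the peak-detection step above, with an invented coverage track and cutoffs standing in for snakemake.params:

import numpy as np

# invented per-base coverage and parameters (stand-ins for snakemake.params)
signal = np.array([0, 2, 5, 3, 0, 0, 4, 4, 1])
min_coverage = 2                                  # snakemake.params[1]
flanks = 1                                        # derived from snakemake.params[0]

max_left  = np.sign(signal - np.pad(signal, 1, 'constant')[2:])    # > 0: above the next position
max_right = np.sign(signal - np.pad(signal, 1, 'constant')[:-2])   # > 0: above the previous position
threshold = signal - min_coverage

window = np.ones(flanks * 2 + 1)
surroundings = np.convolve(signal, window, 'same')                 # window coverage
nonzero = np.convolve((signal > 0) * 1, window, 'same')            # covered positions in the window

candidates = np.where(max_left * max_right * threshold > 0)[0]
peaks = [i for i in candidates if max_left[i] > 0 and max_right[i] > 0 and threshold[i] > 0]
print(peaks)                                      # [2] -- the plateau at 4,4 is not a strict maximum
print(int(surroundings[2]), int(nonzero[2]))      # 10 3

The explicit re-check inside the loop mirrors the script: the sign product can also be positive when exactly two of the three factors are negative, so each factor is tested again before a site is written.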
13 changes: 13 additions & 0 deletions Scripts/listContigs.py
@@ -0,0 +1,13 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import pandas as pd

contigs=set()
for fn in snakemake.input:
    df=pd.read_csv(fn, delimiter='\t', usecols=[0],header=None)[0].values
    contigs=contigs.union(set(df))
contigs=sorted(contigs)
with open(snakemake.output[0],'w') as out:
    out.writelines(["%s\n" % item for item in contigs])

18 changes: 18 additions & 0 deletions Scripts/maskRegion.py
@@ -0,0 +1,18 @@
#!/usr/bin/env python2
# -*- coding: utf-8 -*-
import os
import pandas as pd
import pybedtools

if os.stat(snakemake.input[0]).st_size == 0:
    # create an empty output file and stop if there are no pausing sites
    with open(snakemake.output[0],'w') as f:
        exit()

ps = pybedtools.BedTool(snakemake.input[0])
if snakemake.params[0]=='':
    df=ps
else:
    mf = pybedtools.BedTool(snakemake.params[0])
    df = ps.subtract(mf, s=True)
df.moveto(snakemake.output[0])
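A minimal strand-aware subtraction sketch with pybedtools (the intervals are invented, and bedtools must be installed):

import pybedtools

# invented pausing sites and a masked region on the same strand
ps = pybedtools.BedTool('chr1 100 101 site1 5 +\nchr1 500 501 site2 3 +', from_string=True)
mask = pybedtools.BedTool('chr1 90 200 masked 0 +', from_string=True)

kept = ps.subtract(mask, s=True)   # s=True: only subtract features on the same strand
print(kept)                        # site2 is the only interval left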

35 changes: 35 additions & 0 deletions Scripts/testPausing.py
@@ -0,0 +1,35 @@
#!/usr/bin/env python2
# -*- coding: utf-8 -*-
import os
import pandas as pd
import numpy as np
import scipy.stats as st

def calculatePercentile(peak,reads,positions,bootstrap_N):
    # scatter `reads` reads uniformly over `positions` covered positions, bootstrap_N times,
    # and record the highest per-position read count of each replicate
    bootstrap_table=np.zeros((bootstrap_N,reads),dtype=int)
    for i in range(bootstrap_N):
        bootstrap_table[i]=np.random.randint(positions,size=reads)
    maxi=np.apply_along_axis(getMax,1,bootstrap_table)
    # percentile of the observed peak height among the bootstrap maxima
    return st.percentileofscore(maxi,peak,'mean')

def getMax(x):
    return max(np.bincount(x))

strands={'pos':'+',
'neg':'-'}

if os.stat(snakemake.input[0]).st_size == 0:
    with open(snakemake.output[0],'w') as f:
        exit()

df=pd.read_csv(snakemake.input[0],delimiter='\t',header=None)
df['strand']=strands[snakemake.params[2]] if snakemake.params[2] in strands else '.'

if snakemake.params[3]=='poisson':
    # Poisson model: expected reads per position = window coverage / covered positions
    df['score']=df.apply(lambda x: st.poisson(x[4]*1./x[6]).cdf(x[3])*100,1)
else:
    # empirical bootstrap percentile (slower)
    df['score']=df.apply(lambda x: calculatePercentile(x[3],x[4],x[6],snakemake.params[0]),1)
df=df[[0,1,2,3,'score','strand']]
df=df.sort_values(by=[0,1])
df.to_csv(snakemake.output[0],header=None,index=None,sep='\t')
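A sketch of the score for a single site, with invented peak, window and bootstrap settings in place of the columns and snakemake.params used above:

import numpy as np
import scipy.stats as st

# invented values: peak height 6, 20 reads over 15 covered window positions, 1000 replicates
peak, reads, positions, bootstrap_N = 6, 20, 15, 1000

np.random.seed(0)
table = np.random.randint(positions, size=(bootstrap_N, reads))
maxima = np.array([np.bincount(row).max() for row in table])

# empirical percentile of the observed peak among bootstrap maxima (the script's score column)
print(st.percentileofscore(maxima, peak, 'mean'))

# closed-form alternative used with the 'poisson' option: expected reads per position = reads/positions
print(st.poisson(reads * 1. / positions).cdf(peak) * 100)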
35 changes: 35 additions & 0 deletions Scripts/testPausing_nb.py
@@ -0,0 +1,35 @@
#!/usr/bin/env python2
# -*- coding: utf-8 -*-
import os
import pandas as pd
import numpy as np
import scipy.stats as st

def calculatePercentile(peak,reads,positions,bootstrap_N):
    bootstrap_table=np.zeros((bootstrap_N,reads),dtype=int)
    for i in range(bootstrap_N):
        bootstrap_table[i]=np.random.randint(positions,size=reads)
    maxi=np.apply_along_axis(getMax,1,bootstrap_table)
    return st.percentileofscore(maxi,peak,'mean')

def getMax(x):
    return max(np.bincount(x))

def getPercentileNB(peak,reads,positions):
    # closed-form negative binomial alternative to the bootstrap (not used below)
    return st.nbinom.cdf(peak,reads,1-1./positions)

strands={'pos':'+',
'neg':'-'}

if os.stat(snakemake.input[0]).st_size == 0:
    with open(snakemake.output[0],'w') as f:
        exit()

df=pd.read_csv(snakemake.input[0],delimiter='\t',header=None)
df['strand']=strands[snakemake.params[2]] if snakemake.params[2] in strands else '.'

df['score']=df.apply(lambda x: calculatePercentile(x[3],x[4],x[6],snakemake.params[0]),1)
df=df[[0,1,2,3,'score','strand']]
df=df.sort_values(by=[0,1])
df.to_csv(snakemake.output[0],header=None,index=None,sep='\t')
