Skip to content
Permalink
master
Switch branches/tags

Name already in use

A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
Go to file
 
 
Cannot retrieve contributors at this time
import csv
import numpy as np
import joblib
import os
import pandas as pd
import dnn.metaData as metaData
import Bio.SeqIO as sio
from Bio.Alphabet import IUPAC
from Bio.Seq import Seq
# get dream data directory
datadir=metaData.datadir
# number of training batches
nbatches=metaData.nbatches
# load tfnames, celltypes, thresholds, replicates
tf2i=metaData.loadMetaDataMap(datadir + "annotations/tf_names.txt")
cell2i=metaData.loadMetaDataMap(datadir + "annotations/cells.txt")
ensid2i=metaData.loadMetaDataMap(datadir + "annotations/genelist.txt")
threshold=metaData.threshold
replicates=metaData.replicates
tfid, tfid2name =metaData.loadTfIds()
ngenes=len(ensid2i)
#read RNA data
# make a numpy 3D array with Ngenes x Ncells x Nrep
totalrna=np.empty([ngenes,len(cell2i),len(replicates)],dtype="float32")
tfrna=np.empty([len(tf2i),len(cell2i),len(replicates)],dtype="float32")
for kreplicate in replicates:
for kcell in cell2i:
print("rna processing " + kcell+":"+kreplicate)
i=0
tsv= open(datadir+"RNAseq/gene_expression."+kcell+"."+kreplicate+".tsv","r")
reader=csv.reader(tsv, delimiter="\t")
header= reader.next()
for line in reader:
val=float(line[5])
totalrna[i,cell2i[kcell],replicates[kreplicate]]=val
i=i+1
if line[0] in tfid:
tfrna[tf2i[tfid2name[line[0]]],cell2i[kcell],replicates[kreplicate]]=val
#store rna-seqt
if not os.path.exists( datadir + "train_input/rna/"):
os.makedirs(datadir + "train_input/rna/")
joblib.dump(totalrna, datadir + "train_input/rna/rna.pkl",protocol=2)
joblib.dump(tfrna, datadir + "train_input/rna/tfrna.pkl",protocol=2)
del totalrna
del tfrna
# count number of valid ranges
#del ranges
# read DNase data
# make a numpy 3D array with Nregions x Ncells x Nthreshold
for dataset in ["train","test","ladder"]:
ranges=metaData.loadValidRanges(dataset)
nranges=len(ranges)
dhs=np.empty([nranges,len(cell2i),len(threshold)],dtype="int8")
for kthreshold in threshold:
for kcell in cell2i:
print("dhs processing " + kcell+":"+kthreshold)
i=0
with open(datadir+"tmp/"+dataset+"/dhs."+kcell+"."+kthreshold+".tsv") as tsv:
for line in csv.reader(tsv, delimiter="\t"):
val=int(line[3])
if val>0:
dhs[i,cell2i[kcell],threshold[kthreshold]]=1
else:
dhs[i,cell2i[kcell],threshold[kthreshold]]=0
i=i+1
#
if not os.path.exists( datadir + dataset+"_input/dhs/"):
os.makedirs(datadir + dataset+"_input/dhs/")
#
joblib.dump(dhs, datadir + dataset+"_input/dhs/dhs.pkl", protocol=2)
for ibatch in range(nbatches):
dhs_=dhs[ibatch::nbatches,:,:]
joblib.dump(dhs_, datadir + dataset+"_input/dhs/dhs."+str(ibatch)+".pkl", protocol=2)
del dhs
del dhs_
# read DNase count data
# make a numpy 3D array with Nregions x Ncells x Nthreshold
import time
indir="/scratch/local2/huska/dhs-counts/tmp.counts/"
for dataset in ["train","test","ladder"]:
print(dataset + ":" + time.strftime("%c"))
ranges=metaData.loadValidRanges(dataset)
nranges=len(ranges)
dhs=np.empty([nranges,len(cell2i)],dtype="float32")
for kcell in cell2i:
print(" dhs count processing " + kcell + time.strftime("%c"))
df = pd.read_csv(indir+dataset+"/DNASE."+kcell+".fc.signal.tab",
sep='\t',
header=None,
usecols=[5])
dhs[:,cell2i[kcell]] = df[5].astype('float32')
#
if not os.path.exists(datadir + dataset+"_input/dhs-sum/"):
os.makedirs(datadir + dataset+"_input/dhs-sum/")
#
joblib.dump(dhs, datadir + dataset+"_input/dhs-sum/dhs.pkl", protocol=2)
for ibatch in range(nbatches):
dhs_=dhs[ibatch::nbatches,:]
joblib.dump(dhs_, datadir + dataset+"_input/dhs-sum/dhs."+str(ibatch)+".pkl", protocol=2)
del dhs
del dhs_
# read ChIP data
# make a numpy 4D array with Nregions x Ncells x Ntfs x Nthreshold
def func(val,kth):
if val=='B':
return 1
elif kth=="relaxed" and val=='A':
return 1
else:
return 0
vfunc=np.vectorize(func)
dataset="train"
ranges=metaData.loadValidRanges(dataset)
nranges=len(ranges)
chip=np.empty([nranges,len(cell2i),len(tf2i)],dtype="int8")
for kthreshold in threshold:
for ktf in tf2i:
tsv= open(datadir+"ChIPseq/labels/"+ktf+"."+"train.labels.tsv","r")
reader=csv.reader(tsv, delimiter="\t")
#use the header to look up cell types
header= reader.next()
header=header[3:]
content=np.empty((nranges,len(header)),dtype=object)
i=0
for line in reader:
content[i,:]=line[3:]
i=i+1
for kcell in cell2i:
if kcell in header:
# 1. if header contains desired cell type, use that data
print("chip processing " + kthreshold+":"+ktf+":"+kcell+" [data available] ")
x=header.index(kcell)
chip[:,cell2i[kcell],tf2i[ktf]]=vfunc(content[:,x],kthreshold)
else:
print("chip processing " + kthreshold+":"+ktf+":"+kcell+" [missing data]")
chip[:,cell2i[kcell],tf2i[ktf]]=-1
#
del content
#
if not os.path.exists( datadir + dataset+"_input/chip/"):
os.makedirs(datadir + dataset+"_input/chip/")
joblib.dump(chip, datadir + dataset+"_input/chip/chip."+kthreshold+".pkl", protocol=2)
for ibatch in range(nbatches):
chip_=chip[ibatch::nbatches,:,:]
joblib.dump(chip_, datadir + dataset+"_input/chip/chip."+kthreshold+"."+str(ibatch)+".pkl", protocol=2)
del chip
del chip_
#aggregated chip-seq peaks
def binarize(x):
if x>0:
return 1
else:
return 0
import dnn.loaddata as Loader
dataset="train"
from sklearn.preprocessing import binarize
chip=Loader.loadChip(None,"conservative")
chip_=chip.reshape((chip.shape[0],chip.shape[1]*chip.shape[2]))
chip_=binarize(chip_)
chip=chip_.reshape(chip.shape)
chip=np.sum(chip,axis=1)
chip=binarize(chip)
chip=chip.astype("int8")
joblib.dump(chip, datadir + dataset+"_input/chip/chip.aggregate.conservative.pkl", protocol=2)
for ibatch in range(nbatches):
chip=Loader.loadChip(ibatch,"conservative")
#chip=vbinarize(chip)
chip_=chip.reshape((chip.shape[0],chip.shape[1]*chip.shape[2]))
chip_=binarize(chip_)
chip=chip_.reshape(chip.shape)
chip=np.sum(chip,axis=1)
chip=binarize(chip)
chip1=chip.astype("int8")
joblib.dump(chip1, datadir + dataset+"_input/chip/chip.aggregate.conservative."+str(ibatch)+".pkl", protocol=2)
chip=Loader.loadChip(ibatch,"relaxed")
#chip=binarize(chip)
chip_=chip.reshape((chip.shape[0],chip.shape[1]*chip.shape[2]))
chip_=binarize(chip_)
chip=chip_.reshape(chip.shape)
chip=np.sum(chip,axis=1)
chip=vbinarize(chip)
chip2=chip.astype("int8")
joblib.dump(chip2, datadir + dataset+"_input/chip/chip.aggregate.relaxed."+str(ibatch)+".pkl", protocol=2)
chip=np.concatenate((chip1[:,:,np.newaxis],chip2[:,:,np.newaxis]),axis=2)
joblib.dump(chip, datadir + dataset+"_input/chip/chip.aggregate."+str(ibatch)+".pkl", protocol=2)
#load fasta file into one-hot representation and store it with joblib
L=dict({'A':0,'a':0,'C':1,'c':1,'G':2,'g':2,'T':3,'t':3})
for dataset in ["ladder","test","train"]:
#for dataset in ["ladder","test"]:
ranges=metaData.loadValidRanges(dataset)
nranges=len(ranges)
rawdna = []
for seq in sio.parse(open(datadir + "tmp/"+dataset+"_dna.fasta"), 'fasta', IUPAC.unambiguous_dna):
rawdna.append(seq.seq)
#
dna=np.zeros((nranges,1,4,len(rawdna[0])),dtype="int8")
k=0
for i in range(nranges):
k=k+1
if k>=10000:
print(i)
k=0
for j in range(len(rawdna[0])):
if rawdna[i][j]=="N" or rawdna[i][j]=="n":
continue
else:
dna[i,0,L[rawdna[i][j]],j]=1
#
if not os.path.exists( datadir + dataset+"_input/dna/"):
os.makedirs(datadir + dataset+"_input/dna/")
#
dna_=np.packbits(dna,axis=-1)
joblib.dump(dna_, datadir + dataset+"_input/dna/dna.pkl", protocol=2)
for ibatch in range(nbatches):
dna_=dna[ibatch::nbatches,:,:]
joblib.dump(dna_, datadir + dataset+"_input/dna/dna."+str(ibatch)+".pkl", protocol=2)
del dna
del dna_
dists=metaData.dists
# create locus to gene map using an array
for d in range(len(dists)):
for dataset in ["train","test","ladder"]:
ranges=metaData.loadValidRanges(dataset)
nranges=len(ranges)
tsv= open(datadir+"tmp/"+dataset+
"_geneneighbourhood."+str(dists[d])+".bed","r")
reader=csv.reader(tsv, delimiter="\t")
loc2gene=[ [] for _ in range(nranges)]
for line in reader:
loc2gene[int(line[0])].append(ensid2i[line[1]])
if not os.path.exists( datadir + dataset+"_input/loc2gene/"):
os.makedirs(datadir + dataset+"_input/loc2gene/")
joblib.dump(loc2gene, datadir + dataset+
"_input/loc2gene/loc2gene."+str(dists[d])+".pkl", protocol=2)
for ibatch in range(nbatches):
loc2gene_=loc2gene[ibatch::nbatches]
joblib.dump(loc2gene_, datadir + dataset+
"_input/loc2gene/loc2gene."+
str(dists[d])+"."+str(ibatch)+".pkl", protocol=2)
for d in range(len(dists)-1,0,-1):
for dataset in ["train","test","ladder"]:
loc2gene_prev=joblib.load(datadir + dataset+
"_input/loc2gene/loc2gene."+str(dists[d-1])+".pkl")
loc2gene=joblib.load(datadir + dataset+
"_input/loc2gene/loc2gene."+str(dists[d])+".pkl")
#
for i in range(len(loc2gene)):
loc2gene[i]=list(set(loc2gene[i])-set(loc2gene_prev[i]))
#
joblib.dump(loc2gene, datadir + dataset+
"_input/loc2gene/loc2gene."+str(dists[d])+".pkl", protocol=2)
for ibatch in range(nbatches):
loc2gene_=loc2gene[ibatch::nbatches]
joblib.dump(loc2gene_, datadir + dataset+
"_input/loc2gene/loc2gene."+
str(dists[d])+"."+str(ibatch)+".pkl", protocol=2)
totalrna_=np.mean(totalrna,axis=2)
#metaData.maxneighbours=26
for dataset in ["train","ladder","test"]:
# loc2gene=[]
#for dataset in ["test"]:
loc2gene=joblib.load(datadir + dataset+
"_input/loc2gene/loc2gene."+str(dists[d])+".pkl")
neighbourrna=np.zeros((len(loc2gene),len(dists),
len(cell2i),metaData.maxneighbours),dtype="float32")
for d in range(len(dists)):
loc2gene=joblib.load(datadir + dataset+
"_input/loc2gene/loc2gene."+str(dists[d])+".pkl")
#loc2gene=joblib.load(datadir + dataset+
# "_input/loc2gene/loc2gene."+str(dists[d])+".pkl")
for i in range(len(loc2gene)):
if len(loc2gene[i])>0:
neighbourrna[i,d,:,:min(len(loc2gene[i]),
metaData.maxneighbours)]=totalrna_[:min(len(loc2gene[i]),metaData.maxneighbours),:].T
#
if not os.path.exists( datadir + dataset+"_input/rna"):
os.makedirs(datadir + dataset+"_input/rna/",mode=0755)
joblib.dump(neighbourrna,datadir+dataset+
"_input/rna/neighbouring_rnaseq.pkl",protocol=2)
#
dists=metaData.dists
gene2chip=np.empty([ngenes,len(cell2i),len(tf2i),len(dists)],dtype="int8")
dataset="train"
if not os.path.exists( datadir + dataset+"_input/gene2chip"):
os.makedirs(datadir + dataset+"_input/gene2chip/",mode=0755)
def subt(x,y):
if x>0 and y>0:
return 0
else:
return x
vsubt=np.vectorize(subt,otypes=[np.dtype("int8")])
for kthres in threshold:
for d in range(len(dists)):
for ktf in tf2i:
for kcell in cell2i:
print("gene2chip processing " + kcell+":"+ktf+":"+kthres)
if not os.path.isfile( datadir+ "tmp/gene2chip/"+kcell+"."+ktf+"."+str(dists[d])+"."+kthres+".txt"):
gene2chip[:,cell2i[kcell],tf2i[ktf],d]=-1
else:
with open(datadir+"tmp/gene2chip/"+kcell+"."+ktf+"."+str(dists[d])+"."+kthres+".txt") as tsv:
i=0
for line in csv.reader(tsv, delimiter="\t"):
if line[0] in ['chr1','chr8','chr21']:
#held out chromosomes are devoid of chip-seq peaks
gene2chip[i,cell2i[kcell],tf2i[ktf],d]=-1
else:
val=int(line[1])
if val>0:
gene2chip[i,cell2i[kcell],tf2i[ktf],d]=1
else:
gene2chip[i,cell2i[kcell],tf2i[ktf],d]=0
i=i+1
#
for d in range(len(dists)-1,0,-1):
gene2chip[:,:,:,d]=vsubt(gene2chip[:,:,:,d],gene2chip[:,:,:,d-1])
#
joblib.dump(gene2chip, datadir + dataset+"_input/gene2chip/gene2chip."+kthres+".pkl", protocol=2)