Permalink
Cannot retrieve contributors at this time
Name already in use
A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
BlueWhale/preprocessing.py
Go to fileThis commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
356 lines (317 sloc)
13 KB
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import csv | |
import numpy as np | |
import joblib | |
import os | |
import pandas as pd | |
import dnn.metaData as metaData | |
import Bio.SeqIO as sio | |
from Bio.Alphabet import IUPAC | |
from Bio.Seq import Seq | |
# get dream data directory | |
datadir=metaData.datadir | |
# number of training batches | |
nbatches=metaData.nbatches | |
# load tfnames, celltypes, thresholds, replicates | |
tf2i=metaData.loadMetaDataMap(datadir + "annotations/tf_names.txt") | |
cell2i=metaData.loadMetaDataMap(datadir + "annotations/cells.txt") | |
ensid2i=metaData.loadMetaDataMap(datadir + "annotations/genelist.txt") | |
threshold=metaData.threshold | |
replicates=metaData.replicates | |
tfid, tfid2name =metaData.loadTfIds() | |
ngenes=len(ensid2i) | |
#read RNA data | |
# make a numpy 3D array with Ngenes x Ncells x Nrep | |
totalrna=np.empty([ngenes,len(cell2i),len(replicates)],dtype="float32") | |
tfrna=np.empty([len(tf2i),len(cell2i),len(replicates)],dtype="float32") | |
for kreplicate in replicates: | |
for kcell in cell2i: | |
print("rna processing " + kcell+":"+kreplicate) | |
i=0 | |
tsv= open(datadir+"RNAseq/gene_expression."+kcell+"."+kreplicate+".tsv","r") | |
reader=csv.reader(tsv, delimiter="\t") | |
header= reader.next() | |
for line in reader: | |
val=float(line[5]) | |
totalrna[i,cell2i[kcell],replicates[kreplicate]]=val | |
i=i+1 | |
if line[0] in tfid: | |
tfrna[tf2i[tfid2name[line[0]]],cell2i[kcell],replicates[kreplicate]]=val | |
#store rna-seqt | |
if not os.path.exists( datadir + "train_input/rna/"): | |
os.makedirs(datadir + "train_input/rna/") | |
joblib.dump(totalrna, datadir + "train_input/rna/rna.pkl",protocol=2) | |
joblib.dump(tfrna, datadir + "train_input/rna/tfrna.pkl",protocol=2) | |
del totalrna | |
del tfrna | |
# count number of valid ranges | |
#del ranges | |
# read DNase data | |
# make a numpy 3D array with Nregions x Ncells x Nthreshold | |
for dataset in ["train","test","ladder"]: | |
ranges=metaData.loadValidRanges(dataset) | |
nranges=len(ranges) | |
dhs=np.empty([nranges,len(cell2i),len(threshold)],dtype="int8") | |
for kthreshold in threshold: | |
for kcell in cell2i: | |
print("dhs processing " + kcell+":"+kthreshold) | |
i=0 | |
with open(datadir+"tmp/"+dataset+"/dhs."+kcell+"."+kthreshold+".tsv") as tsv: | |
for line in csv.reader(tsv, delimiter="\t"): | |
val=int(line[3]) | |
if val>0: | |
dhs[i,cell2i[kcell],threshold[kthreshold]]=1 | |
else: | |
dhs[i,cell2i[kcell],threshold[kthreshold]]=0 | |
i=i+1 | |
# | |
if not os.path.exists( datadir + dataset+"_input/dhs/"): | |
os.makedirs(datadir + dataset+"_input/dhs/") | |
# | |
joblib.dump(dhs, datadir + dataset+"_input/dhs/dhs.pkl", protocol=2) | |
for ibatch in range(nbatches): | |
dhs_=dhs[ibatch::nbatches,:,:] | |
joblib.dump(dhs_, datadir + dataset+"_input/dhs/dhs."+str(ibatch)+".pkl", protocol=2) | |
del dhs | |
del dhs_ | |
# read DNase count data | |
# make a numpy 3D array with Nregions x Ncells x Nthreshold | |
import time | |
indir="/scratch/local2/huska/dhs-counts/tmp.counts/" | |
for dataset in ["train","test","ladder"]: | |
print(dataset + ":" + time.strftime("%c")) | |
ranges=metaData.loadValidRanges(dataset) | |
nranges=len(ranges) | |
dhs=np.empty([nranges,len(cell2i)],dtype="float32") | |
for kcell in cell2i: | |
print(" dhs count processing " + kcell + time.strftime("%c")) | |
df = pd.read_csv(indir+dataset+"/DNASE."+kcell+".fc.signal.tab", | |
sep='\t', | |
header=None, | |
usecols=[5]) | |
dhs[:,cell2i[kcell]] = df[5].astype('float32') | |
# | |
if not os.path.exists(datadir + dataset+"_input/dhs-sum/"): | |
os.makedirs(datadir + dataset+"_input/dhs-sum/") | |
# | |
joblib.dump(dhs, datadir + dataset+"_input/dhs-sum/dhs.pkl", protocol=2) | |
for ibatch in range(nbatches): | |
dhs_=dhs[ibatch::nbatches,:] | |
joblib.dump(dhs_, datadir + dataset+"_input/dhs-sum/dhs."+str(ibatch)+".pkl", protocol=2) | |
del dhs | |
del dhs_ | |
# read ChIP data | |
# make a numpy 4D array with Nregions x Ncells x Ntfs x Nthreshold | |
def func(val,kth): | |
if val=='B': | |
return 1 | |
elif kth=="relaxed" and val=='A': | |
return 1 | |
else: | |
return 0 | |
vfunc=np.vectorize(func) | |
dataset="train" | |
ranges=metaData.loadValidRanges(dataset) | |
nranges=len(ranges) | |
chip=np.empty([nranges,len(cell2i),len(tf2i)],dtype="int8") | |
for kthreshold in threshold: | |
for ktf in tf2i: | |
tsv= open(datadir+"ChIPseq/labels/"+ktf+"."+"train.labels.tsv","r") | |
reader=csv.reader(tsv, delimiter="\t") | |
#use the header to look up cell types | |
header= reader.next() | |
header=header[3:] | |
content=np.empty((nranges,len(header)),dtype=object) | |
i=0 | |
for line in reader: | |
content[i,:]=line[3:] | |
i=i+1 | |
for kcell in cell2i: | |
if kcell in header: | |
# 1. if header contains desired cell type, use that data | |
print("chip processing " + kthreshold+":"+ktf+":"+kcell+" [data available] ") | |
x=header.index(kcell) | |
chip[:,cell2i[kcell],tf2i[ktf]]=vfunc(content[:,x],kthreshold) | |
else: | |
print("chip processing " + kthreshold+":"+ktf+":"+kcell+" [missing data]") | |
chip[:,cell2i[kcell],tf2i[ktf]]=-1 | |
# | |
del content | |
# | |
if not os.path.exists( datadir + dataset+"_input/chip/"): | |
os.makedirs(datadir + dataset+"_input/chip/") | |
joblib.dump(chip, datadir + dataset+"_input/chip/chip."+kthreshold+".pkl", protocol=2) | |
for ibatch in range(nbatches): | |
chip_=chip[ibatch::nbatches,:,:] | |
joblib.dump(chip_, datadir + dataset+"_input/chip/chip."+kthreshold+"."+str(ibatch)+".pkl", protocol=2) | |
del chip | |
del chip_ | |
#aggregated chip-seq peaks | |
def binarize(x): | |
if x>0: | |
return 1 | |
else: | |
return 0 | |
import dnn.loaddata as Loader | |
dataset="train" | |
from sklearn.preprocessing import binarize | |
chip=Loader.loadChip(None,"conservative") | |
chip_=chip.reshape((chip.shape[0],chip.shape[1]*chip.shape[2])) | |
chip_=binarize(chip_) | |
chip=chip_.reshape(chip.shape) | |
chip=np.sum(chip,axis=1) | |
chip=binarize(chip) | |
chip=chip.astype("int8") | |
joblib.dump(chip, datadir + dataset+"_input/chip/chip.aggregate.conservative.pkl", protocol=2) | |
for ibatch in range(nbatches): | |
chip=Loader.loadChip(ibatch,"conservative") | |
#chip=vbinarize(chip) | |
chip_=chip.reshape((chip.shape[0],chip.shape[1]*chip.shape[2])) | |
chip_=binarize(chip_) | |
chip=chip_.reshape(chip.shape) | |
chip=np.sum(chip,axis=1) | |
chip=binarize(chip) | |
chip1=chip.astype("int8") | |
joblib.dump(chip1, datadir + dataset+"_input/chip/chip.aggregate.conservative."+str(ibatch)+".pkl", protocol=2) | |
chip=Loader.loadChip(ibatch,"relaxed") | |
#chip=binarize(chip) | |
chip_=chip.reshape((chip.shape[0],chip.shape[1]*chip.shape[2])) | |
chip_=binarize(chip_) | |
chip=chip_.reshape(chip.shape) | |
chip=np.sum(chip,axis=1) | |
chip=vbinarize(chip) | |
chip2=chip.astype("int8") | |
joblib.dump(chip2, datadir + dataset+"_input/chip/chip.aggregate.relaxed."+str(ibatch)+".pkl", protocol=2) | |
chip=np.concatenate((chip1[:,:,np.newaxis],chip2[:,:,np.newaxis]),axis=2) | |
joblib.dump(chip, datadir + dataset+"_input/chip/chip.aggregate."+str(ibatch)+".pkl", protocol=2) | |
#load fasta file into one-hot representation and store it with joblib | |
L=dict({'A':0,'a':0,'C':1,'c':1,'G':2,'g':2,'T':3,'t':3}) | |
for dataset in ["ladder","test","train"]: | |
#for dataset in ["ladder","test"]: | |
ranges=metaData.loadValidRanges(dataset) | |
nranges=len(ranges) | |
rawdna = [] | |
for seq in sio.parse(open(datadir + "tmp/"+dataset+"_dna.fasta"), 'fasta', IUPAC.unambiguous_dna): | |
rawdna.append(seq.seq) | |
# | |
dna=np.zeros((nranges,1,4,len(rawdna[0])),dtype="int8") | |
k=0 | |
for i in range(nranges): | |
k=k+1 | |
if k>=10000: | |
print(i) | |
k=0 | |
for j in range(len(rawdna[0])): | |
if rawdna[i][j]=="N" or rawdna[i][j]=="n": | |
continue | |
else: | |
dna[i,0,L[rawdna[i][j]],j]=1 | |
# | |
if not os.path.exists( datadir + dataset+"_input/dna/"): | |
os.makedirs(datadir + dataset+"_input/dna/") | |
# | |
dna_=np.packbits(dna,axis=-1) | |
joblib.dump(dna_, datadir + dataset+"_input/dna/dna.pkl", protocol=2) | |
for ibatch in range(nbatches): | |
dna_=dna[ibatch::nbatches,:,:] | |
joblib.dump(dna_, datadir + dataset+"_input/dna/dna."+str(ibatch)+".pkl", protocol=2) | |
del dna | |
del dna_ | |
dists=metaData.dists | |
# create locus to gene map using an array | |
for d in range(len(dists)): | |
for dataset in ["train","test","ladder"]: | |
ranges=metaData.loadValidRanges(dataset) | |
nranges=len(ranges) | |
tsv= open(datadir+"tmp/"+dataset+ | |
"_geneneighbourhood."+str(dists[d])+".bed","r") | |
reader=csv.reader(tsv, delimiter="\t") | |
loc2gene=[ [] for _ in range(nranges)] | |
for line in reader: | |
loc2gene[int(line[0])].append(ensid2i[line[1]]) | |
if not os.path.exists( datadir + dataset+"_input/loc2gene/"): | |
os.makedirs(datadir + dataset+"_input/loc2gene/") | |
joblib.dump(loc2gene, datadir + dataset+ | |
"_input/loc2gene/loc2gene."+str(dists[d])+".pkl", protocol=2) | |
for ibatch in range(nbatches): | |
loc2gene_=loc2gene[ibatch::nbatches] | |
joblib.dump(loc2gene_, datadir + dataset+ | |
"_input/loc2gene/loc2gene."+ | |
str(dists[d])+"."+str(ibatch)+".pkl", protocol=2) | |
for d in range(len(dists)-1,0,-1): | |
for dataset in ["train","test","ladder"]: | |
loc2gene_prev=joblib.load(datadir + dataset+ | |
"_input/loc2gene/loc2gene."+str(dists[d-1])+".pkl") | |
loc2gene=joblib.load(datadir + dataset+ | |
"_input/loc2gene/loc2gene."+str(dists[d])+".pkl") | |
# | |
for i in range(len(loc2gene)): | |
loc2gene[i]=list(set(loc2gene[i])-set(loc2gene_prev[i])) | |
# | |
joblib.dump(loc2gene, datadir + dataset+ | |
"_input/loc2gene/loc2gene."+str(dists[d])+".pkl", protocol=2) | |
for ibatch in range(nbatches): | |
loc2gene_=loc2gene[ibatch::nbatches] | |
joblib.dump(loc2gene_, datadir + dataset+ | |
"_input/loc2gene/loc2gene."+ | |
str(dists[d])+"."+str(ibatch)+".pkl", protocol=2) | |
totalrna_=np.mean(totalrna,axis=2) | |
#metaData.maxneighbours=26 | |
for dataset in ["train","ladder","test"]: | |
# loc2gene=[] | |
#for dataset in ["test"]: | |
loc2gene=joblib.load(datadir + dataset+ | |
"_input/loc2gene/loc2gene."+str(dists[d])+".pkl") | |
neighbourrna=np.zeros((len(loc2gene),len(dists), | |
len(cell2i),metaData.maxneighbours),dtype="float32") | |
for d in range(len(dists)): | |
loc2gene=joblib.load(datadir + dataset+ | |
"_input/loc2gene/loc2gene."+str(dists[d])+".pkl") | |
#loc2gene=joblib.load(datadir + dataset+ | |
# "_input/loc2gene/loc2gene."+str(dists[d])+".pkl") | |
for i in range(len(loc2gene)): | |
if len(loc2gene[i])>0: | |
neighbourrna[i,d,:,:min(len(loc2gene[i]), | |
metaData.maxneighbours)]=totalrna_[:min(len(loc2gene[i]),metaData.maxneighbours),:].T | |
# | |
if not os.path.exists( datadir + dataset+"_input/rna"): | |
os.makedirs(datadir + dataset+"_input/rna/",mode=0755) | |
joblib.dump(neighbourrna,datadir+dataset+ | |
"_input/rna/neighbouring_rnaseq.pkl",protocol=2) | |
# | |
dists=metaData.dists | |
gene2chip=np.empty([ngenes,len(cell2i),len(tf2i),len(dists)],dtype="int8") | |
dataset="train" | |
if not os.path.exists( datadir + dataset+"_input/gene2chip"): | |
os.makedirs(datadir + dataset+"_input/gene2chip/",mode=0755) | |
def subt(x,y): | |
if x>0 and y>0: | |
return 0 | |
else: | |
return x | |
vsubt=np.vectorize(subt,otypes=[np.dtype("int8")]) | |
for kthres in threshold: | |
for d in range(len(dists)): | |
for ktf in tf2i: | |
for kcell in cell2i: | |
print("gene2chip processing " + kcell+":"+ktf+":"+kthres) | |
if not os.path.isfile( datadir+ "tmp/gene2chip/"+kcell+"."+ktf+"."+str(dists[d])+"."+kthres+".txt"): | |
gene2chip[:,cell2i[kcell],tf2i[ktf],d]=-1 | |
else: | |
with open(datadir+"tmp/gene2chip/"+kcell+"."+ktf+"."+str(dists[d])+"."+kthres+".txt") as tsv: | |
i=0 | |
for line in csv.reader(tsv, delimiter="\t"): | |
if line[0] in ['chr1','chr8','chr21']: | |
#held out chromosomes are devoid of chip-seq peaks | |
gene2chip[i,cell2i[kcell],tf2i[ktf],d]=-1 | |
else: | |
val=int(line[1]) | |
if val>0: | |
gene2chip[i,cell2i[kcell],tf2i[ktf],d]=1 | |
else: | |
gene2chip[i,cell2i[kcell],tf2i[ktf],d]=0 | |
i=i+1 | |
# | |
for d in range(len(dists)-1,0,-1): | |
gene2chip[:,:,:,d]=vsubt(gene2chip[:,:,:,d],gene2chip[:,:,:,d-1]) | |
# | |
joblib.dump(gene2chip, datadir + dataset+"_input/gene2chip/gene2chip."+kthres+".pkl", protocol=2) |