#include<iostream> #include<algorithm> #include<string.h> #include "dbseq.h" using namespace std; extern Param param; extern bit8_t alphabet[]; extern bit8_t rev_alphabet[]; /************/ RefSeq::RefSeq() { total_kmers=0; } ref_loc_t RefSeq::LoadNextSeq(ifstream &fin) { char ch[1000]; char c; string s; if(param.gz_ref) return 0; fin>>c; if(fin.eof()) return 0; string::iterator z=_seq.begin(); _length=0; fin>>_name; fin.getline(ch, 1000); while(!fin.eof()) { fin>>c; if(fin.eof()) break; fin.unget(); if(c=='>') break; fin>>s; if(_length+s.size()>=param.max_dbseq_size) { param.max_dbseq_size+=param.append_dbseq_size; _seq.resize(param.max_dbseq_size); z=_seq.begin()+_length; //cout<<"_seq size: "<<param.max_dbseq_size<<endl; } copy(s.begin(), s.end(), z); z+=s.size(); _length+=s.size(); } return _length; } ref_loc_t RefSeq::LoadNextSeq(igzstream &fin) { char ch[1000]; char c; string s; if(!param.gz_ref) return 0; fin>>c; if(fin.eof()) return 0; string::iterator z=_seq.begin(); _length=0; fin>>_name; fin.getline(ch, 1000); while(!fin.eof()) { fin>>c; if(fin.eof()) break; fin.unget(); if(c=='>') break; fin>>s; if(_length+s.size()>=param.max_dbseq_size) { param.max_dbseq_size+=param.append_dbseq_size; _seq.resize(param.max_dbseq_size); z=_seq.begin()+_length; //cout<<"_seq size: "<<param.max_dbseq_size<<endl; } copy(s.begin(), s.end(), z); z+=s.size(); _length+=s.size(); } return _length; } void RefSeq::BinSeq(OneBfa &a) { a.n=(_length+(SEGLEN-1))/SEGLEN+BINSEQPAD; //SEGLENbp, bit(SEGLEN*2) for each element. put 2 extra elements at the 3'end to invoid overflow bit32_t t=a.n*SEGLEN-_length; if(t) { string ts(t, 'N'); if(_seq.size()<_length+t) _seq.resize(_length+t); copy(ts.begin(), ts.end(), _seq.begin()+_length); } a.s = new bit64_t[a.n]; string::iterator p=_seq.begin(); bit32_t i,j; for(i=0; i<a.n; i++,p+=SEGLEN) { a.s[i]=0; for(j=0; j<SEGLEN; j++) { a.s[i]<<=2; a.s[i]|=alphabet[(unsigned char)*(p+j)]; //cout << *(p+j); } } //cout <<endl; } void RefSeq::cBinSeq(OneBfa &a) { a.n=(_length+(SEGLEN-1))/SEGLEN+BINSEQPAD; //SEGLENbp, bit(SEGLEN*2) for each element. put 2 extra elements at the 3'end to invoid overflow /* int t=a.n*SEGLEN-_length; cout << _length<< " "<<a.n << " t "<<t<<endl; if(t) { string ts(t, 'N'); if(_seq.size()<_length+t) _seq.resize(_length+t); copy(ts.begin(), ts.end(), _seq.begin()+_length); } */ a.s = new bit64_t[a.n]; string::iterator p=_seq.begin()+a.n*SEGLEN-1; string::iterator tmp; for(bit32_t i=0;i<a.n; i++,p-=SEGLEN) { a.s[i]=0; for(bit32_t j=0; j<SEGLEN; j++) { a.s[i]<<=2; a.s[i]|=rev_alphabet[(unsigned char)*(p-j)]; //cout <<*(p-j); } //cout << endl; //cout << StrSeed2((unsigned int)a.s[i].a, SEGLEN); } //cout <<endl; } void RefSeq::UnmaskRegion() { Block b, cb; b.id=_count; cb.id=_count+1; int total_len=((_length+(SEGLEN-1))/SEGLEN+BINSEQPAD)*SEGLEN; b.begin=b.end=0; // bit32_t total_size=0; while(b.end<_length) { b.begin=_seq.find_first_of(param.useful_nt, b.end); if(b.begin > _length) break; b.end=_seq.find_first_of(param.nx_nt, b.begin); b.end = (b.end<=_length? b.end : _length); if(b.end-b.begin <30) continue; if((!_blocks.empty()) && (b.id==_blocks[_blocks.size()-1].id) && (b.begin - _blocks[_blocks.size()-1].end <5)) _blocks[_blocks.size()-1].end=b.end; else { _blocks.push_back(b); //added by yxi cb.begin=total_len-b.end; cb.end=total_len-b.begin; _blocks.push_back(cb); //cout <<"bid "<<b.id<<" "<<b.begin<<" "<<b.end<<endl; //cout <<"cbid "<<cb.id<<" "<<cb.begin<<" "<<cb.end<<endl; } } } void RefSeq::find_CCGG() { bit32_t tmp_offset,tmp_max, seedloc, right=0xffffffffu, seglen; int i,j; vector<ref_loc_t> tmpset_index_BSW[50], tmpset_index_BSC[50]; vector<pair<ref_loc_t, bit32_t> > tmp_CCGG_sites; //vector<pair<ref_loc_t, bit32_t> >::iterator rit; std::transform(_seq.begin(),_seq.end(),_seq.begin(),static_cast < int(*)(int) > (toupper)); tmp_offset=title[_count-1].rc_offset-param.seed_size; tmp_max=title[_count-1].size-param.seed_size; for(j=0;j<param.digest_site.size();j++) { bit32_t min_offset=min(param.digest_pos[j], (bit32_t) param.digest_site[j].size()-param.digest_pos[j]); bit32_t rev_offset=param.digest_site[j].size()-2*min_offset; right=_seq.find(param.digest_site[j],1); while(right<_length) { n_CCGG++; tmp_CCGG_sites.push_back(make_pair(right+min_offset,rev_offset)); right=_seq.find(param.digest_site[j],right+1); } } sort(tmp_CCGG_sites.begin(), tmp_CCGG_sites.end()); CCGG_sites.push_back(tmp_CCGG_sites); for(i=0;i<param.max_seedseg_num;i++) { tmpset_index_BSW[i].reserve(n_CCGG); tmpset_index_BSC[i].reserve(n_CCGG); } for(j=0;j+1<tmp_CCGG_sites.size();++j) { //for(rit=tmp_CCGG_sites.begin();rit!=tmp_CCGG_sites.end()-1&&tmp_CCGG_sites.size()>1;++rit) { for(i=j+1,seglen=0;i<tmp_CCGG_sites.size();++i) if((seglen=tmp_CCGG_sites[i].first+tmp_CCGG_sites[i].second-tmp_CCGG_sites[j].first)>=param.min_insert) break; if(seglen>param.max_insert||seglen<param.min_insert) continue; for(i=0,seedloc=tmp_CCGG_sites[j].first;i<param.max_seedseg_num&&seedloc<=tmp_max;i++,seedloc+=param.seed_size) tmpset_index_BSW[i].push_back(seedloc); } for(j=1;j<tmp_CCGG_sites.size();++j) { //for(rit=tmp_CCGG_sites.begin()+1;rit!=tmp_CCGG_sites.end()&&tmp_CCGG_sites.size()>1;++rit) { for(i=j-1,seglen=0;i>=0;--i) if((seglen=tmp_CCGG_sites[j].first+tmp_CCGG_sites[j].second-tmp_CCGG_sites[i].first)>=param.min_insert) break; if(seglen>param.max_insert||seglen<param.min_insert) continue; for(i=0,seedloc=tmp_CCGG_sites[j].first+tmp_CCGG_sites[j].second-param.seed_size;i<param.max_seedseg_num&&seedloc>=0;i++,seedloc-=param.seed_size) tmpset_index_BSC[i].push_back(tmp_offset-seedloc); } for(i=0;i<param.max_seedseg_num;i++) { CCGG_index[i].push_back(tmpset_index_BSW[i]); CCGG_index[i].push_back(tmpset_index_BSC[i]); } } bool BlockComp(Block a, Block b) {return (a.id<b.id)||((a.id==b.id)&&(a.begin<b.begin));} void RefSeq::Run_ConvertBinseq(ifstream &fin, igzstream &gzfin) { param.max_seedseg_num=(FIXELEMENT-1)*SEGLEN/param.seed_size; _seq.resize(param.max_dbseq_size); RefTitle r; _count=0; total_num=sum_length=0; while(LoadNextSeq(fin)||LoadNextSeq(gzfin)) { r.name=_name; r.size=_length; r.rc_offset=((_length+(SEGLEN-1))/SEGLEN+BINSEQPAD)*SEGLEN; title.push_back(r); OneBfa a; BinSeq(a); bfa.push_back(a); UnmaskRegion(); _count++; total_num++; sum_length+=_length; // cout<<r.size<<endl; //added by yxi, RC reference seq title.push_back(r); OneBfa ca; cBinSeq(ca); bfa.push_back(ca); //UnmaskRegion(); _count++; //total_num++; //sum_length+=_length; //cout<<"ref:"<<r.name<<endl; if(param.RRBS_flag) find_CCGG(); } sort(_blocks.begin(),_blocks.end(),BlockComp); param.total_ref_seq=total_num; bit32_t i, s=0; bit64_t *ptr, *cptr; ref_anchor.push_back(REF_MARGIN*SEGLEN); for(i=0;i<total_num;i++) { s+=bfa[i*2].n; ref_anchor.push_back((s+REF_MARGIN)*SEGLEN); } //for(i=0;i<=total_num;i++) cout<<"i="<<i<<" "<<ref_anchor[i]<<" "<<endl; refcat=new bit64_t[s+REF_MARGIN*2]; crefcat=new bit64_t[s+REF_MARGIN*2]; xref[0]=refcat; xref[1]=crefcat; ptr=refcat+REF_MARGIN; cptr=crefcat+REF_MARGIN; for(i=0;i<_count;i++) { if(i%2==0) { ptr=copy(bfa[i].s, bfa[i].s+bfa[i].n, ptr); delete [] bfa[i].s; bfa[i].s=ptr-bfa[i].n; } else { cptr=copy(bfa[i].s, bfa[i].s+bfa[i].n, cptr); delete [] bfa[i].s; bfa[i].s=cptr-bfa[i].n; } } // cout<<"total seq length: "<<sum_length<<endl; // cout<<"total "<<ccgg_seglen.size()<<" CCGG sites\n"; // for(map<shortHit,bit32_t,shorthitcompclass>::iterator it=ccgg_seglen.begin();it!=ccgg_seglen.end();it++) // cout<<"chr:"<<(int)(it->first).chr<<" pos:"<<(it->first).loc<<" => "<<(it->second)<<endl; _seq.clear(); //free ram } bit32_t RefSeq::s_MakeSeed_1(bit64_t *_m, int _a) { return param.XT(((_m[0]<<(_a*2))|((_m[1]>>1)>>(63-_a*2)))>>param.seed_bits_lz); //if(_a>=0) return param.XT((*_m>>_a)¶m.seed_bits); //else return param.XT(((*_m<<-_a)|(*(_m+1)>>(SEGLEN*2+_a)))¶m.seed_bits); //return param.XT(((((bit64_t)*_m<<32)|*(_m+1))>>_a)¶m.seed_bits); } void RefSeq::InitialIndex() { bit32_t i; total_kmers=1; KmerLoc *p; KmerLoc2 *p2; for(i=0;i<param.seed_size;i++) total_kmers*=3; if(param.RRBS_flag) { index= new KmerLoc[total_kmers]; for(i=0,p=index; i<total_kmers; p++,i++) p->n1=0; } else { index2=new KmerLoc2[total_kmers]; for(i=0,p2=index2;i<total_kmers;i++,p2++) p2->n[0]=p2->n[1]=0; } } void RefSeq::CalKmerFreq() { bit32_t chr, chr1, i, j, tmp_offset; bit64_t *_m; vector<ref_loc_t>::iterator it; if(param.RRBS_flag){ for(j=0;j<param.max_seedseg_num;j++){ for(chr=0;chr<_count;chr++){ //cout<<"chr:"<<chr<<" "<<title[chr].name<<" size:"<<title[chr].size<<" loc:"<<CCGG_index[j][chr].size()<<endl; _m=bfa[chr].s; for(it=CCGG_index[j][chr].begin(),i=0;it!=CCGG_index[j][chr].end();++it,++i){ index[s_MakeSeed_1(_m+(*it)/SEGLEN,(*it)%SEGLEN)].n1++; } if(param.pairend||param.chains){ chr1=chr^1; tmp_offset=title[chr].rc_offset-param.seed_size; for(it=CCGG_index[j][chr1].begin();it!=CCGG_index[j][chr1].end();++it){ if(tmp_offset<*it) continue; index[s_MakeSeed_1(_m+(int)(tmp_offset-*it)/SEGLEN,(tmp_offset-*it)%SEGLEN)].n1++; } } } } } else{ t_CalKmerFreq(0); t_CalKmerFreq(1); } } void RefSeq::t_CalKmerFreq(bit32_t ref_chain) { bit64_t *_m; bit32_t i,j,i2, prefetch, ptr=0; bit32_t dbs[PREFETCH_CAL_UNIT]; prefetch=PREFETCH_CAL_UNIT*param.index_interval; for(vector<Block>::iterator p=_blocks.begin(); p!=_blocks.end(); p++) { if(p->id%2!=ref_chain) continue; //cout<<"seg:"<<p->begin<<" "<<p->end<<endl; _m=bfa[p->id].s; i2=((p->end-param.seed_size)/param.index_interval)*param.index_interval; for(j=0,i=(p->begin/param.index_interval)*param.index_interval; j<PREFETCH_CAL_UNIT; i+=param.index_interval,++j,++ptr) { //cout<<"ptr="<<ptr<<" ptr_mod="<<(ptr%PREFETCH_CAL_UNIT)<<" db:"<<s_MakeSeed_1(_m+i/SEGLEN,i%SEGLEN)<<endl; dbs[ptr%PREFETCH_CAL_UNIT]=s_MakeSeed_1(_m+i/SEGLEN,i%SEGLEN); } //cout<<"chain="<<ref_chain; for(jj=0;jj<PREFETCH_CAL_UNIT;jj++) cout<<" db["<<jj<<"]="<<dbs[jj]; cout<<endl; for(i=(p->begin/param.index_interval)*param.index_interval; i<=i2; i+=param.index_interval,++ptr) { index2[dbs[ptr%PREFETCH_CAL_UNIT]].n[ref_chain]++; j=s_MakeSeed_1(_m+(i+prefetch)/SEGLEN,(i+prefetch)%SEGLEN); dbs[ptr%PREFETCH_CAL_UNIT]=j; __builtin_prefetch(index2+j,1,0); //cout<<"i="<<i<<" ptr="<<ptr<<" ptr_mod="<<(ptr%PREFETCH_CAL_UNIT)<<" current="<<dbs[ptr%PREFETCH_CAL_UNIT]<<" prefetch="<<j<<endl; //for(jj=-PREFETCH_CAL_UNIT;jj<PREFETCH_CAL_UNIT*4;jj++) cout<<" "<<jj<<":"<<dbs[jj]; cout<<endl; } } } void RefSeq::AllocIndex() { KmerLoc *v; KmerLoc2 *u; bit32_t i, j, t, *ptr, block_size=1<<22, ptr_count=0; if(param.RRBS_flag){ for(v=index,i=0; i<total_kmers; v++,i++) { if(v->n1>0) { v->loc1= new Hit[v->n1]; v->n1=0; } } } else { bit32_t *kmer_count=new bit32_t[total_kmers]; ptr = new bit32_t[block_size]; // 64MB mem_pool.push_back(ptr); for(i=0,u=index2; i<total_kmers; i++,u++) { t=u->n[1]+u->n[0]; kmer_count[i]=t; if(t==0) continue; if(ptr_count+t>=block_size-PREFETCH_LOOP) { if(t>=block_size-PREFETCH_LOOP) { u->loc1= new bit32_t[t+PREFETCH_LOOP]; mem_pool.push_back(u->loc1); for(j=0;j<PREFETCH_LOOP;j++) (u->loc1)[t+j]=0; u->n[1]=0; continue; } for(j=0;j<PREFETCH_LOOP;j++) ptr[ptr_count+j]=0; ptr = new bit32_t[block_size]; mem_pool.push_back(ptr); ptr_count=0; } u->loc1=ptr+ptr_count; ptr_count+=t; u->n[1]=0; } for(j=0;j<PREFETCH_LOOP;j++) ptr[ptr_count+j]=0; sort(kmer_count, kmer_count+total_kmers-1); param.max_kmer_num=kmer_count[(bit32_t)(total_kmers*(1-param.max_kmer_ratio))-1]; //cout<<"kmer_threshold:"<<param.max_kmer_num<<" kmer_ratio:"<<param.max_kmer_ratio<<endl; delete [] kmer_count; } } void RefSeq::ReleaseIndex() { bit32_t i; KmerLoc *v; KmerLoc2 *u; if(param.RRBS_flag){ for(i=0,v=index; i<total_kmers; i++,v++) { if(v->n1) delete [] v->loc1; } delete [] index; } else { while(!mem_pool.empty()) { delete [] mem_pool.back(); mem_pool.pop_back(); } delete [] index2; } } void RefSeq::FillIndex() { bit64_t *_m; bit32_t chr, chr1, j,tmp_offset; KmerLoc *z; Hit tmphit; vector<ref_loc_t>::iterator it; if(param.RRBS_flag) { for(j=0;j<param.max_seedseg_num;j++) { for(chr=0;chr<_count;chr++) { tmphit.chr=chr|(j<<16); _m=bfa[chr].s; for(it=CCGG_index[j][chr].begin();it!=CCGG_index[j][chr].end();++it) { z=index+s_MakeSeed_1(_m+(*it)/SEGLEN,(*it)%SEGLEN); tmphit.loc=(*it); z->loc1[z->n1++]=tmphit; } if(param.pairend||param.chains) { chr1=chr^1; tmp_offset=title[chr].rc_offset-param.seed_size; tmphit.chr=chr|(j<<16)|0x1000000; for(it=CCGG_index[j][chr1].begin();it!=CCGG_index[j][chr1].end();++it) { if(tmp_offset<*it) continue; z=index+s_MakeSeed_1(_m+(int)(tmp_offset-*it)/SEGLEN,(tmp_offset-*it)%SEGLEN); tmphit.loc=(tmp_offset-*it); z->loc1[z->n1++]=tmphit; } } } } } else { t_FillIndex(0); t_FillIndex(1); } //_a=0; _e=0; for(z=index,i=0; i<total_kmers; z++,i++) if(z->n1) {_a+=z->n1; _e++;} cout<<"index total:"<<_a<<" keys:"<<_e<<endl; //_a=0; _e=0; for(z=cindex, i=0; i<total_kmers; z++,i++) if(z->n1) {_a+=z->n1; _e++;} cout<<"cindex total:"<<_a<<" keys:"<<_e<<endl; /* for(ref_id_t chr=0;chr<_count;chr++){ cout<<"chr"<<(int)chr<<endl; for(set<ref_loc_t>::iterator it=CCGG_index[chr].begin();it!=CCGG_index[chr].end();it++) cout<<" "<<*it; cout<<endl; for(set<ref_loc_t>::iterator it=CCGG_cindex[chr].begin();it!=CCGG_cindex[chr].end();it++) cout<<" "<<*it; cout<<endl; } */ } void RefSeq::t_FillIndex(bit32_t ref_chain) { KmerLoc2 *z2; Hit tmphit; bit64_t *_m; bit32_t i2,j,i,prefetch, tmp=1-ref_chain, ptr=0; bit32_t dbs[PREFETCH_CRT_UNIT]; prefetch=PREFETCH_CRT_UNIT*param.index_interval; for(vector<Block>::iterator p=_blocks.begin(); p!=_blocks.end(); p++) { if(p->id%2!=ref_chain) continue; tmphit.chr=p->id; _m=bfa[tmphit.chr].s; i2=((p->end-param.seed_size)/param.index_interval)*param.index_interval; for(j=0,i=(p->begin/param.index_interval)*param.index_interval; j<PREFETCH_CRT_UNIT; i+=param.index_interval,++j,++ptr) { dbs[ptr%PREFETCH_CRT_UNIT]=s_MakeSeed_1(_m+i/SEGLEN,i%SEGLEN); } for(tmphit.loc=(p->begin/param.index_interval)*param.index_interval; tmphit.loc<=i2; tmphit.loc+=param.index_interval,++ptr) { z2=index2+dbs[ptr%PREFETCH_CRT_UNIT]; z2->loc1[z2->n[tmp]++]=hit2int(tmphit); j=s_MakeSeed_1(_m+(tmphit.loc+prefetch)/SEGLEN,(tmphit.loc+prefetch)%SEGLEN); dbs[ptr%PREFETCH_CRT_UNIT]=j; __builtin_prefetch(index2+j,1,0); } } } void RefSeq::CreateIndex() { bit32_t i; InitialIndex(); CalKmerFreq(); AllocIndex(); FillIndex(); FinishIndex(); } void RefSeq::FinishIndex() { bit32_t i; if(param.RRBS_flag) for(i=0;i<param.max_seedseg_num;i++) CCGG_index[i].clear(); /* for(i=0;i<total_kmers;i++) { cout<<"seed:"<<i<<" total:"<<index2[i][0]<<endl; for(j=1; j<=index2[i][0]; j++) cout<<index2[i][j]<<" "; cout<<endl; //for(k=0;k<4;k++) { while(true){ cout<<"c"<<(index2[i][j]>>24)<<":"<<(index2[i][j]&0xffffff)<<" "; if((index2[i][j]&0xffffff)==index2[i][0]) break; j++; } cout<<endl; } */ } pair<ref_loc_t,bit32_t> RefSeq::CCGG_seglen(ref_id_t chr, ref_loc_t pos, int readlen) { ref_id_t chr2=chr/2; bit32_t left, right, mid; ref_loc_t midval,seg_start, seg_end; CCGG_sites_chr=&CCGG_sites[chr2]; left=0; //right=CCGG_sites[chr2].size()-1; right=CCGG_sites_chr->size()-1; while(left<right-1) { mid=(left+right)/2; //cout<<"pos:"<<pos<<"\tleft:"<<left<<" "<<CCGG_sites[chr2][left]<<"\tright:"<<right<<" "<<CCGG_sites[chr2][right]<<"\tmid:"<<mid<<" "<<CCGG_sites[chr2][mid]<<endl; if((midval=(*CCGG_sites_chr)[mid].first)==pos) {//return pair<ref_loc_t,int>(pos+1,CCGG_sites[chr2][mid+1]-midval+2); left=mid;right=mid+1;break; } else if(midval<pos) left=mid; //if(CCGG_sites[chr2][mid]<=pos) left=mid; else right=mid; } //cout<<"endloop\n"; //cout<<"pos:"<<pos<<"\tleft:"<<left<<" "<<CCGG_sites[chr2][left]<<"\tright:"<<right<<" "<<CCGG_sites[chr2][right]<<"\tmid:"<<mid<<" "<<CCGG_sites[chr2][mid]<<endl; //for (mid=left-2;mid<left+3;mid++) cout<<"\t"<<CCGG_sites[chr2][mid]; cout<<endl; seg_start=(*CCGG_sites_chr)[left].first; while(((seg_end=(*CCGG_sites_chr)[right].first+(*CCGG_sites_chr)[right].second)<pos+readlen)&&(right<CCGG_sites_chr->size())) right++; //cout<<"pos:"<<pos<<"\tleft:"<<left<<" "<<CCGG_sites[chr2][left]<<"\tright:"<<right<<" "<<CCGG_sites[chr2][right]<<"\tmid:"<<mid<<" "<<CCGG_sites[chr2][mid]<<endl;} //cout<<"ZP:"<<(seg_start+1)<<" ZL:"<<(seg_end-seg_start)<<endl; //return pair<ref_loc_t,int>(CCGG_sites[chr2][left]+1, CCGG_sites[chr2][right]-CCGG_sites[chr2][left]+2); return pair<ref_loc_t,bit32_t>(seg_start+1, seg_end-seg_start); } ref_loc_t RefSeq::hit2int(Hit h) { return ref_anchor[h.chr/2]+h.loc; }