Permalink
Cannot retrieve contributors at this time
Name already in use
A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
bsmap/dbseq.cpp
Go to fileThis commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
523 lines (478 sloc)
18.8 KB
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include<iostream> | |
#include<algorithm> | |
#include<string.h> | |
#include "dbseq.h" | |
using namespace std; | |
extern Param param; | |
extern bit8_t alphabet[]; | |
extern bit8_t rev_alphabet[]; | |
/************/ | |
RefSeq::RefSeq() { | |
total_kmers=0; | |
} | |
ref_loc_t RefSeq::LoadNextSeq(ifstream &fin) { | |
char ch[1000]; char c; string s; | |
if(param.gz_ref) return 0; | |
fin>>c; if(fin.eof()) return 0; | |
string::iterator z=_seq.begin(); | |
_length=0; | |
fin>>_name; fin.getline(ch, 1000); | |
while(!fin.eof()) { | |
fin>>c; if(fin.eof()) break; | |
fin.unget(); if(c=='>') break; | |
fin>>s; | |
if(_length+s.size()>=param.max_dbseq_size) { | |
param.max_dbseq_size+=param.append_dbseq_size; | |
_seq.resize(param.max_dbseq_size); | |
z=_seq.begin()+_length; | |
//cout<<"_seq size: "<<param.max_dbseq_size<<endl; | |
} | |
copy(s.begin(), s.end(), z); | |
z+=s.size(); _length+=s.size(); | |
} | |
return _length; | |
} | |
ref_loc_t RefSeq::LoadNextSeq(igzstream &fin) { | |
char ch[1000]; char c; string s; | |
if(!param.gz_ref) return 0; | |
fin>>c; if(fin.eof()) return 0; | |
string::iterator z=_seq.begin(); | |
_length=0; | |
fin>>_name; fin.getline(ch, 1000); | |
while(!fin.eof()) { | |
fin>>c; if(fin.eof()) break; | |
fin.unget(); if(c=='>') break; | |
fin>>s; | |
if(_length+s.size()>=param.max_dbseq_size) { | |
param.max_dbseq_size+=param.append_dbseq_size; | |
_seq.resize(param.max_dbseq_size); | |
z=_seq.begin()+_length; | |
//cout<<"_seq size: "<<param.max_dbseq_size<<endl; | |
} | |
copy(s.begin(), s.end(), z); | |
z+=s.size(); _length+=s.size(); | |
} | |
return _length; | |
} | |
void RefSeq::BinSeq(OneBfa &a) { | |
a.n=(_length+(SEGLEN-1))/SEGLEN+BINSEQPAD; //SEGLENbp, bit(SEGLEN*2) for each element. put 2 extra elements at the 3'end to invoid overflow | |
bit32_t t=a.n*SEGLEN-_length; | |
if(t) { | |
string ts(t, 'N'); | |
if(_seq.size()<_length+t) _seq.resize(_length+t); | |
copy(ts.begin(), ts.end(), _seq.begin()+_length); | |
} | |
a.s = new bit64_t[a.n]; | |
string::iterator p=_seq.begin(); | |
bit32_t i,j; | |
for(i=0; i<a.n; i++,p+=SEGLEN) { | |
a.s[i]=0; | |
for(j=0; j<SEGLEN; j++) { | |
a.s[i]<<=2; | |
a.s[i]|=alphabet[(unsigned char)*(p+j)]; | |
//cout << *(p+j); | |
} | |
} | |
//cout <<endl; | |
} | |
void RefSeq::cBinSeq(OneBfa &a) { | |
a.n=(_length+(SEGLEN-1))/SEGLEN+BINSEQPAD; //SEGLENbp, bit(SEGLEN*2) for each element. put 2 extra elements at the 3'end to invoid overflow | |
/* int t=a.n*SEGLEN-_length; | |
cout << _length<< " "<<a.n << " t "<<t<<endl; | |
if(t) { | |
string ts(t, 'N'); | |
if(_seq.size()<_length+t) _seq.resize(_length+t); | |
copy(ts.begin(), ts.end(), _seq.begin()+_length); | |
} | |
*/ | |
a.s = new bit64_t[a.n]; | |
string::iterator p=_seq.begin()+a.n*SEGLEN-1; | |
string::iterator tmp; | |
for(bit32_t i=0;i<a.n; i++,p-=SEGLEN) { | |
a.s[i]=0; | |
for(bit32_t j=0; j<SEGLEN; j++) { | |
a.s[i]<<=2; | |
a.s[i]|=rev_alphabet[(unsigned char)*(p-j)]; | |
//cout <<*(p-j); | |
} | |
//cout << endl; | |
//cout << StrSeed2((unsigned int)a.s[i].a, SEGLEN); | |
} | |
//cout <<endl; | |
} | |
void RefSeq::UnmaskRegion() { | |
Block b, cb; | |
b.id=_count; | |
cb.id=_count+1; | |
int total_len=((_length+(SEGLEN-1))/SEGLEN+BINSEQPAD)*SEGLEN; | |
b.begin=b.end=0; | |
// bit32_t total_size=0; | |
while(b.end<_length) { | |
b.begin=_seq.find_first_of(param.useful_nt, b.end); | |
if(b.begin > _length) break; | |
b.end=_seq.find_first_of(param.nx_nt, b.begin); | |
b.end = (b.end<=_length? b.end : _length); | |
if(b.end-b.begin <30) continue; | |
if((!_blocks.empty()) && (b.id==_blocks[_blocks.size()-1].id) | |
&& (b.begin - _blocks[_blocks.size()-1].end <5)) | |
_blocks[_blocks.size()-1].end=b.end; | |
else { | |
_blocks.push_back(b); | |
//added by yxi | |
cb.begin=total_len-b.end; | |
cb.end=total_len-b.begin; | |
_blocks.push_back(cb); | |
//cout <<"bid "<<b.id<<" "<<b.begin<<" "<<b.end<<endl; | |
//cout <<"cbid "<<cb.id<<" "<<cb.begin<<" "<<cb.end<<endl; | |
} | |
} | |
} | |
void RefSeq::find_CCGG() { | |
bit32_t tmp_offset,tmp_max, seedloc, right=0xffffffffu, seglen; int i,j; | |
vector<ref_loc_t> tmpset_index_BSW[50], tmpset_index_BSC[50]; | |
vector<pair<ref_loc_t, bit32_t> > tmp_CCGG_sites; | |
//vector<pair<ref_loc_t, bit32_t> >::iterator rit; | |
std::transform(_seq.begin(),_seq.end(),_seq.begin(),static_cast < int(*)(int) > (toupper)); | |
tmp_offset=title[_count-1].rc_offset-param.seed_size; | |
tmp_max=title[_count-1].size-param.seed_size; | |
for(j=0;j<param.digest_site.size();j++) { | |
bit32_t min_offset=min(param.digest_pos[j], (bit32_t) param.digest_site[j].size()-param.digest_pos[j]); | |
bit32_t rev_offset=param.digest_site[j].size()-2*min_offset; | |
right=_seq.find(param.digest_site[j],1); | |
while(right<_length) { | |
n_CCGG++; | |
tmp_CCGG_sites.push_back(make_pair(right+min_offset,rev_offset)); | |
right=_seq.find(param.digest_site[j],right+1); | |
} | |
} | |
sort(tmp_CCGG_sites.begin(), tmp_CCGG_sites.end()); | |
CCGG_sites.push_back(tmp_CCGG_sites); | |
for(i=0;i<param.max_seedseg_num;i++) { | |
tmpset_index_BSW[i].reserve(n_CCGG); | |
tmpset_index_BSC[i].reserve(n_CCGG); | |
} | |
for(j=0;j+1<tmp_CCGG_sites.size();++j) { | |
//for(rit=tmp_CCGG_sites.begin();rit!=tmp_CCGG_sites.end()-1&&tmp_CCGG_sites.size()>1;++rit) { | |
for(i=j+1,seglen=0;i<tmp_CCGG_sites.size();++i) | |
if((seglen=tmp_CCGG_sites[i].first+tmp_CCGG_sites[i].second-tmp_CCGG_sites[j].first)>=param.min_insert) break; | |
if(seglen>param.max_insert||seglen<param.min_insert) continue; | |
for(i=0,seedloc=tmp_CCGG_sites[j].first;i<param.max_seedseg_num&&seedloc<=tmp_max;i++,seedloc+=param.seed_size) | |
tmpset_index_BSW[i].push_back(seedloc); | |
} | |
for(j=1;j<tmp_CCGG_sites.size();++j) { | |
//for(rit=tmp_CCGG_sites.begin()+1;rit!=tmp_CCGG_sites.end()&&tmp_CCGG_sites.size()>1;++rit) { | |
for(i=j-1,seglen=0;i>=0;--i) | |
if((seglen=tmp_CCGG_sites[j].first+tmp_CCGG_sites[j].second-tmp_CCGG_sites[i].first)>=param.min_insert) break; | |
if(seglen>param.max_insert||seglen<param.min_insert) continue; | |
for(i=0,seedloc=tmp_CCGG_sites[j].first+tmp_CCGG_sites[j].second-param.seed_size;i<param.max_seedseg_num&&seedloc>=0;i++,seedloc-=param.seed_size) | |
tmpset_index_BSC[i].push_back(tmp_offset-seedloc); | |
} | |
for(i=0;i<param.max_seedseg_num;i++) { | |
CCGG_index[i].push_back(tmpset_index_BSW[i]); | |
CCGG_index[i].push_back(tmpset_index_BSC[i]); | |
} | |
} | |
bool BlockComp(Block a, Block b) {return (a.id<b.id)||((a.id==b.id)&&(a.begin<b.begin));} | |
void RefSeq::Run_ConvertBinseq(ifstream &fin, igzstream &gzfin) { | |
param.max_seedseg_num=(FIXELEMENT-1)*SEGLEN/param.seed_size; | |
_seq.resize(param.max_dbseq_size); | |
RefTitle r; | |
_count=0; | |
total_num=sum_length=0; | |
while(LoadNextSeq(fin)||LoadNextSeq(gzfin)) { | |
r.name=_name; | |
r.size=_length; | |
r.rc_offset=((_length+(SEGLEN-1))/SEGLEN+BINSEQPAD)*SEGLEN; | |
title.push_back(r); | |
OneBfa a; | |
BinSeq(a); | |
bfa.push_back(a); | |
UnmaskRegion(); | |
_count++; | |
total_num++; | |
sum_length+=_length; | |
// cout<<r.size<<endl; | |
//added by yxi, RC reference seq | |
title.push_back(r); | |
OneBfa ca; | |
cBinSeq(ca); | |
bfa.push_back(ca); | |
//UnmaskRegion(); | |
_count++; | |
//total_num++; | |
//sum_length+=_length; | |
//cout<<"ref:"<<r.name<<endl; | |
if(param.RRBS_flag) find_CCGG(); | |
} | |
sort(_blocks.begin(),_blocks.end(),BlockComp); | |
param.total_ref_seq=total_num; | |
bit32_t i, s=0; bit64_t *ptr, *cptr; | |
ref_anchor.push_back(REF_MARGIN*SEGLEN); | |
for(i=0;i<total_num;i++) { | |
s+=bfa[i*2].n; ref_anchor.push_back((s+REF_MARGIN)*SEGLEN); | |
} | |
//for(i=0;i<=total_num;i++) cout<<"i="<<i<<" "<<ref_anchor[i]<<" "<<endl; | |
refcat=new bit64_t[s+REF_MARGIN*2]; crefcat=new bit64_t[s+REF_MARGIN*2]; | |
xref[0]=refcat; xref[1]=crefcat; | |
ptr=refcat+REF_MARGIN; cptr=crefcat+REF_MARGIN; | |
for(i=0;i<_count;i++) { | |
if(i%2==0) { | |
ptr=copy(bfa[i].s, bfa[i].s+bfa[i].n, ptr); | |
delete [] bfa[i].s; | |
bfa[i].s=ptr-bfa[i].n; | |
} | |
else { | |
cptr=copy(bfa[i].s, bfa[i].s+bfa[i].n, cptr); | |
delete [] bfa[i].s; | |
bfa[i].s=cptr-bfa[i].n; | |
} | |
} | |
// cout<<"total seq length: "<<sum_length<<endl; | |
// cout<<"total "<<ccgg_seglen.size()<<" CCGG sites\n"; | |
// for(map<shortHit,bit32_t,shorthitcompclass>::iterator it=ccgg_seglen.begin();it!=ccgg_seglen.end();it++) | |
// cout<<"chr:"<<(int)(it->first).chr<<" pos:"<<(it->first).loc<<" => "<<(it->second)<<endl; | |
_seq.clear(); //free ram | |
} | |
bit32_t RefSeq::s_MakeSeed_1(bit64_t *_m, int _a) { | |
return param.XT(((_m[0]<<(_a*2))|((_m[1]>>1)>>(63-_a*2)))>>param.seed_bits_lz); | |
//if(_a>=0) return param.XT((*_m>>_a)¶m.seed_bits); | |
//else return param.XT(((*_m<<-_a)|(*(_m+1)>>(SEGLEN*2+_a)))¶m.seed_bits); | |
//return param.XT(((((bit64_t)*_m<<32)|*(_m+1))>>_a)¶m.seed_bits); | |
} | |
void RefSeq::InitialIndex() { | |
bit32_t i; total_kmers=1; | |
KmerLoc *p; KmerLoc2 *p2; | |
for(i=0;i<param.seed_size;i++) total_kmers*=3; | |
if(param.RRBS_flag) { | |
index= new KmerLoc[total_kmers]; | |
for(i=0,p=index; i<total_kmers; p++,i++) p->n1=0; | |
} | |
else { | |
index2=new KmerLoc2[total_kmers]; | |
for(i=0,p2=index2;i<total_kmers;i++,p2++) p2->n[0]=p2->n[1]=0; | |
} | |
} | |
void RefSeq::CalKmerFreq() { | |
bit32_t chr, chr1, i, j, tmp_offset; bit64_t *_m; | |
vector<ref_loc_t>::iterator it; | |
if(param.RRBS_flag){ | |
for(j=0;j<param.max_seedseg_num;j++){ | |
for(chr=0;chr<_count;chr++){ | |
//cout<<"chr:"<<chr<<" "<<title[chr].name<<" size:"<<title[chr].size<<" loc:"<<CCGG_index[j][chr].size()<<endl; | |
_m=bfa[chr].s; | |
for(it=CCGG_index[j][chr].begin(),i=0;it!=CCGG_index[j][chr].end();++it,++i){ | |
index[s_MakeSeed_1(_m+(*it)/SEGLEN,(*it)%SEGLEN)].n1++; | |
} | |
if(param.pairend||param.chains){ | |
chr1=chr^1; tmp_offset=title[chr].rc_offset-param.seed_size; | |
for(it=CCGG_index[j][chr1].begin();it!=CCGG_index[j][chr1].end();++it){ | |
if(tmp_offset<*it) continue; | |
index[s_MakeSeed_1(_m+(int)(tmp_offset-*it)/SEGLEN,(tmp_offset-*it)%SEGLEN)].n1++; | |
} | |
} | |
} | |
} | |
} | |
else{ | |
t_CalKmerFreq(0); | |
t_CalKmerFreq(1); | |
} | |
} | |
void RefSeq::t_CalKmerFreq(bit32_t ref_chain) { | |
bit64_t *_m; bit32_t i,j,i2, prefetch, ptr=0; | |
bit32_t dbs[PREFETCH_CAL_UNIT]; | |
prefetch=PREFETCH_CAL_UNIT*param.index_interval; | |
for(vector<Block>::iterator p=_blocks.begin(); p!=_blocks.end(); p++) { | |
if(p->id%2!=ref_chain) continue; | |
//cout<<"seg:"<<p->begin<<" "<<p->end<<endl; | |
_m=bfa[p->id].s; i2=((p->end-param.seed_size)/param.index_interval)*param.index_interval; | |
for(j=0,i=(p->begin/param.index_interval)*param.index_interval; j<PREFETCH_CAL_UNIT; i+=param.index_interval,++j,++ptr) { | |
//cout<<"ptr="<<ptr<<" ptr_mod="<<(ptr%PREFETCH_CAL_UNIT)<<" db:"<<s_MakeSeed_1(_m+i/SEGLEN,i%SEGLEN)<<endl; | |
dbs[ptr%PREFETCH_CAL_UNIT]=s_MakeSeed_1(_m+i/SEGLEN,i%SEGLEN); | |
} | |
//cout<<"chain="<<ref_chain; for(jj=0;jj<PREFETCH_CAL_UNIT;jj++) cout<<" db["<<jj<<"]="<<dbs[jj]; cout<<endl; | |
for(i=(p->begin/param.index_interval)*param.index_interval; i<=i2; i+=param.index_interval,++ptr) { | |
index2[dbs[ptr%PREFETCH_CAL_UNIT]].n[ref_chain]++; | |
j=s_MakeSeed_1(_m+(i+prefetch)/SEGLEN,(i+prefetch)%SEGLEN); | |
dbs[ptr%PREFETCH_CAL_UNIT]=j; | |
__builtin_prefetch(index2+j,1,0); | |
//cout<<"i="<<i<<" ptr="<<ptr<<" ptr_mod="<<(ptr%PREFETCH_CAL_UNIT)<<" current="<<dbs[ptr%PREFETCH_CAL_UNIT]<<" prefetch="<<j<<endl; | |
//for(jj=-PREFETCH_CAL_UNIT;jj<PREFETCH_CAL_UNIT*4;jj++) cout<<" "<<jj<<":"<<dbs[jj]; cout<<endl; | |
} | |
} | |
} | |
void RefSeq::AllocIndex() { | |
KmerLoc *v; KmerLoc2 *u; | |
bit32_t i, j, t, *ptr, block_size=1<<22, ptr_count=0; | |
if(param.RRBS_flag){ | |
for(v=index,i=0; i<total_kmers; v++,i++) { | |
if(v->n1>0) { | |
v->loc1= new Hit[v->n1]; | |
v->n1=0; | |
} | |
} | |
} | |
else { | |
bit32_t *kmer_count=new bit32_t[total_kmers]; | |
ptr = new bit32_t[block_size]; // 64MB | |
mem_pool.push_back(ptr); | |
for(i=0,u=index2; i<total_kmers; i++,u++) { | |
t=u->n[1]+u->n[0]; kmer_count[i]=t; | |
if(t==0) continue; | |
if(ptr_count+t>=block_size-PREFETCH_LOOP) { | |
if(t>=block_size-PREFETCH_LOOP) { | |
u->loc1= new bit32_t[t+PREFETCH_LOOP]; | |
mem_pool.push_back(u->loc1); | |
for(j=0;j<PREFETCH_LOOP;j++) (u->loc1)[t+j]=0; | |
u->n[1]=0; | |
continue; | |
} | |
for(j=0;j<PREFETCH_LOOP;j++) ptr[ptr_count+j]=0; | |
ptr = new bit32_t[block_size]; | |
mem_pool.push_back(ptr); | |
ptr_count=0; | |
} | |
u->loc1=ptr+ptr_count; ptr_count+=t; | |
u->n[1]=0; | |
} | |
for(j=0;j<PREFETCH_LOOP;j++) ptr[ptr_count+j]=0; | |
sort(kmer_count, kmer_count+total_kmers-1); | |
param.max_kmer_num=kmer_count[(bit32_t)(total_kmers*(1-param.max_kmer_ratio))-1]; | |
//cout<<"kmer_threshold:"<<param.max_kmer_num<<" kmer_ratio:"<<param.max_kmer_ratio<<endl; | |
delete [] kmer_count; | |
} | |
} | |
void RefSeq::ReleaseIndex() { | |
bit32_t i; | |
KmerLoc *v; KmerLoc2 *u; | |
if(param.RRBS_flag){ | |
for(i=0,v=index; i<total_kmers; i++,v++) { | |
if(v->n1) delete [] v->loc1; | |
} | |
delete [] index; | |
} | |
else { | |
while(!mem_pool.empty()) { | |
delete [] mem_pool.back(); | |
mem_pool.pop_back(); | |
} | |
delete [] index2; | |
} | |
} | |
void RefSeq::FillIndex() { | |
bit64_t *_m; bit32_t chr, chr1, j,tmp_offset; | |
KmerLoc *z; Hit tmphit; | |
vector<ref_loc_t>::iterator it; | |
if(param.RRBS_flag) { | |
for(j=0;j<param.max_seedseg_num;j++) { | |
for(chr=0;chr<_count;chr++) { | |
tmphit.chr=chr|(j<<16); _m=bfa[chr].s; | |
for(it=CCGG_index[j][chr].begin();it!=CCGG_index[j][chr].end();++it) { | |
z=index+s_MakeSeed_1(_m+(*it)/SEGLEN,(*it)%SEGLEN); | |
tmphit.loc=(*it); | |
z->loc1[z->n1++]=tmphit; | |
} | |
if(param.pairend||param.chains) { | |
chr1=chr^1; tmp_offset=title[chr].rc_offset-param.seed_size; | |
tmphit.chr=chr|(j<<16)|0x1000000; | |
for(it=CCGG_index[j][chr1].begin();it!=CCGG_index[j][chr1].end();++it) { | |
if(tmp_offset<*it) continue; | |
z=index+s_MakeSeed_1(_m+(int)(tmp_offset-*it)/SEGLEN,(tmp_offset-*it)%SEGLEN); | |
tmphit.loc=(tmp_offset-*it); | |
z->loc1[z->n1++]=tmphit; | |
} | |
} | |
} | |
} | |
} | |
else { | |
t_FillIndex(0); | |
t_FillIndex(1); | |
} | |
//_a=0; _e=0; for(z=index,i=0; i<total_kmers; z++,i++) if(z->n1) {_a+=z->n1; _e++;} cout<<"index total:"<<_a<<" keys:"<<_e<<endl; | |
//_a=0; _e=0; for(z=cindex, i=0; i<total_kmers; z++,i++) if(z->n1) {_a+=z->n1; _e++;} cout<<"cindex total:"<<_a<<" keys:"<<_e<<endl; | |
/* | |
for(ref_id_t chr=0;chr<_count;chr++){ | |
cout<<"chr"<<(int)chr<<endl; | |
for(set<ref_loc_t>::iterator it=CCGG_index[chr].begin();it!=CCGG_index[chr].end();it++) cout<<" "<<*it; cout<<endl; | |
for(set<ref_loc_t>::iterator it=CCGG_cindex[chr].begin();it!=CCGG_cindex[chr].end();it++) cout<<" "<<*it; cout<<endl; | |
} | |
*/ | |
} | |
void RefSeq::t_FillIndex(bit32_t ref_chain) { | |
KmerLoc2 *z2; Hit tmphit; | |
bit64_t *_m; bit32_t i2,j,i,prefetch, tmp=1-ref_chain, ptr=0; | |
bit32_t dbs[PREFETCH_CRT_UNIT]; | |
prefetch=PREFETCH_CRT_UNIT*param.index_interval; | |
for(vector<Block>::iterator p=_blocks.begin(); p!=_blocks.end(); p++) { | |
if(p->id%2!=ref_chain) continue; | |
tmphit.chr=p->id; _m=bfa[tmphit.chr].s; | |
i2=((p->end-param.seed_size)/param.index_interval)*param.index_interval; | |
for(j=0,i=(p->begin/param.index_interval)*param.index_interval; j<PREFETCH_CRT_UNIT; i+=param.index_interval,++j,++ptr) { | |
dbs[ptr%PREFETCH_CRT_UNIT]=s_MakeSeed_1(_m+i/SEGLEN,i%SEGLEN); | |
} | |
for(tmphit.loc=(p->begin/param.index_interval)*param.index_interval; tmphit.loc<=i2; tmphit.loc+=param.index_interval,++ptr) { | |
z2=index2+dbs[ptr%PREFETCH_CRT_UNIT]; | |
z2->loc1[z2->n[tmp]++]=hit2int(tmphit); | |
j=s_MakeSeed_1(_m+(tmphit.loc+prefetch)/SEGLEN,(tmphit.loc+prefetch)%SEGLEN); | |
dbs[ptr%PREFETCH_CRT_UNIT]=j; | |
__builtin_prefetch(index2+j,1,0); | |
} | |
} | |
} | |
void RefSeq::CreateIndex() { | |
bit32_t i; | |
InitialIndex(); | |
CalKmerFreq(); | |
AllocIndex(); | |
FillIndex(); | |
FinishIndex(); | |
} | |
void RefSeq::FinishIndex() { | |
bit32_t i; | |
if(param.RRBS_flag) | |
for(i=0;i<param.max_seedseg_num;i++) CCGG_index[i].clear(); | |
/* | |
for(i=0;i<total_kmers;i++) { | |
cout<<"seed:"<<i<<" total:"<<index2[i][0]<<endl; | |
for(j=1; j<=index2[i][0]; j++) cout<<index2[i][j]<<" "; cout<<endl; | |
//for(k=0;k<4;k++) { | |
while(true){ | |
cout<<"c"<<(index2[i][j]>>24)<<":"<<(index2[i][j]&0xffffff)<<" "; | |
if((index2[i][j]&0xffffff)==index2[i][0]) break; | |
j++; | |
} | |
cout<<endl; | |
} | |
*/ | |
} | |
pair<ref_loc_t,bit32_t> RefSeq::CCGG_seglen(ref_id_t chr, ref_loc_t pos, int readlen) { | |
ref_id_t chr2=chr/2; | |
bit32_t left, right, mid; | |
ref_loc_t midval,seg_start, seg_end; | |
CCGG_sites_chr=&CCGG_sites[chr2]; | |
left=0; //right=CCGG_sites[chr2].size()-1; | |
right=CCGG_sites_chr->size()-1; | |
while(left<right-1) { | |
mid=(left+right)/2; | |
//cout<<"pos:"<<pos<<"\tleft:"<<left<<" "<<CCGG_sites[chr2][left]<<"\tright:"<<right<<" "<<CCGG_sites[chr2][right]<<"\tmid:"<<mid<<" "<<CCGG_sites[chr2][mid]<<endl; | |
if((midval=(*CCGG_sites_chr)[mid].first)==pos) {//return pair<ref_loc_t,int>(pos+1,CCGG_sites[chr2][mid+1]-midval+2); | |
left=mid;right=mid+1;break; | |
} | |
else if(midval<pos) left=mid; | |
//if(CCGG_sites[chr2][mid]<=pos) left=mid; | |
else right=mid; | |
} | |
//cout<<"endloop\n"; | |
//cout<<"pos:"<<pos<<"\tleft:"<<left<<" "<<CCGG_sites[chr2][left]<<"\tright:"<<right<<" "<<CCGG_sites[chr2][right]<<"\tmid:"<<mid<<" "<<CCGG_sites[chr2][mid]<<endl; | |
//for (mid=left-2;mid<left+3;mid++) cout<<"\t"<<CCGG_sites[chr2][mid]; cout<<endl; | |
seg_start=(*CCGG_sites_chr)[left].first; | |
while(((seg_end=(*CCGG_sites_chr)[right].first+(*CCGG_sites_chr)[right].second)<pos+readlen)&&(right<CCGG_sites_chr->size())) right++; | |
//cout<<"pos:"<<pos<<"\tleft:"<<left<<" "<<CCGG_sites[chr2][left]<<"\tright:"<<right<<" "<<CCGG_sites[chr2][right]<<"\tmid:"<<mid<<" "<<CCGG_sites[chr2][mid]<<endl;} | |
//cout<<"ZP:"<<(seg_start+1)<<" ZL:"<<(seg_end-seg_start)<<endl; | |
//return pair<ref_loc_t,int>(CCGG_sites[chr2][left]+1, CCGG_sites[chr2][right]-CCGG_sites[chr2][left]+2); | |
return pair<ref_loc_t,bit32_t>(seg_start+1, seg_end-seg_start); | |
} | |
ref_loc_t RefSeq::hit2int(Hit h) { | |
return ref_anchor[h.chr/2]+h.loc; | |
} | |