#include <iomanip> #include <limits.h> #include <unistd.h> #include <cstdio> #include <iostream> #include <iomanip> #include <ostream> #include <fstream> #include <string> #include <sstream> #include <vector> #include "reads.h" #include "dbseq.h" #include "align.h" #include "param.h" #include "pairs.h" #include "utilities.h" #ifdef THREAD #include <pthread.h> #endif using namespace std; //global variables Param param; string query_a_file; string query_b_file; string ref_file; string out_align_file; string command_line; ifstream fin_db; igzstream gzfin_db; ifstream fin_a; igzstream gzfin_a; ifstream fin_b; igzstream gzfin_b; ofstream fout; FILE *pout; ReadClass read_a, read_b; RefSeq ref; bit32_t n_aligned=0, n_unique=0, n_multiple=0; //number of reads aligned bit32_t n_aligned_pairs=0, n_unique_pairs=0, n_multiple_pairs=0; //number of pairs aligned bit32_t n_aligned_a=0, n_unique_a=0, n_multiple_a=0; //number of a reads aligned bit32_t n_aligned_b=0, n_unique_b=0, n_multiple_b=0; //number of b reads aligned bit32_t ref_time, read_time; bit16_t tid[64]; char version[] = "2.90"; ostringstream message; void info(int level) { if(level<=param.verbose_level) cerr<<message.str(); message.str(""); } #ifdef THREAD pthread_mutex_t mutex_fin=PTHREAD_MUTEX_INITIALIZER; pthread_mutex_t mutex_fout=PTHREAD_MUTEX_INITIALIZER; void *t_SingleAlign(void *tid) { SingleAlign a; int n; bit32_t cur_at; while(1) { pthread_mutex_lock(&mutex_fin); n=read_a.LoadBatchReads(fin_a,gzfin_a,0); cur_at=read_a._index; a.ImportBatchReads(read_a.num, read_a.mreads); pthread_mutex_unlock(&mutex_fin); if(!n) break; a.Do_Batch(ref); pthread_mutex_lock(&mutex_fout); if(param.stdout) cout<<a._str_align; else if(param.pipe_out) {fwrite(a._str_align.c_str(),1,a._str_align.size(),pout); fflush(pout);} else fout<<a._str_align; message<<"[bsmap] @"<<Curr_Time()<<" \t"<<cur_at-param.read_start+1<<" reads finished. "<<Cal_AllTime()<<" secs passed"<<endl; info(2); pthread_mutex_unlock(&mutex_fout); } pthread_mutex_lock(&mutex_fout); n_aligned+=a.n_aligned; n_unique+=a.n_unique; n_multiple+=a.n_multiple; pthread_mutex_unlock(&mutex_fout); read_time+=Cal_AllTime()-ref_time; return NULL; }; void Do_SingleAlign() { vector<pthread_t> pthread_ids(param.num_procs); for(int i=0; i<param.num_procs; i++) { tid[i]=i; pthread_create(&pthread_ids[i], NULL, t_SingleAlign, (void*)&tid[i]); } for(int i=0; i<param.num_procs; i++) pthread_join(pthread_ids[i], NULL); }; void *t_PairAlign(void *tid) { PairAlign a; int n1, n2; bit32_t cur_at; while(1) { pthread_mutex_lock(&mutex_fin); n1=read_a.LoadBatchReads(fin_a,gzfin_a,1); n2=read_b.LoadBatchReads(fin_b,gzfin_b,2); cur_at=read_a._index; a.ImportBatchReads(n1, read_a.mreads, read_b.mreads); pthread_mutex_unlock(&mutex_fin); if(!n1||(n1!=n2)) break; a.Do_Batch(ref); pthread_mutex_lock(&mutex_fout); if(param.stdout) cout<<a._str_align; else if(param.pipe_out) {fwrite(a._str_align.c_str(),1,a._str_align.size(),pout); fflush(pout);} else fout<<a._str_align; message<<"[bsmap] @"<<Curr_Time()<<" \t"<<cur_at-param.read_start+1<<" read pairs finished. "<<Cal_AllTime()<<" secs passed"<<endl;info(2); pthread_mutex_unlock(&mutex_fout); } pthread_mutex_lock(&mutex_fout); n_aligned_pairs+=a.n_aligned_pairs; n_unique_pairs+=a.n_unique_pairs; n_multiple_pairs+=a.n_multiple_pairs; n_aligned_a+=a.n_aligned_a; n_unique_a+=a.n_unique_a; n_multiple_a+=a.n_multiple_a; n_aligned_b+=a.n_aligned_b; n_unique_b+=a.n_unique_b; n_multiple_b+=a.n_multiple_b; pthread_mutex_unlock(&mutex_fout); read_time+=Cal_AllTime()-ref_time; return NULL; }; void Do_PairAlign() { //if(param.max_snp_num>0) param.max_snp_num=2; vector<pthread_t> pthread_ids(param.num_procs); //create //cout <<param.num_procs<<"num_procs\n"; for(int i=0; i<param.num_procs; i++) { tid[i]=i; pthread_create(&pthread_ids[i], NULL, t_PairAlign, (void*)&tid[i]); } for(int i=0; i<param.num_procs; i++) pthread_join(pthread_ids[i], NULL); }; void* wrapper_CalKmerFreq0(void*) {ref.t_CalKmerFreq(0); return NULL;} void* wrapper_CalKmerFreq1(void*) {ref.t_CalKmerFreq(1); return NULL;} void* wrapper_FillIndex0(void*) {ref.t_FillIndex(0); return NULL;} void* wrapper_FillIndex1(void*) {ref.t_FillIndex(1); return NULL;} void Do_Formatdb() { if(param.RRBS_flag) ref.CreateIndex(); else { pthread_t t0, t1; ref.InitialIndex(); pthread_create(&t0, NULL, wrapper_CalKmerFreq0, NULL); pthread_create(&t1, NULL, wrapper_CalKmerFreq1, NULL); pthread_join(t0, NULL); pthread_join(t1, NULL); ref.AllocIndex(); pthread_create(&t0, NULL, wrapper_FillIndex0, NULL); pthread_create(&t1, NULL, wrapper_FillIndex1, NULL); pthread_join(t0, NULL); pthread_join(t1, NULL); ref.FinishIndex(); } message<<"[bsmap] @"<<Curr_Time()<<" \tcreate seed table. "<<Cal_AllTime()<<" secs passed\n"; info(1); }; #else void Do_SingleAlign() { SingleAlign a; while(read_a.LoadBatchReads(fin_a,gzfin_a,0)) { a.ImportBatchReads(read_a.num, read_a.mreads); a.Do_Batch(ref); if(param.stdout) cout<<a._str_align; else if(param.pipe_out) {fwrite(a._str_align.c_str(),1,a._str_align.size(),pout); fflush(pout);} else fout<<a._str_align; message<<"[bsmap] @"<<Curr_Time()<<" \t"<<read_a._index-param.read_start+1<<" reads finished. "<<Cal_AllTime()<<" secs passed"<<endl; info(2); } n_aligned=a.n_aligned; n_unique=a.n_unique; n_multiple=a.n_multiple; read_time+=Cal_AllTime()-ref_time; }; void Do_PairAlign() { PairAlign a; int n1, n2; while(1) { n1=read_a.LoadBatchReads(fin_a,gzfin_a,1); n2=read_b.LoadBatchReads(fin_b,gzfin_b,2); if(!n1||(n1!=n2)) break; a.ImportBatchReads(n1, read_a.mreads, read_b.mreads); a.Do_Batch(ref); if(param.stdout) cout<<a._str_align; else if(param.pipe_out) {fwrite(a._str_align.c_str(),1,a._str_align.size(),pout); fflush(pout);} else fout<<a._str_align; message<<"[bsmap] @"<<Curr_Time()<<" \t"<<read_a._index-param.read_start+1<<" read pairs finished. "<<Cal_AllTime()<<" secs passed"<<endl; info(2); } n_aligned_pairs+=a.n_aligned_pairs; n_unique_pairs+=a.n_unique_pairs; n_multiple_pairs+=a.n_multiple_pairs; n_aligned_a+=a.n_aligned_a; n_unique_a+=a.n_unique_a; n_multiple_a+=a.n_multiple_a; n_aligned_b+=a.n_aligned_b; n_unique_b+=a.n_unique_b; n_multiple_b+=a.n_multiple_b; read_time+=Cal_AllTime()-ref_time; }; void Do_Formatdb() { ref.CreateIndex(); message<<"[bsmap] @"<<Curr_Time()<<" \tcreate seed table. "<<Cal_AllTime()<<" secs passed\n"; info(1); }; #endif //usage void usage(void) { cerr<<"Usage: bsmap [options]\n" <<" -a <str> query a file, FASTA/FASTQ/BAM format\n" <<" -d <str> reference sequences file, FASTA format\n" <<" -o <str> output alignment file, BSP/SAM/BAM format, if omitted, the output will be written to STDOUT in SAM format.\n" <<"\n Options for alignment:\n" <<" -s <int> seed size, default=16(WGBS mode), 12(RRBS mode). min=8, max=16.\n" <<" -v <float> if this value is between 0 and 1, it's interpreted as the mismatch rate w.r.t to the read length.\n" <<" otherwise it's interpreted as the maximum number of mismatches allowed on a read, <="<<MAXSNPS<<".\n" <<" example: -v 5 (max #mismatches = 5), -v 0.1 (max #mismatches = read_length * 10%)\n" <<" default="<<(param.max_snp_num-100)/100.0<<".\n" <<" -g <int> gap size, BSMAP only allow 1 continuous gap (insert or deletion) with up to "<<MAXGAPS<<" nucleotides\n" <<" default="<<param.gap<<"\n" <<" -w <int> maximum number of equal best hits to count, <="<<MAXHITS<<"\n" <<" -3 using 3-nucleotide mapping approach\n" <<" -B <int> start from the Nth read or read pair, default: 1\n" <<" -E <int> end at the Nth read or read pair, default: 4,294,967,295\n" <<" -I <int> index interval, default="<<param.index_interval<<"\n" <<" -k <float> set the cut-off ratio for over-represented kmers, default="<<param.max_kmer_ratio<<"\n" <<" example: -k 1e-6 means the top 0.0001\% over-represented kmer will be skipped in alignment\n" #ifdef THREAD <<" -p <int> number of processors to use, default="<<param.num_procs<<"\n" #endif <<" -D <str> activating RRBS mapping mode and set restriction enzyme digestion sites. \n" <<" digestion position marked by \'-\', example: -D C-CGG for MspI digestion.\n" <<" default: none (whole genome shotgun bisulfite mapping mode)\n" <<" -S <int> seed for random number generation used in selecting multiple hits\n" <<" other seed values generate pseudo random number based on read index number, to allow reproducible mapping results. \n" <<" default="<<param.randseed<<". (get seed from system clock, mapping results not resproducible.)\n" <<" -n [0,1] set mapping strand information. default: "<<param.chains<<"\n" <<" -n 0: only map to 2 forward strands, i.e. BSW(++) and BSC(-+), \n" <<" for PE sequencing, map read#1 to ++ and -+, read#2 to +- and --.\n" <<" -n 1: map SE or PE reads to all 4 strands, i.e. ++, +-, -+, -- \n" <<" -M <str> set alignment information for the additional nucleotide transition. \n" <<" <str> is in the form of two different nucleotides N1N2, \n" <<" indicating N1 in the reads could be mapped to N2 in the reference sequences.\n" <<" default: -M TC, corresponds to C=>U(T) transition in bisulfite conversion. \n" <<" example: -M GA could be used to detect A=>I(G) transition in RNA editing. \n" <<"\n Options for trimming:\n" <<" -q <int> quality threshold in trimming, 0-40, default=0 (no trim)\n" <<" -z <int> base quality, default="<<(int) param.zero_qual<<" [Illumina is using 64, Sanger Institute is using 33]\n" <<" -f <int> filter low-quality reads containing >n Ns, default="<<param.max_ns<<"\n" <<" -A <str> 3-end adapter sequence, default: none (no trim)\n" <<" -L <int> map the first N nucleotides of the read, default:"<<param.max_readlen<<" (map the whole read).\n" <<"\n Options for reporting:\n" <<" -r [0,1,2] how to report repeat hits, 0=none(unique hit/pair); 1=random one; 2=all(slow), default:"<<param.report_repeat_hits<<".\n" <<" -R print corresponding reference sequences in SAM output, default=off\n" <<" -u report unmapped reads, default=off\n" <<" -H do not print header information in SAM format output\n" <<" -V [0,1,2] verbose level: 0=no message displayed (quiet mode); \n" <<" 1=major message (default); 2=detailed message.\n" <<"\n Options for pair-end alignment:\n" <<" -b <str> query b file\n" <<" -m <int> minimal insert size allowed, default="<<param.min_insert<<"\n" <<" -x <int> maximal insert size allowed, default="<<param.max_insert<<"\n\n" <<" -h help\n\n"; exit(1); }; int mGetOptions(int rgc, char *rgv[]) { //[options] int i; for(i=1,command_line=rgv[0];i<rgc;i++) command_line=command_line+" "+rgv[i]; for(i=1; i<rgc; i++) { if(rgv[i][0]!='-') return i; switch(rgv[i][1]) { case 'a': if(rgv[i][2]==0) query_a_file = rgv[++i]; else if(rgv[i][2]=='=') query_a_file=rgv[i]+3; else return i; break; case 'b': if(rgv[i][2]==0) query_b_file = rgv[++i]; else if(rgv[i][2]=='=') query_b_file=rgv[i]+3; else return i; param.pairend=1; break; case 'd': if(rgv[i][2]==0) ref_file = rgv[++i]; else if(rgv[i][2]=='=') ref_file=rgv[i]+3; else return i; break; case 's': if(rgv[i][2]==0) param.SetSeedSize(atoi(rgv[++i])); else if(rgv[i][2]=='=') param.SetSeedSize(atoi(rgv[i]+3)); else return i; //if(param.RRBS_flag) param.SetSeedSize(12); break; case 'o': if(rgv[i][2]==0) out_align_file = rgv[++i]; else if(rgv[i][2]=='=') out_align_file=rgv[i]+3; else return i; param.stdout=0; break; case 'm': if(rgv[i][2]==0) param.min_insert = atoi(rgv[++i]); else if(rgv[i][2]=='=') param.min_insert=atoi(rgv[i]+3); else return i; break; //case 'n': if(rgv[i][2]==0) param.chains = atoi(rgv[++i])%4; else if(rgv[i][2]=='=') param.chains=atoi(rgv[i]+3)%4; else return i; break; case 'n': if(rgv[i][2]==0) param.chains=(atoi(rgv[++i])!=0); else if(rgv[i][2]=='=') param.chains=(atoi(rgv[i]+3)!=0); else return i; break; case 'g': if(rgv[i][2]==0) param.gap=atoi(rgv[++i]); else if(rgv[i][2]=='=') param.gap=atoi(rgv[i]+3); else return i; if(param.gap>MAXGAPS) { cerr<<"warning: gap length exceeds max value:"<<MAXGAPS<<endl; param.gap=MAXGAPS; } break; case 'x': if(rgv[i][2]==0) param.max_insert = atoi(rgv[++i]); else if(rgv[i][2]=='=') param.max_insert=atoi(rgv[i]+3); else return i; break; case 'r': if(rgv[i][2]==0) param.report_repeat_hits = atoi(rgv[++i]); else if(rgv[i][2]=='=') param.report_repeat_hits=atoi(rgv[i]+3); else return i; if(param.report_repeat_hits>2||param.report_repeat_hits<0) { cerr<<"invalid -r value: "<<param.report_repeat_hits<<", must be 0, 1, or 2.\n"; exit(1); } break; case 'V': if(rgv[i][2]==0) param.verbose_level = atoi(rgv[++i]); else if(rgv[i][2]=='=') param.verbose_level=atoi(rgv[i]+3); else return i; if(param.verbose_level>2||param.verbose_level<0) { cerr<<"invalid -V value: "<<param.verbose_level<<", must be 0, 1, or 2.\n"; exit(1); } break; case 'I': if(rgv[i][2]==0) param.index_interval = atoi(rgv[++i]); else if(rgv[i][2]=='=') param.index_interval=atoi(rgv[i]+3); else return i; if(param.RRBS_flag) param.index_interval=1; if(param.index_interval>16) {cerr<<"index interval exceeds max value:16\n"; exit(1);} break; case 'k': if(rgv[i][2]==0) param.max_kmer_ratio=atof(rgv[++i]); else if(rgv[i][2]=='=') param.max_kmer_ratio=atof(rgv[i]+3); else return i; break; case 'v': double tmp_max_snp; if(rgv[i][2]==0) tmp_max_snp = atof(rgv[++i]); else if(rgv[i][2]=='=') tmp_max_snp=atof(rgv[i]+3); else return i; if(tmp_max_snp<1.0) { param.max_snp_num=(int)(tmp_max_snp*100+0.5)+100; if(param.max_snp_num==100) param.max_snp_num=0; } else{ param.max_snp_num=(int)(tmp_max_snp+0.5); if(param.max_snp_num>MAXSNPS) { cerr<<"warning: number of mismatches exceeds max value:"<<MAXSNPS<<endl; param.max_snp_num=MAXSNPS; } } break; case 'w': if(rgv[i][2]==0) param.max_num_hits = atoi(rgv[++i]); else if(rgv[i][2]=='=') param.max_num_hits=atoi(rgv[i]+3); else return i; if(param.max_num_hits>MAXHITS) {cerr<<"number of multi-hits exceeds max value:"<<MAXHITS<<endl; exit(1);} break; case 'q': if(rgv[i][2]==0) param.qual_threshold = atoi(rgv[++i]); else if(rgv[i][2]=='=') param.qual_threshold = atoi(rgv[i]+3); else return i; break; case 'f': if(rgv[i][2]==0) param.max_ns=atoi(rgv[++i]); else if(rgv[i][2]=='=') param.max_ns=atoi(rgv[i]+3); else return i; break; case 'z': if(rgv[i][2]==0) param.zero_qual=atoi(rgv[++i]); else if(rgv[i][2]=='=') param.zero_qual=atoi(rgv[i]+3); else return i; break; case 'p': if(rgv[i][2]==0) param.num_procs=atoi(rgv[++i]); else if(rgv[i][2]=='=') param.num_procs=atoi(rgv[i]+3); else return i; break; case 'A': if(rgv[i][2]==0) param.adapter[param.n_adapter++]=rgv[++i]; else if(rgv[i][2]=='=') param.adapter[param.n_adapter++]=rgv[i]+3; else return i; break; case 'R': if(rgv[i][2]==0) param.out_ref=1; else return i; break; case '3': if(rgv[i][2]==0) param.nt3=1; else return i; break; case 'H': if(rgv[i][2]==0) param.sam_header=0; else return i; break; case 'u': if(rgv[i][2]==0) param.out_unmap=1; else return i; break; case 'B': if(rgv[i][2]==0) param.read_start = max(atoi(rgv[++i]),1); else if(rgv[i][2]=='=') param.read_start=max(atoi(rgv[i]+3),1); else return i; break; case 'E': if(rgv[i][2]==0) param.read_end = atoi(rgv[++i]); else if(rgv[i][2]=='=') param.read_end=atoi(rgv[i]+3); else return i; break; case 'D': if(rgv[i][2]==0) param.SetDigestionSite(rgv[++i]); else if(rgv[i][2]=='=') param.SetDigestionSite(rgv[i]+3); else return i; break; case 'M': if(rgv[i][2]==0) {i++; param.SetAlign(rgv[i][0], rgv[i][1]);} else if(rgv[i][2]=='=') param.SetAlign(rgv[i][3], rgv[i][4]); else return i; break; case 'L': if(rgv[i][2]==0) param.max_readlen = atoi(rgv[++i]); else if(rgv[i][2]=='=') param.max_readlen = atoi(rgv[i]+3); else return i; break; case 'N': if(rgv[i][2]==0) param.N_mis=1; else return i; break; case 'S': if(rgv[i][2]==0) param.randseed = atoi(rgv[++i]); else if(rgv[i][2]=='=') param.randseed = atoi(rgv[i]+3); else return i; break; case 'h':usage(); //usage information default: return i; } } param.InitMapping(); return 0; } void check_ofile(string &filename, string &err_msg) { ofstream ff; ff.open(filename.c_str()); cerr.flush(); if(!ff) { cerr<<endl<<err_msg<<filename<<endl; exit(1); } } int check_ifile(string &filename, string &err_msg) { int byte1, byte2; ifstream ff; ff.open(filename.c_str()); cerr.flush(); if(!ff) { cerr<<endl<<err_msg<<filename<<endl; exit(1); } byte1=ff.get(); byte2=ff.get(); return ((byte1==0x1f)&&(byte2==0x8b)); // if input is gzip file or not } int check_ifile_format(string &filename, int gz_flag) { ifstream ff; igzstream gg; string s1,s2,s3,s4; char ch[1000]; if(gz_flag) { gg.open(filename.c_str()); gg>>s1; gg.getline(ch, 1000); if(s1[0]=='>') {message<<" \t(format: gzipped FASTA)\n"; info(1); return 0;} //fasta if(s1[0]=='@') {message<<" \t(format: gzipped FASTQ)\n"; info(1); return 1;} //fastq if(samopen(filename.c_str(), "rb", 0)!=0) {message<<" \t(format: BAM)\n"; info(1); return 3;} //BAM if(samopen(filename.c_str(), "r", 0)!=0) {message<<" \t(format: SAM)\n"; info(1); return 2;} //SAM } else { ff.open(filename.c_str()); ff>>s1; ff.getline(ch, 1000); if(s1[0]=='>') {message<<" \t(format: FASTA)\n"; info(1); return 0;} //fasta if(s1[0]=='@') {message<<" \t(format: FASTQ)\n"; info(1); return 1;} //fastq if(samopen(filename.c_str(), "rb", 0)!=0) {message<<" \t(format: BAM)\n"; info(1); return 3;} //BAM if(samopen(filename.c_str(), "r", 0)!=0) {message<<" \t(format: SAM)\n"; info(1); return 2;} //SAM } cerr<<"\t(format: unknown)\nUnknown input format.\n"; exit(1); } void RunProcess(void) { char _ch[256]; string _str="@HD\tVN:1.0\n"; if(out_align_file.size()>4){ if(out_align_file.compare(out_align_file.size()-4,4,".sam")==0) param.out_sam=1; else if (out_align_file.compare(out_align_file.size()-4,4,".bam")==0) param.out_sam=2; } else param.out_sam=1; if(param.max_snp_num<100) message<<"\tmax number of mismatches: "<<param.max_snp_num; else message<<"\tmax number of mismatches: read_length * "<<param.max_snp_num-100<<"% "; message<<" \tmax gap size: "<<param.gap<<" \tkmer cut-off ratio: "<<param.max_kmer_ratio<<" \tmax multi-hits: "<<param.max_num_hits<<endl; message<<"\tquality cutoff: "<<(int)param.qual_threshold<<" \tbase quality char: '"<<param.zero_qual<<"'"<<" \tmax Ns: "<<param.max_ns; if(param.max_readlen<(FIXELEMENT-1)*SEGLEN) message<<" \thard clip length: "<<param.max_readlen<<endl; if(param.pairend||param.RRBS_flag) message<<"\tmin fragment size: "<<param.min_insert<<" \tmax fragemt size: "<<param.max_insert<<endl; if(param.nt3) message<<"\t3-nucleotide alignment mode: "<<param.read_nt<<" <=> "<<param.ref_nt; else message<<"\twildcard alignment mode: "<<param.read_nt<<" in reads => "<<param.ref_nt<<" in reference"; message<<" \tseed size: "<<param.seed_size<<" \tindex interval: "<<param.index_interval<<endl; message<<"\talignment report: "; if(param.report_repeat_hits==0) message<<"unique alignments only"; if(param.report_repeat_hits==1) message<<"unique + one random multiple alignments"; if(param.report_repeat_hits==2) message<<"unique + all multiple alignments"; message<<endl; if(param.pairend){ message<<"\tmapping strand (read_1): ++,-+"; if(param.chains) message<< ",+-,--"; message<<"\tmapping strand (read_2): +-,--"; if(param.chains) message<< ",++,-+"; } else { message<<"\tmapping strand: ++,-+"; if(param.chains) message<< ",+-,--"; } message<<endl; for(bit32_t i=0; i<param.n_adapter;i++) message<<"\tadapter sequence #"<<i+1<<": "<<param.adapter[i]<<endl; if(param.RRBS_flag) { message<<"\tRRBS digestion site: "; for(bit32_t i=0; i<param.digest_site.size(); i++) message<<param.digest_site[i].substr(0,param.digest_pos[i])<<'-'<<param.digest_site[i].substr(param.digest_pos[i])<<" "; message<<endl; } info(2); string err_msg; //pair-end alignment if(param.pairend) { message<<"[bsmap] @"<<Curr_Time()<<" \tPair-end alignment("<<param.num_procs<<" threads),"; info(1); message<<" \tstart from read pair #"<<param.read_start; if(~param.read_end) message<<" \tend at read pair #"<<param.read_end; info(2); message<<endl; info(1); err_msg="failed to open read file #1 (check -a option): "; message<<"\tInput read file #1: "<<query_a_file; param.gz_input=check_ifile(query_a_file, err_msg); param.input_format=check_ifile_format(query_a_file, param.gz_input); read_a.InitIndex(fin_a, gzfin_a, query_a_file); err_msg="failed to open read file #2 (check -b option): "; message<<"\tInput read file #2: "<<query_b_file; if(param.gz_input!=check_ifile(query_b_file, err_msg)) { cerr<<"Input read file #1 and #2 should be both regular files or gzip files."<<endl; exit(1); } if(param.input_format!=check_ifile_format(query_b_file, param.gz_input)) { cerr<<"Input read file #1 and #2 should be in same format."<<endl; exit(1); } read_b.InitIndex(fin_b, gzfin_b, query_b_file); if(param.stdout) message<<"\tOutput: STDOUT\t (format: SAM)\n"; else { cerr<<"\tOutput file: "<<out_align_file; switch(param.out_sam) { case 0: message<<"\t (format: BSP)\n"; break; case 1: message<<"\t (format: SAM)\n"; break; case 2: message<<"\t (format: SAM, automatically convert to BAM)\n"; break; } err_msg="failed to open output file (check -o option): "; check_ofile(out_align_file, err_msg); if(param.out_sam==2) { string samtools_command="samtools view -bS - >"+out_align_file; if((pout=popen(samtools_command.c_str(),"w"))==NULL) { cerr.flush(); message<<"unable to creat samtools pipe, using post alignment conversion instead.\n"; fout.open(out_align_file.c_str()); } else param.pipe_out=1; } else fout.open(out_align_file.c_str()); } if(param.out_sam&¶m.sam_header) { for(bit32_t i=0;i<ref.total_num;i++){ sprintf(_ch,"@SQ\tSN:%s\tLN:%u\n",ref.title[i<<1].name.c_str(),ref.title[i<<1].size); _str.append(_ch); } sprintf(_ch,"@PG\tID:BSMAP\tVN:%s\tCL:\"%s\"\n",version,command_line.c_str()); _str.append(_ch); if(param.stdout) cout<<_str; else if(param.pipe_out) fwrite(_str.c_str(),1,_str.size(),pout); else fout<<_str; } n_aligned_pairs=n_aligned_a=n_aligned_b=0; info(1); Do_PairAlign(); if(param.pipe_out) pclose(pout); else fout.close(); if(param.input_format==3) { bam_destroy1(read_a.SAM_b); samclose(read_a.SAM_fp); bam_destroy1(read_b.SAM_b); samclose(read_b.SAM_fp); } message<<fixed<<setw(4)<<setprecision(1); message<<"[bsmap] @"<<Curr_Time()<<" \ttotal read pairs: "<<read_a._index-param.read_start+1<<" \ttotal time consumed: "<<Cal_AllTime()<<" secs\n"; message<<"\taligned pairs: "<<n_aligned_pairs<<" ("<<100.0*n_aligned_pairs/(read_a._index-param.read_start+1)<<"%), "; message<<"unique pairs: "<<n_unique_pairs<<" ("<<100.0*n_unique_pairs/(read_a._index-param.read_start+1)<<"%), "; if(param.report_repeat_hits==0) message<<"suppressed "; message<<"non-unique pairs: "<<n_multiple_pairs<<" ("<<100.0*n_multiple_pairs/(read_a._index-param.read_start+1)<<"%)\n"; message<<"\tunpaired read #1: "<<n_aligned_a<<" ("<<100.0*n_aligned_a/(read_a._index-param.read_start+1)<<"%), "; message<<"unique reads: "<<n_unique_a<<" ("<<100.0*n_unique_a/(read_a._index-param.read_start+1)<<"%), "; if(param.report_repeat_hits==0) message<<"suppressed "; message<<"non-unique reads: "<<n_multiple_a<<" ("<<100.0*n_multiple_a/(read_a._index-param.read_start+1)<<"%)\n"; message<<"\tunpaired read #2: "<<n_aligned_b<<" ("<<100.0*n_aligned_b/(read_b._index-param.read_start+1)<<"%), "; message<<"unique reads: "<<n_unique_b<<" ("<<100.0*n_unique_b/(read_b._index-param.read_start+1)<<"%), "; if(param.report_repeat_hits==0) message<<"suppressed "; message<<"non-unique reads: "<<n_multiple_b<<" ("<<100.0*n_multiple_b/(read_b._index-param.read_start+1)<<"%)\n"; info(1); } //single-read alignment else { message<<"[bsmap] @"<<Curr_Time()<<" \tSingle-end alignment("<<param.num_procs<<" threads),"; info(1); message<<" \tstart from read #"<<param.read_start; if(~param.read_end) message<<" \tend at read #"<<param.read_end; info(2); cerr<<endl; info(1); err_msg="failed to open read file (check -a option): "; message<<"\tInput read file: "<<query_a_file; param.gz_input=check_ifile(query_a_file, err_msg); param.input_format=check_ifile_format(query_a_file, param.gz_input); read_a.InitIndex(fin_a, gzfin_a, query_a_file); if(param.stdout) message<<"\tOutput: STDOUT\t (format: SAM)\n"; else { message<<"\tOutput file: "<<out_align_file; switch(param.out_sam) { case 0: message<<"\t (format: BSP)\n"; break; case 1: message<<"\t (format: SAM)\n"; break; case 2: message<<"\t (format: SAM, automatically convert to BAM)\n"; break; } err_msg="failed to open output file (check -o option): "; check_ofile(out_align_file, err_msg); if(param.out_sam==2) { string samtools_command="samtools view -bS - >"+out_align_file; if((pout=popen(samtools_command.c_str(),"w"))==NULL) { message.flush(); message<<"unable to creat samtools pipe, using post alignment conversion instead.\n"; fout.open(out_align_file.c_str()); } else param.pipe_out=1; } else fout.open(out_align_file.c_str()); } if(param.out_sam&¶m.sam_header) { char _ch[1000]; for(bit32_t i=0;i<ref.total_num;i++) { sprintf(_ch,"@SQ\tSN:%s\tLN:%u\n",ref.title[i<<1].name.c_str(),ref.title[i<<1].size); _str.append(_ch); } sprintf(_ch,"@PG\tID:BSMAP\tVN:%s\tCL:\"%s\"\n",version,command_line.c_str()); _str.append(_ch); if(param.stdout) cout<< _str; else if(param.pipe_out) fwrite(_str.c_str(),1,_str.size(),pout); else fout<<_str; } n_aligned=0; info(1); Do_SingleAlign(); if(param.pipe_out) pclose(pout); else fout.close(); if(param.input_format==3) { bam_destroy1(read_a.SAM_b); samclose(read_a.SAM_fp); } message<<fixed<<setw(4)<<setprecision(1); message<<"[bsmap] @"<<Curr_Time()<<" \ttotal reads: "<<read_a._index-param.read_start+1<<" \ttotal time: "<<Cal_AllTime()<<" secs\n"; message<<"\taligned reads: "<<n_aligned<<" ("<<100.0*n_aligned/(read_a._index-param.read_start+1)<<"%), "; message<<"unique reads: "<<n_unique<<" ("<<100.0*n_unique/(read_a._index-param.read_start+1)<<"%), "; if(param.report_repeat_hits==0) message<<"suppressed "; message<<"non-unique reads: "<<n_multiple<<" ("<<100.0*n_multiple/(read_a._index-param.read_start+1)<<"%)\n"; info(1); } }; int main(int argc, char *argv[]) { //print usage if (argc == 1) usage(); Initial_Time(); int noptions=mGetOptions(argc, argv); if(noptions) { cerr<<"unknown option: "<<argv[noptions]<<endl; exit(noptions); } message<<"\nBSMAP v"<<version<<endl; info(2); srand(time(NULL)); string err_msg="failed to open reference file (check -d option): "; message<<"[bsmap] @"<<Curr_Time()<<" \tloading reference file: "<<ref_file; info(1); param.gz_ref=check_ifile(ref_file, err_msg); if(check_ifile_format(ref_file, param.gz_ref)) { cerr<<"reference must be in FASTA format.\n"; exit(1); } if(param.gz_ref) gzfin_db.open(ref_file.c_str()); else fin_db.open(ref_file.c_str()); ref.Run_ConvertBinseq(fin_db, gzfin_db); message<<"[bsmap] @"<<Curr_Time()<<" \t"<<ref.total_num<<" reference seqs loaded, total size "<<ref.sum_length<<" bp. "<<Cal_AllTime()<<" secs passed"<<endl; info(1); Do_Formatdb(); ref_time=Cal_AllTime(); read_time=0; RunProcess(); /* cerr<<(read_a._index-param.read_start+1)<<" reads in "<<read_time<<" sec."<<endl; cerr.precision(1); cerr<<fixed<<1.0*(read_a._index-param.read_start+1)/read_time<<" read"; if(param.pairend) cerr<<" pairs per sec."<<endl; else cerr<<"s per sec."<<endl; */ if(param.out_sam==2&¶m.pipe_out==0){ char sys_cmd[PATH_MAX+20], abs_bam_file[PATH_MAX]; char *res=realpath(out_align_file.c_str(), abs_bam_file); if(res) { sprintf(sys_cmd,"sam2bam.sh %s",abs_bam_file); system(sys_cmd); } else { message<<"[bsmap] warning: cannot get absolute path for sam2bam.sh input. alignment file remains in SAM format.\n"; } } info(1); ref.ReleaseIndex(); return 0; }