+
+#ifdef GZSTREAM_NAMESPACE
+namespace GZSTREAM_NAMESPACE {
+#endif
+
+// ----------------------------------------------------------------------------
+// Internal classes to implement gzstream. See below for user classes.
+// ----------------------------------------------------------------------------
+
+class gzstreambuf : public std::streambuf {
+private:
+ static const int bufferSize = 47+256; // size of data buff
+ // totals 512 bytes under g++ for igzstream at the end.
+
+ gzFile file; // file handle for compressed file
+ char buffer[bufferSize]; // data buffer
+ char opened; // open/close state of stream
+ int mode; // I/O mode
+
+ int flush_buffer();
+public:
+ gzstreambuf() : opened(0) {
+ setp( buffer, buffer + (bufferSize-1));
+ setg( buffer + 4, // beginning of putback area
+ buffer + 4, // read position
+ buffer + 4); // end position
+ // ASSERT: both input & output capabilities will not be used together
+ }
+ int is_open() { return opened; }
+ gzstreambuf* open( const char* name, int open_mode);
+ gzstreambuf* close();
+ ~gzstreambuf() { close(); }
+
+ virtual int overflow( int c = EOF);
+ virtual int underflow();
+ virtual int sync();
+};
+
+class gzstreambase : virtual public std::ios {
+protected:
+ gzstreambuf buf;
+public:
+ gzstreambase() { init(&buf); }
+ gzstreambase( const char* name, int open_mode);
+ ~gzstreambase();
+ void open( const char* name, int open_mode);
+ void close();
+ gzstreambuf* rdbuf() { return &buf; }
+};
+
+// ----------------------------------------------------------------------------
+// User classes. Use igzstream and ogzstream analogously to ifstream and
+// ofstream respectively. They read and write files based on the gz*
+// function interface of the zlib. Files are compatible with gzip compression.
+// ----------------------------------------------------------------------------
+
+class igzstream : public gzstreambase, public std::istream {
+public:
+ igzstream() : std::istream( &buf) {}
+ igzstream( const char* name, int open_mode = std::ios::in)
+ : gzstreambase( name, open_mode), std::istream( &buf) {}
+ gzstreambuf* rdbuf() { return gzstreambase::rdbuf(); }
+ void open( const char* name, int open_mode = std::ios::in) {
+ gzstreambase::open( name, open_mode);
+ }
+};
+
+class ogzstream : public gzstreambase, public std::ostream {
+public:
+ ogzstream() : std::ostream( &buf) {}
+ ogzstream( const char* name, int mode = std::ios::out)
+ : gzstreambase( name, mode), std::ostream( &buf) {}
+ gzstreambuf* rdbuf() { return gzstreambase::rdbuf(); }
+ void open( const char* name, int open_mode = std::ios::out) {
+ gzstreambase::open( name, open_mode);
+ }
+};
+
+#ifdef GZSTREAM_NAMESPACE
+} // namespace GZSTREAM_NAMESPACE
+#endif
+
+#endif // GZSTREAM_H
+// ============================================================================
+// EOF //
+
diff --git a/gzstream/index.html b/gzstream/index.html
new file mode 100644
index 0000000..8a9ef8e
--- /dev/null
+++ b/gzstream/index.html
@@ -0,0 +1,145 @@
+
+Gzstream Library Home Page
+
+
+
+Gzstream Library Home Page
+
+
+
+
+ |
+
+ |
+
+
+
+ Introduction
+
+Gzstream is a small C++ library, basically just a wrapper,
+that provides the functionality of the
+zlib C-library in a C++ iostream.
+It is freely available under the LGPL license.
+
+Gzstream has been written by
+Deepak Bandyopadhyay and
+Lutz Kettner at
+the Computational
+Geometry Group at UNC Chapel Hill.
+
+
+
+ Supported Systems
+
+Gzstream requires a standard compliant C++ compiler (we use the new
+header file conventions and the new iostream in the std:: name space)
+and, of course, zlib. We used zlib 1.1.3 so far, but see the zlib home page for why you should
+upgrade to zlib 1.1.4. So, in theory, the provided sources could run
+on many platforms. However, we used only the following few
+platforms.
+
+
+
+ - PC Linux, RedHat 6.1, g++ version 2.95.2
+
- PC Linux, Debian, g++ version 2.95.2 and 3.1
+
- SGI Irix 6.5, MIPSpro CC version 7.30
+
+
+
+
+ Installation
+
+Either compile gzstream.C by hand, place it in some library,
+and move gzstream.h into the include search path of your
+compiler. Or use the provided Makefile, adapt its
+variables, and follow the remarks in the Makefile. Two
+test programs are provided, test_gzip.C and test_gunzip.C.
+The Makefile contains a rule that performs a small test
+with these programs.
+
+
+
+ Documentation
+
+The library provides two classes, igzstream and ogzstream,
+that can be used analogously to ifstream and ofstream
+respectively.
+
+The classes are by default in the global name space. This can
+be changed by setting the macro GZSTREAM_NAMESPACE to
+the desired name space, e.g., by setting the option
+-DGZSTREAM_NAMESPACE=gz in the Makefile.
+However, this needs to be consistent for both, the library compilation
+and the application that uses the library.
+
+
+
+ What's Missing
+
+
+ - Seek. The zlib library provides the necessary functionality,
+ but we have not realized that in the wrapper (yet? ;-).
+
- Both streams are based on the same streambuffer. So, they
+ cannot be used to derive an iogzstream class that would allow
+ simultaneous reading and writing to the same file.
+
+
+
+
+ Download and Release Notes
+
+
+ - Gzstream library 1.5 (08 Apr 2003):
+ gzstream.tgz
+ Fixed bug that did not set the state correctly on failure to open or
+ close a file.
+ Fixed bug in the indexing of the write buffer that
+ caused the write buffer to shrink continously and finally caused
+ wrong results when writing compressed files (only observed on some
+ platforms).
+
- Gzstream library 1.4 (27 Apr 2002):
+ Fixed a bug that stopped stream output after calling flush()
+ or using std::endl.
+
- Gzstream library 1.3 (06 Nov 2001):
+ Fixed unsigned char -- signed char bug. Increased buffer size
+ for better performance.
+
- Gzstream library 1.2 (04 Oct 2001):
+ Initial release as gzstream, renamed from zipstream.
+
- Zipstream library 1.1 (09 Sep 2001):
+ Initial release.
+
+
+
+ Acknowledgements
+
+Credits for finding bugs and improving this software go to:
+Vincent Ricard, Peter Milley, Peter J. Torelli, and Ares Lagae.
+
+
+
+ Links
+
+
+
+
+
+ The Computational Geometry Group at UNC Chapel Hill, Jan. 08, 2003.
+
+
+
+
diff --git a/gzstream/logo.gif b/gzstream/logo.gif
new file mode 100644
index 0000000..e259089
Binary files /dev/null and b/gzstream/logo.gif differ
diff --git a/gzstream/test_gunzip.C b/gzstream/test_gunzip.C
new file mode 100644
index 0000000..6c4a7d7
--- /dev/null
+++ b/gzstream/test_gunzip.C
@@ -0,0 +1,78 @@
+// ============================================================================
+// gzstream, C++ iostream classes wrapping the zlib compression library.
+// Copyright (C) 2001 Deepak Bandyopadhyay, Lutz Kettner
+//
+// This library is free software; you can redistribute it and/or
+// modify it under the terms of the GNU Lesser General Public
+// License as published by the Free Software Foundation; either
+// version 2.1 of the License, or (at your option) any later version.
+//
+// This library is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+// Lesser General Public License for more details.
+//
+// You should have received a copy of the GNU Lesser General Public
+// License along with this library; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+// ============================================================================
+//
+// File : test_gunzip.C
+// Revision : $Revision: 1.3 $
+// Revision_date : $Date: 2001/10/04 15:09:28 $
+// Author(s) : Deepak Bandyopadhyay, Lutz Kettner
+//
+// Short test program reading a file, uncompressing it, and writing it.
+// ============================================================================
+
+#include
+#include
+#include
+#include
+
+int main( int argc, char*argv[]) {
+ if ( argc != 3) {
+ std::cerr << "Usage: " << argv[0] <<" \n";
+ return EXIT_FAILURE;
+ }
+ // check alternate way of opening file
+ igzstream in2;
+ in2.open( argv[1]);
+ if ( ! in2.good()) {
+ std::cerr << "ERROR: Opening file `" << argv[1] << "' failed.\n";
+ return EXIT_FAILURE;
+ }
+ in2.close();
+ if ( ! in2.good()) {
+ std::cerr << "ERROR: Closing file `" << argv[1] << "' failed.\n";
+ return EXIT_FAILURE;
+ }
+ // now use the shorter way with the constructor to open the same file
+ igzstream in( argv[1]);
+ if ( ! in.good()) {
+ std::cerr << "ERROR: Opening file `" << argv[1] << "' failed.\n";
+ return EXIT_FAILURE;
+ }
+ std::ofstream out( argv[2]);
+ if ( ! out.good()) {
+ std::cerr << "ERROR: Opening file `" << argv[2] << "' failed.\n";
+ return EXIT_FAILURE;
+ }
+ char c;
+ while ( in.get(c))
+ out << c;
+ in.close();
+ out.close();
+ if ( ! in.eof()) {
+ std::cerr << "ERROR: Reading file `" << argv[1] << "' failed.\n";
+ return EXIT_FAILURE;
+ }
+ if ( ! out.good()) {
+ std::cerr << "ERROR: Writing file `" << argv[2] << "' failed.\n";
+ return EXIT_FAILURE;
+ }
+ return EXIT_SUCCESS;
+}
+
+// ============================================================================
+// EOF
diff --git a/gzstream/test_gzip.C b/gzstream/test_gzip.C
new file mode 100644
index 0000000..0c691ae
--- /dev/null
+++ b/gzstream/test_gzip.C
@@ -0,0 +1,78 @@
+// ============================================================================
+// gzstream, C++ iostream classes wrapping the zlib compression library.
+// Copyright (C) 2001 Deepak Bandyopadhyay, Lutz Kettner
+//
+// This library is free software; you can redistribute it and/or
+// modify it under the terms of the GNU Lesser General Public
+// License as published by the Free Software Foundation; either
+// version 2.1 of the License, or (at your option) any later version.
+//
+// This library is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+// Lesser General Public License for more details.
+//
+// You should have received a copy of the GNU Lesser General Public
+// License along with this library; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+// ============================================================================
+//
+// File : test_gzip.C
+// Revision : $Revision: 1.3 $
+// Revision_date : $Date: 2001/10/04 15:09:28 $
+// Author(s) : Deepak Bandyopadhyay, Lutz Kettner
+//
+// Short test program reading a file, compressing it, and writing it.
+// ============================================================================
+
+#include
+#include
+#include
+#include
+
+int main( int argc, char*argv[]) {
+ if ( argc != 3) {
+ std::cerr << "Usage: " << argv[0] <<" \n";
+ return EXIT_FAILURE;
+ }
+ // check alternate way of opening file
+ ogzstream out2;
+ out2.open( argv[2]);
+ if ( ! out2.good()) {
+ std::cerr << "ERROR: Opening file `" << argv[2] << "' failed.\n";
+ return EXIT_FAILURE;
+ }
+ out2.close();
+ if ( ! out2.good()) {
+ std::cerr << "ERROR: Closing file `" << argv[2] << "' failed.\n";
+ return EXIT_FAILURE;
+ }
+ // now use the shorter way with the constructor to open the same file
+ ogzstream out( argv[2]);
+ if ( ! out.good()) {
+ std::cerr << "ERROR: Opening file `" << argv[2] << "' failed.\n";
+ return EXIT_FAILURE;
+ }
+ std::ifstream in( argv[1]);
+ if ( ! in.good()) {
+ std::cerr << "ERROR: Opening file `" << argv[1] << "' failed.\n";
+ return EXIT_FAILURE;
+ }
+ char c;
+ while ( in.get(c))
+ out << c;
+ in.close();
+ out.close();
+ if ( ! in.eof()) {
+ std::cerr << "ERROR: Reading file `" << argv[1] << "' failed.\n";
+ return EXIT_FAILURE;
+ }
+ if ( ! out.good()) {
+ std::cerr << "ERROR: Writing file `" << argv[2] << "' failed.\n";
+ return EXIT_FAILURE;
+ }
+ return EXIT_SUCCESS;
+}
+
+// ============================================================================
+// EOF
diff --git a/gzstream/version b/gzstream/version
new file mode 100644
index 0000000..511137d
--- /dev/null
+++ b/gzstream/version
@@ -0,0 +1 @@
+1.5 (08 Jan 2003)
diff --git a/main.cpp b/main.cpp
new file mode 100644
index 0000000..aeba443
--- /dev/null
+++ b/main.cpp
@@ -0,0 +1,631 @@
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include "reads.h"
+#include "dbseq.h"
+#include "align.h"
+#include "param.h"
+#include "pairs.h"
+#include "utilities.h"
+
+#ifdef THREAD
+#include
+#endif
+
+using namespace std;
+
+//global variables
+Param param;
+string query_a_file;
+string query_b_file;
+string ref_file;
+string out_align_file;
+string command_line;
+
+ifstream fin_db; igzstream gzfin_db;
+ifstream fin_a; igzstream gzfin_a;
+ifstream fin_b; igzstream gzfin_b;
+ofstream fout;
+FILE *pout;
+ReadClass read_a, read_b;
+RefSeq ref;
+
+bit32_t n_aligned=0, n_unique=0, n_multiple=0; //number of reads aligned
+bit32_t n_aligned_pairs=0, n_unique_pairs=0, n_multiple_pairs=0; //number of pairs aligned
+bit32_t n_aligned_a=0, n_unique_a=0, n_multiple_a=0; //number of a reads aligned
+bit32_t n_aligned_b=0, n_unique_b=0, n_multiple_b=0; //number of b reads aligned
+bit32_t ref_time, read_time;
+bit16_t tid[64];
+char version[] = "2.90";
+ostringstream message;
+
+void info(int level) {
+ if(level<=param.verbose_level) cerr< pthread_ids(param.num_procs);
+ for(int i=0; i0) param.max_snp_num=2;
+ vector pthread_ids(param.num_procs);
+ //create
+ //cout < query a file, FASTA/FASTQ/BAM format\n"
+ <<" -d reference sequences file, FASTA format\n"
+ <<" -o output alignment file, BSP/SAM/BAM format, if omitted, the output will be written to STDOUT in SAM format.\n"
+ <<"\n Options for alignment:\n"
+ <<" -s seed size, default=16(WGBS mode), 12(RRBS mode). min=8, max=16.\n"
+ <<" -v if this value is between 0 and 1, it's interpreted as the mismatch rate w.r.t to the read length.\n"
+ <<" otherwise it's interpreted as the maximum number of mismatches allowed on a read, <="< gap size, BSMAP only allow 1 continuous gap (insert or deletion) with up to "< maximum number of equal best hits to count, <="< start from the Nth read or read pair, default: 1\n"
+ <<" -E end at the Nth read or read pair, default: 4,294,967,295\n"
+ <<" -I index interval, default="< set the cut-off ratio for over-represented kmers, default="< number of processors to use, default="< activating RRBS mapping mode and set restriction enzyme digestion sites. \n"
+ <<" digestion position marked by \'-\', example: -D C-CGG for MspI digestion.\n"
+ <<" default: none (whole genome shotgun bisulfite mapping mode)\n"
+ <<" -S seed for random number generation used in selecting multiple hits\n"
+ <<" other seed values generate pseudo random number based on read index number, to allow reproducible mapping results. \n"
+ <<" default="< set alignment information for the additional nucleotide transition. \n"
+ <<" is in the form of two different nucleotides N1N2, \n"
+ <<" indicating N1 in the reads could be mapped to N2 in the reference sequences.\n"
+ <<" default: -M TC, corresponds to C=>U(T) transition in bisulfite conversion. \n"
+ <<" example: -M GA could be used to detect A=>I(G) transition in RNA editing. \n"
+ <<"\n Options for trimming:\n"
+ <<" -q quality threshold in trimming, 0-40, default=0 (no trim)\n"
+ <<" -z base quality, default="<<(int) param.zero_qual<<" [Illumina is using 64, Sanger Institute is using 33]\n"
+ <<" -f filter low-quality reads containing >n Ns, default="< 3-end adapter sequence, default: none (no trim)\n"
+ <<" -L map the first N nucleotides of the read, default:"< query b file\n"
+ <<" -m minimal insert size allowed, default="< maximal insert size allowed, default="<MAXGAPS) {
+ cerr<<"warning: gap length exceeds max value:"<2||param.report_repeat_hits<0) {
+ cerr<<"invalid -r value: "<2||param.verbose_level<0) {
+ cerr<<"invalid -V value: "<16) {cerr<<"index interval exceeds max value:16\n"; exit(1);}
+ break;
+ case 'k': if(rgv[i][2]==0) param.max_kmer_ratio=atof(rgv[++i]); else if(rgv[i][2]=='=') param.max_kmer_ratio=atof(rgv[i]+3); else return i; break;
+ case 'v':
+ double tmp_max_snp;
+ if(rgv[i][2]==0) tmp_max_snp = atof(rgv[++i]); else if(rgv[i][2]=='=') tmp_max_snp=atof(rgv[i]+3); else return i;
+ if(tmp_max_snp<1.0) {
+ param.max_snp_num=(int)(tmp_max_snp*100+0.5)+100;
+ if(param.max_snp_num==100) param.max_snp_num=0;
+ }
+ else{
+ param.max_snp_num=(int)(tmp_max_snp+0.5);
+ if(param.max_snp_num>MAXSNPS) {
+ cerr<<"warning: number of mismatches exceeds max value:"<MAXHITS) {cerr<<"number of multi-hits exceeds max value:"<>s1; gg.getline(ch, 1000);
+ if(s1[0]=='>') {message<<" \t(format: gzipped FASTA)\n"; info(1); return 0;} //fasta
+ if(s1[0]=='@') {message<<" \t(format: gzipped FASTQ)\n"; info(1); return 1;} //fastq
+ if(samopen(filename.c_str(), "rb", 0)!=0) {message<<" \t(format: BAM)\n"; info(1); return 3;} //BAM
+ if(samopen(filename.c_str(), "r", 0)!=0) {message<<" \t(format: SAM)\n"; info(1); return 2;} //SAM
+ }
+ else {
+ ff.open(filename.c_str());
+ ff>>s1; ff.getline(ch, 1000);
+ if(s1[0]=='>') {message<<" \t(format: FASTA)\n"; info(1); return 0;} //fasta
+ if(s1[0]=='@') {message<<" \t(format: FASTQ)\n"; info(1); return 1;} //fastq
+ if(samopen(filename.c_str(), "rb", 0)!=0) {message<<" \t(format: BAM)\n"; info(1); return 3;} //BAM
+ if(samopen(filename.c_str(), "r", 0)!=0) {message<<" \t(format: SAM)\n"; info(1); return 2;} //SAM
+ }
+ cerr<<"\t(format: unknown)\nUnknown input format.\n";
+ exit(1);
+}
+
+void RunProcess(void) {
+ char _ch[256]; string _str="@HD\tVN:1.0\n";
+ if(out_align_file.size()>4){
+ if(out_align_file.compare(out_align_file.size()-4,4,".sam")==0) param.out_sam=1;
+ else if (out_align_file.compare(out_align_file.size()-4,4,".bam")==0) param.out_sam=2;
+ }
+ else param.out_sam=1;
+
+ if(param.max_snp_num<100) message<<"\tmax number of mismatches: "< "< "< "
+parser = optparse.OptionParser(usage=usage)
+
+parser.add_option("-o", "--out", dest="outfile", metavar="FILE", help="output differential methylation file name. (required)", default="")
+parser.add_option("-d", "--ref", dest="reffile", metavar="FILE", help="reference genome fasta file. (required)", default="")
+parser.add_option("-b", "--bin", dest="bin", type="int", metavar='BIN', help="bin size. [default: 100]", default=100)
+parser.add_option("-p", "--pval", dest="pval", type="float", metavar='PVAL', help="p-value cut-off. [default: 0.01]", default=0.01)
+parser.add_option("-r", "--diff", dest="diff", type="float", metavar='DIFF', help="minimal abs meth ratio difference. [default: 0.33]", default=0.33)
+parser.add_option("-x", "--context", dest="context", metavar='TYPE', help="methylation pattern type [CG|CHG|CHH], multiple types separated by ','. [default: all]", default='')
+parser.add_option("-l", "--labels", dest="labels", metavar='LABELS', help="output label for each group, separated by ','. [default: filenames]", default='')
+parser.add_option("-m", "--min-depth", dest="min_depth", type="int", metavar='FOLD', help="minimal average coverage. [default: 1]", default=1.)
+parser.add_option("-s", "--strand", dest="strand", metavar='STRAND', help="which strand to process, [both|forward|reverse]. [default: both]", default='both')
+parser.add_option("-q", "--quiet", action="store_true", dest="quiet", help="don't print progress on stderr.", default=False)
+
+options, infiles = parser.parse_args()
+
+assert options.reffile, "Require reference file name, use -d/--ref option."
+assert options.outfile, "Require output file name, use -o/--out option."
+assert len(infiles) >= 2, "Require at least two METHRATIO_FILE groups."
+infiles = [x.split(',') for x in infiles]
+if len(options.context) > 0: options.context = options.context.split(',')
+if len(options.labels) == 0: labels = ['group%d' % i for i in xrange(len(infiles))]
+else: labels = options.labels.split(',')
+assert len(labels) == len(infiles), "# labels not equal to # groups."
+strand_dict = {'both': ('+-', 'CG'), 'forward': ('+', 'C'), 'reverse': ('-', 'G')}
+assert options.strand in strand_dict, "Invalid strand: %s" % options.strand
+strands, nts = strand_dict[options.strand]
+
+PVAL = [1,0.2,0.1,0.05,0.02,0.01,0.005,0.002,0.001,5e-4,2e-4,1e-4,5e-05,2e-05,1e-05,5e-06,1e-06,5e-07,1e-07,5e-08,1e-08,5e-09,1e-09,5e-10,1e-10,1e-11,\
+ 1e-12,1e-13,1e-14,1e-15,1e-16,1e-17,1e-18,1e-19,1e-20,1e-22,1e-24,1e-26,1e-28,1e-30,1e-32,1e-34,1e-36,1e-38,1e-40,1e-42,1e-44,1e-46,1e-48,1e-50,1e-55,\
+ 1e-60,1e-65,1e-70,1e-75,1e-80,1e-85,1e-90,1e-95,1e-100,1e-110,1e-120,1e-130,1e-140,1e-150,1e-160,1e-170,1e-180,1e-190,1e-200,1e-220,1e-240,1e-260,1e-280,1e-300,0]
+ZVAL = [0.,1.28,1.65,1.96,2.33,2.58,2.81,3.09,3.29,3.48,3.72,3.89,4.06,4.26,4.42,4.56,4.89,5.03,5.33,5.45,5.73,5.85,6.11,6.22,6.47,6.81,7.13,7.44,7.74,\
+ 8.03,8.30,8.57,8.84,9.09,9.34,9.81,10.27,10.70,11.12,11.52,11.91,12.29,12.66,13.02,13.36,13.70,14.03,14.35,14.67,14.98,15.73,16.44,17.12,17.78,18.41,\
+ 19.03,19.62,20.20,20.76,21.31,22.36,23.36,24.33,25.25,26.15,27.01,27.85,28.67,29.46,30.23,31.71,33.13,34.49,35.80,37.07,40.]
+
+zleft, zright = 0, len(PVAL)-1
+while zleft < zright - 1:
+ zmid = (zleft + zright) / 2
+ if PVAL[zmid] < options.pval: zright = zmid
+ else: zleft = zmid
+z0 = ZVAL[zleft]
+
+def disp(txt, nt=0):
+ if not options.quiet: print >> sys.stderr, '[methdiff] @%s \t%s' %(time.asctime(), txt)
+
+disp('reading reference file: %s ...' % options.reffile)
+ref, cr = {}, ''
+for line in open(options.reffile):
+ if line[0] == '>':
+ if cr != '': ref[cr] = refcr.upper()
+ cr, refcr = line[1:].strip(), ''
+ else: refcr += line[:-1]
+
+ref[cr] = refcr.upper()
+
+def get_chrom(group, sample, cr):
+ line, fin = lines[group][sample], fins[group][sample]
+ mm, dd = meth[group], depth[group]
+ while len(line) > 0:
+ col = line.split('\t', 7)
+ if col[2] not in strands:
+ line = fin.readline()
+ continue
+ if col[0] != cr: break
+ pos = int(col[1]) - 1
+ if len(options.context) > 0:
+ if col[3] not in options.context:
+ line = fin.readline()
+ continue
+ mm[pos/options.bin] += float(col[6])
+ dd[pos/options.bin] += float(col[5])
+ line = fin.readline()
+ lines[group][sample] = line
+ return
+
+def conf_intv(m, d, z):
+ p, z2 = m / d, z * z
+ pmid = p + z2 / (2 * d)
+ span = z * (p * (1 - p) / d + z2 / (4 * d * d)) ** 0.5
+ denorm = 1 + z2 / d
+ return ((pmid-span)/denorm, (pmid+span)/denorm)
+
+def get_pval(m0, d0, m1, d1):
+ l0, u0 = conf_intv(m0, d0, z0)
+ l1, u1 = conf_intv(m1, d1, z0)
+ if l0 < u1 and l1 < u0: return 1
+ left, right = zleft, len(ZVAL)-1
+ while right - left > 1:
+ mid = (left + right) / 2
+ zm = ZVAL[mid]
+ l0, u0 = conf_intv(m0, d0, zm)
+ l1, u1 = conf_intv(m1, d1, zm)
+ if l0 >= u1 or l1 >= u0: left = mid
+ else: right = mid
+ return PVAL[left]
+
+def cmp_chrom(cr):
+ m, d = [0. for i in infiles], [0. for i in infiles]
+ refcr = ref[cr]
+ for pos in xrange(len(refcr)/options.bin+1):
+ for i in xrange(len(infiles)):
+ m[i] = meth[i][pos]
+ d[i] = depth[i][pos]
+ if d[0] * d[1] == 0.: continue
+ r0, r1 = m[0]/d[0], m[1]/d[1]
+ if abs(r0-r1) < options.diff: continue
+ start, end = pos*options.bin, (pos+1)*options.bin
+ nc = sum(refcr.count(nt, start, end) for nt in nts)
+ if d[0] / nc < options.min_depth or d[1] / nc < options.min_depth: continue
+ pval = get_pval(m[0], d[0], m[1], d[1])
+ if pval > options.pval: continue
+ fout.write('%s\t%d\t%d\t%.1g\t%.3f\t%.3f\t%.1f\t%.1f\t%.3f\t%.1f\t%.1f\n' % (cr,start+1,end,pval,r0-r1,r0,d[0],m[0],r1,d[1],m[1]))
+
+def xopen(filename):
+ if filename[-3:].lower() == '.gz': return os.popen('zcat %s' % filename)
+ else: return open(filename)
+
+meth = [0 for infile in infiles]
+depth = [0 for infile in infiles]
+fins = [[xopen(infile) for infile in group] for group in infiles]
+lines = [[fin.readline() for fin in group] for group in fins]
+fout = open(options.outfile, 'w')
+name0, name1 = labels[0], labels[1]
+fout.write('cr\tstart\tend\tp_value\tdiff_ratio\t%s_ratio\t%s_depth\t%s_meth\t%s_ratio\t%s_depth\t%s_meth\n' % (name0,name0,name0,name1,name1,name1))
+for cr in sorted(ref.keys()):
+ disp('processing %s ...' % cr)
+ for g, group in enumerate(infiles):
+ meth[g] = array.array('f', [0.]) * (len(ref[cr]) / options.bin + 1)
+ depth[g] = array.array('f', [0.]) * (len(ref[cr]) / options.bin + 1)
+ for i in xrange(len(group)): get_chrom(g, i, cr)
+ cmp_chrom(cr)
+
+fout.close()
+disp('done.')
diff --git a/methratio.py b/methratio.py
new file mode 100755
index 0000000..dd249c1
--- /dev/null
+++ b/methratio.py
@@ -0,0 +1,257 @@
+#! /usr/bin/env python
+import sys, time, os, array, optparse
+usage = "usage: %prog [options] BSMAP_MAPPING_FILES"
+parser = optparse.OptionParser(usage=usage)
+
+parser.add_option("-o", "--out", dest="outfile", metavar="FILE", help="output methylation ratio file name. [default: STDOUT]", default="")
+parser.add_option("-O", "--alignment-copy", dest="alignfile", metavar="FILE", help="save a copy of input alignment for BSMAP pipe input. (in BAM format) [default: none]", default="")
+parser.add_option("-w", "--wig", dest="wigfile", metavar="FILE", help="output methylation ratio wiggle file. [default: none]", default="")
+parser.add_option("-b", "--wig-bin", dest="wigbin", type="int", metavar='BIN', help="wiggle file bin size. [default: 25]", default=25)
+parser.add_option("-d", "--ref", dest="reffile", metavar="FILE", help="reference genome fasta file. (required)", default="")
+parser.add_option("-c", "--chr", dest="chroms", metavar="CHR", help="process only specified chromosomes, separated by ','. [default: all]\nexample: --chroms=chr1,chr2", default=[])
+parser.add_option("-s", "--sam-path", dest="sam_path", metavar="PATH", help="path to samtools. [default: none]", default='')
+parser.add_option("-u", "--unique", action="store_true", dest="unique", help="process only unique mappings/pairs.", default=False)
+parser.add_option("-p", "--pair", action="store_true", dest="pair", help="process only properly paired mappings.", default=False)
+parser.add_option("-z", "--zero-meth", action="store_true", dest="meth0", help="report loci with zero methylation ratios. (depreciated, -z will be always enabled)", default=True)
+parser.add_option("-q", "--quiet", action="store_true", dest="quiet", help="don't print progress on stderr.", default=False)
+parser.add_option("-r", "--remove-duplicate", action="store_true", dest="rm_dup", help="remove duplicated reads.", default=False)
+parser.add_option("-t", "--trim-fillin", dest="trim_fillin", type="int", metavar='N', help="trim N end-repairing fill-in nucleotides. [default: 0]", default=0)
+parser.add_option("-g", "--combine-CpG", action="store_true", dest="combine_CpG", help="combine CpG methylaion ratios on both strands.", default=False)
+parser.add_option("-m", "--min-depth", dest="min_depth", type="int", metavar='FOLD', help="report loci with sequencing depth>=FOLD. [default: 1]", default=1)
+parser.add_option("-n", "--no-header", action="store_true", dest="no_header", help="don't print a header line", default=False)
+parser.add_option("-i", "--ct-snp", dest="CT_SNP", help='how to handle CT SNP ("no-action", "correct", "skip"), default: "correct".', default="correct")
+parser.add_option("-x", "--context", dest="context", metavar='TYPE', help="methylation pattern type [CG|CHG|CHH], multiple types separated by ','. [default: all]", default='')
+
+options, infiles = parser.parse_args()
+
+if len(options.reffile) == 0: parser.error("Missing reference file, use -d or --ref option.")
+if len(infiles) == 0: parser.error("Require at least one BSMAP_MAPPING_FILE.")
+if len(options.chroms) > 0: options.chroms = options.chroms.split(',')
+CT_SNP_val = {"no-action": 0, "correct": 1, "skip": 2}
+try: options.CT_SNP = CT_SNP_val[options.CT_SNP.lower()]
+except: parser.error('Invalid -i value, select "no-action", "correct" or "skip"')
+if options.min_depth <= 0: parser.error('Invalid -m value, must >= 1')
+if options.trim_fillin < 0: parser.error('Invalid -t value, must >= 0')
+if len(options.context) > 0: options.context = options.context.split(',')
+
+if len(options.sam_path) > 0:
+ if options.sam_path[-1] != '/': options.sam_path += '/'
+
+def disp(txt, nt=0):
+ if not options.quiet: print >> sys.stderr, '[methratio] @%s \t%s' %(time.asctime(), txt)
+
+if len(options.outfile) == 0: disp("Missing output file name, write to STDOUT.")
+def get_alignment(line):
+ col = line.split('\t')
+ if sam_format:
+ if line[0] == '@': return []
+ flag = col[1]
+ if 'u' in flag: return []
+ if options.unique and 's' in flag: return []
+ if options.pair and 'P' not in flag: return []
+ cr, pos, cigar, seq, strand, insert = col[2], int(col[3])-1, col[5], col[9], '', int(col[8])
+ if cr not in options.chroms: return []
+ strand_index = line.find('ZS:Z:')
+ assert strand_index >= 0, 'missing strand information "ZS:Z:xx"'
+ strand = line[strand_index+5:strand_index+7]
+ gap_pos, gap_size = 0, 0
+ while 'I' in cigar or 'D' in cigar:
+ for sep in 'MID':
+ try: gap_size = int(cigar.split(sep, 1)[0])
+ except ValueError: continue
+ break
+ if sep == 'M': gap_pos += gap_size
+ elif sep == 'I': seq = seq[:gap_pos] + seq[gap_pos+gap_size:]
+ elif sep == 'D':
+ seq = seq[:gap_pos] + '-' * gap_size + seq[gap_pos:]
+ gap_pos += gap_size
+ cigar = cigar[cigar.index(sep)+1:]
+ else:
+ flag = col[3][:2]
+ if flag == 'NM' or flag == 'QC': return []
+ if options.unique and flag != 'UM': return []
+ if options.pair and col[7] == '0': return []
+ seq, strand, cr, pos, insert, mm = col[1], col[6], col[4], int(col[5])-1, int(col[7]), col[9]
+ if cr not in options.chroms: return []
+ if ':' in mm:
+ tmp = mm.split(':')
+ gap_pos, gap_size = int(tmp[1]), int(tmp[2])
+ if gap_size < 0: seq = seq[:gap_pos] + seq[gap_pos-gap_size:] # insertion on reference
+ else: seq = seq[:gap_pos] + '-' * gap_size + seq[gap_pos:]
+ if pos + len(seq) >= len(ref[cr]): return []
+ if options.rm_dup: # remove duplicate hits
+ if strand == '+-' or strand == '-+': frag_end, direction = pos+len(seq), 2
+ else: frag_end, direction = pos, 1
+ if coverage[cr][frag_end] & direction: return []
+ coverage[cr][frag_end] |= direction
+ if options.trim_fillin > 0: # trim fill in nucleotides
+ if strand == '+-' or strand == '-+': seq = seq[:-options.trim_fillin]
+ elif strand == '++' or strand == '--': seq, pos = seq[options.trim_fillin:], pos+options.trim_fillin
+ if sam_format and insert > 0: seq = seq[:int(col[7])-1-pos] # remove overlapped regions in paired hits, SAM format only
+ return (seq, strand[0], cr, pos)
+
+# open pipes to alignment files
+pipes = []
+for infile in infiles:
+ nline = 0
+ if infile.strip() == '-': sam_format, fin, infile = True, os.popen('%ssamtools view -XSh -' % options.sam_path), 'STDIN'
+ elif infile[-4:].upper() == '.SAM': sam_format, fin = True, os.popen('%ssamtools view -XS %s' % (options.sam_path, infile))
+ elif infile[-4:].upper() == '.BAM': sam_format, fin = True, os.popen('%ssamtools view -X %s' % (options.sam_path, infile))
+ else: sam_format, fin = False, open(infile)
+ pipes.append((sam_format,fin))
+
+# Read in chromosomes
+ref, cr, seq = {}, '', ''
+disp('loading reference file: %s ...' % options.reffile)
+for line in open(options.reffile):
+ if line[0] == '>':
+ if len(cr) > 0:
+ if len(options.chroms) == 0 or cr in options.chroms: ref[cr] = seq.upper()
+ cr, seq = line[1:-1].split()[0], ''
+ else: seq += line.strip()
+
+if len(options.chroms) == 0 or cr in options.chroms: ref[cr] = seq.upper()
+del seq
+
+meth, depth, coverage, meth1, depth1 = {}, {}, {}, {}, {}
+for cr in ref:
+ meth[cr] = array.array('H', [0]) * len(ref[cr])
+ depth[cr] = array.array('H', [0]) * len(ref[cr])
+ if options.rm_dup: coverage[cr] = array.array('B', [0]) * len(ref[cr])
+ if options.CT_SNP > 0:
+ meth1[cr] = array.array('H', [0]) * len(ref[cr])
+ depth1[cr] = array.array('H', [0]) * len(ref[cr])
+
+options.chroms = set(ref.keys())
+BS_conversion = {'+': ('C','T','G','A'), '-': ('G','A','C','T')}
+nmap = 0
+for sam_format, fin in pipes:
+ nline = 0
+ if len(options.alignfile) > 0: pout = os.popen('%ssamtools view -bS - > %s' % (options.sam_path, options.alignfile), 'w')
+ for line in fin:
+ if len(options.alignfile) > 0: pout.write(line)
+ nline += 1
+ if nline % 10000000 == 0: disp('read %d lines' % nline, nt=1)
+ map_info = get_alignment(line)
+ if len(map_info) == 0: continue
+ seq, strand, cr, pos = map_info
+ depthcr = depth[cr]
+ pos2 = pos + len(seq)
+ nmap += 1
+ methcr = meth[cr]
+ refseq = ref[cr]
+ match, convert, rc_match, rc_convert = BS_conversion[strand]
+ index = refseq.find(match, pos, pos2)
+ while index >= 0:
+ if seq[index-pos] == convert:
+ try: depthcr[index] += 1
+ except OverflowError: depthcr[index] = 65535
+ elif seq[index-pos] == match and depthcr[index] < 65535:
+ depthcr[index] += 1
+ methcr[index] += 1
+ index = refseq.find(match, index+1, pos2)
+ if options.CT_SNP == 0: continue
+ methcr1 = meth1[cr]
+ depthcr1 = depth1[cr]
+ index = refseq.find(rc_match, pos, pos2)
+ while index >= 0:
+ if seq[index-pos] == rc_convert:
+ try: depthcr1[index] += 1
+ except OverflowError: depthcr1[index] = 65535
+ elif seq[index-pos] == rc_match and depthcr1[index] < 65535:
+ depthcr1[index] += 1
+ methcr1[index] += 1
+ index = refseq.find(rc_match, index+1, pos2)
+
+ fin.close()
+ if len(options.alignfile) > 0: pout.close()
+
+disp('read %d lines' % nline, nt=1)
+if options.combine_CpG:
+ disp('combining CpG methylation from both strands ...')
+ for cr in depth:
+ depthcr, methcr, refcr = depth[cr], meth[cr], ref[cr]
+ if options.CT_SNP > 0: depthcr1, methcr1 = depth1[cr], meth1[cr]
+ pos = refcr.find('CG')
+ while pos >= 0:
+ try:
+ depthcr[pos] += depthcr[pos+1]
+ methcr[pos] += methcr[pos+1]
+ except OverflowError:
+ depthcr[pos] = (depthcr[pos] + depthcr[pos+1]) / 2
+ methcr[pos] = (methcr[pos] + methcr[pos+1]) / 2
+ depthcr[pos+1] = 0
+ methcr[pos+1] = 0
+ if options.CT_SNP > 0:
+ try:
+ depthcr1[pos] += depthcr1[pos+1]
+ methcr1[pos] += methcr1[pos+1]
+ except OverflowError:
+ depthcr1[pos] = (depthcr1[pos] + depthcr1[pos+1]) / 2
+ methcr1[pos] = (methcr1[pos] + methcr1[pos+1]) / 2
+ pos = refcr.find('CG', pos+2)
+
+if len(options.outfile) == 0: fout, outfile = sys.stdout, 'STDOUT'
+else: fout = open(options.outfile, 'w')
+disp('writing %s ...' % options.outfile)
+if options.wigfile:
+ fwig = open(options.wigfile, 'w')
+ fwig.write('track type=wiggle_0\n')
+if not options.no_header:
+ fout.write('chr\tpos\tstrand\tcontext\tratio\teff_CT_count\tC_count\tCT_count\trev_G_count\trev_GA_count\tCI_lower\tCI_upper\n')
+z95, z95sq = 1.96, 1.96 * 1.96
+nc, nd, dep0 = 0, 0, options.min_depth
+for cr in sorted(depth.keys()):
+ depthcr, methcr, refcr = depth[cr], meth[cr], ref[cr]
+ if options.CT_SNP > 0: depthcr1, methcr1 = depth1[cr], meth1[cr]
+ if options.wigfile:
+ fwig.write('variableStep chrom=%s span=%d\n' % (cr, options.wigbin))
+ bin = wigd = wigm = 0
+ for i, dd in enumerate(depthcr):
+ if dd < dep0: continue
+ if options.CT_SNP > 0:
+ m1, d1 = methcr1[i], depthcr1[i]
+ if m1 != d1:
+ if options.CT_SNP == 2: continue
+ d = float(dd) * m1 / d1
+ else: d = float(dd)
+ else: d = float(dd)
+ if refcr[i] == 'C':
+ strand = '+'
+ try:
+ if refcr[i+1] == 'G': seq = 'CG'
+ elif refcr[i+2] == 'G': seq = 'CHG'
+ else: seq = 'CHH'
+ except IndexError: continue
+ else:
+ strand = '-'
+ if i == 0: continue
+ if refcr[i-1] == 'C': seq = 'CG'
+ else:
+ if i == 1: continue
+ if refcr[i-2] == 'C': seq = 'CHG'
+ else: seq = 'CHH'
+ if len(options.context) > 0:
+ if seq not in options.context: continue
+ m = methcr[i]
+ try: ratio = min(m, d) / d
+ except ZeroDivisionError: continue
+ nc += 1
+ nd += d
+ if options.wigfile:
+ if i / options.wigbin != bin:
+ if wigd > 0: fwig.write('%d\t%.3f\n' % (bin*options.wigbin+1, min(wigm/wigd,1)))
+ bin = i / options.wigbin
+ wigd = wigm = 0.
+ wigd += d
+ wigm += m
+ pmid = ratio + z95sq / (2 * d)
+ sd = z95 * ((ratio*(1-ratio)/d + z95sq/(4*d*d)) ** 0.5)
+ denorminator = 1 + z95sq / d
+ CIl, CIu = (pmid - sd) / denorminator, (pmid + sd) / denorminator
+ if options.CT_SNP: fout.write('%s\t%d\t%c\t%s\t%.3f\t%.2f\t%d\t%d\t%d\t%d\t%.3f\t%.3f\n' % (cr, i+1, strand, seq, ratio, d, m, dd, m1, d1, CIl, CIu))
+ else: fout.write('%s\t%d\t%c\t%s\t%.3f\t%.2f\t%d\t%d\tNA\tNA\t%.3f\t%.3f\n' % (cr, i+1, strand, seq, ratio, d, m, dd, CIl, CIu))
+
+if options.outfile != 'STDOUT': fout.close()
+if options.wigfile: fwig.close()
+disp('total %d valid mappings, %d covered cytosines, average coverage: %.2f fold.' % (nmap, nc, float(nd)/nc))
diff --git a/pairs.cpp b/pairs.cpp
new file mode 100644
index 0000000..ea66a41
--- /dev/null
+++ b/pairs.cpp
@@ -0,0 +1,575 @@
+#include
+#include "pairs.h"
+
+using namespace std;
+
+extern Param param;
+
+PairAlign::PairAlign()
+{
+ n_aligned_pairs=n_aligned_a=n_aligned_b=0;
+ n_unique_pairs=n_unique_a=n_unique_b=0;
+ n_multiple_pairs=n_multiple_a=n_multiple_b=0;
+ pairhits = new PairArray[2*MAXSNPS+1];
+ _str_align.reserve(BatchNum*1024);
+ rand_rSeed=getpid()*time(NULL);
+}
+
+PairAlign::~PairAlign() {
+ delete [] pairhits;
+}
+
+void PairAlign::ImportBatchReads(bit32_t n, vector &a1, vector &a2)
+{
+ _sa.ImportBatchReads(n, a1);
+ _sb.ImportBatchReads(n, a2);
+ num_reads=n;
+}
+
+int PairAlign::GetPairs(bit32_t na, bit32_t nb)
+{
+ bit32_t i,j,insert_size,bstart,bend,npair=0;
+ ref_loc_t seg_start,seg_end;
+ if(na>_sa.read_max_snp_num||nb>_sb.read_max_snp_num) return 0;
+ PairHit pp;
+ pp.na=na;pp.nb=nb;
+ //_sa.SortHits4PE(na);
+ //_sb.SortHits4PE(nb);
+ ref_id_t chra;
+
+ //cout<<"na:"<=chra) break;
+ for(bend=bstart;bend<_sb._cur_n_chit[nb];bend++) if(_sb.chits[nb][bend].chr>chra) break;
+ }
+ //cout<<"\tchr:"<<(int)chra<<"\tbstart:"<seq.size();}
+ else {seg_start=_sa.hits[na][i].loc; seg_end=_sb.chits[nb][j].loc+_sb._pread->seq.size();}
+ insert_size=seg_end-seg_start;
+ //cout<<"seg_start:"<=param.min_insert)&&(insert_size<=param.max_insert)) {
+ //cout << "a: "<< (int)_sa.hits[na][i].chr << "\tb: "<<(int)_sb.chits[nb][j].chr<<"\tinsert: "<=param.max_num_hits) return npair;
+ }
+ }
+ }
+
+ //a- vs b+
+ pp.chain=1; chra=~0; bstart=0; bend=0;
+ //cout<<"a1b0\n";
+ for(i=0;i<_sa._cur_n_chit[na]; i++) {
+ if(chra!=_sa.chits[na][i].chr){
+ chra=_sa.chits[na][i].chr;
+ for(bstart=bend;bstart<_sb._cur_n_hit[nb];bstart++) if(_sb.hits[nb][bstart].chr>=chra) break;
+ for(bend=bstart;bend<_sb._cur_n_hit[nb];bend++) if(_sb.hits[nb][bend].chr>chra) break;
+ }
+ //cout<<"\tchr:"<seq.size();}
+ else {seg_start=_sa.chits[na][i].loc; seg_end=_sb.hits[nb][j].loc+_sb._pread->seq.size();}
+ insert_size=seg_end-seg_start;
+ //cout<<"seg_start:"<=param.min_insert)&&(insert_size<=param.max_insert)) {
+ //cout << "a: "<< (int)_sa.chits[na][i].chr << "\tb: "<<(int)_sb.hits[nb][j].chr<<"\tinsert: "<