NCSv2.xml

<?xml version="1.0"?>
<?xml-stylesheet type="text/css" href="http://deep.mpi-inf.mpg.de/DAC/files/style/deep_process_style.css"?>
<process>
  <name>NCS</name>
  <version>2</version>
  <author>
    <name>Karl Nordström</name>
    <email>karl.nordstroem@uni-saarland.de</email>
  </author>
  <description>The NCSv2 pipeline applies a slightly extended bis-SNP pipeline. It is parallelized over regions defined by the {ASSEMBLY} parameter, with one region for each normal chromosome. The non-localized sequences and the mitochondria are considered as one region. The steps applied are: 1. Reordering and extraction of mapped reads, 2. Local realignment at known variations, 3. Clipping of overlapping parts between paired end reads, 4. Quality recalibration, 5. Methylation calls, 6. Filtering of calls, 7. Merging and generation of output data, 8. Generation of technical metadata</description>
  <inputs>
    <filetype>
      <identifier>INPUTFILE</identifier>
      <format>BALNv0.bam</format>
      <quantity>single</quantity>
      <comment>No comment</comment>
    </filetype>
    <filetype>
        <identifier>INPUTFILE_PICARDDUPMETRICS</identifier>
        <format>BALNv0.PicardMarkDupmetrics.txt</format>
        <quantity>single</quantity>
        <comment>No comment</comment>
    </filetype>
    <filetype>
        <identifier>INPUTFILE_FLAGSTATS</identifier>
        <format>BALNv0.flagstats.txt</format>
        <quantity>single</quantity>
        <comment>No comment</comment>
    </filetype>
  </inputs>
  <references>
    <filetype>
      <identifier>KNOWN_INDELS</identifier>
      <format>vcf</format>
      <quantity>single</quantity>
      <comment>Sorted in the same order as the chromosomes in {REFERENCE}</comment>
    </filetype>
    <filetype>
        <identifier>KNOWN_SNPS</identifier>
        <format>vcf</format>
        <quantity>single</quantity>
        <comment>Sorted in the same order as the chromosomes in {REFERENCE}</comment>
    </filetype>
    <filetype>
        <identifier>REFERENCE</identifier>
        <format>FASTA</format>
        <quantity>single</quantity>
        <comment>fasta file containing genomic sequence with chromosomes in the same order as {KNOWN_INDELS} and {KNOWN_SNPS}</comment>
    </filetype>
    <filetype>
        <identifier>REFERENCE_FAI</identifier>
        <format>fai</format>
        <quantity>single</quantity>
        <comment>fasta index for the {REFERENCE} file, as generated by samtools faidx</comment>
    </filetype>
    <filetype>
        <identifier>REFERENCE_LENGTHS</identifier>
        <format>tsv</format>
        <quantity>single</quantity>
        <comment>A tab separated file containing the lengths of all chromosomes in the {REFERENCE} file</comment>
    </filetype>
  </references>
  <outputs>
    <filetype>
      <identifier>{OUTNAME}.bam</identifier>
      <format>bam</format>
      <quantity>single</quantity>
      <comment>A bam file containing the reads after all steps in the pipeline</comment>
    </filetype>
    <filetype>
        <identifier>{OUTNAME}.bam.bai</identifier>
        <format>bai</format>
        <quantity>single</quantity>
        <comment>No comment</comment>
    </filetype>
    <filetype>
        <identifier>{OUTNAME}.cpg.filtered.GCH.bed.gz</identifier>
        <format>bed.gz</format>
        <quantity>single</quantity>
        <comment>gzipped bed file in bisSNP format containing all GpCH sites</comment>
    </filetype>
    <filetype>
        <identifier>{OUTNAME}.cpg.filtered.GCH.bed.gz.tbi</identifier>
        <format>bed.gz.tbi</format>
        <quantity>single</quantity>
        <comment>tabix index</comment>
    </filetype>
    <filetype>
        <identifier>{OUTNAME}.cpg.filtered.vcf.gz</identifier>
        <format>vcf.gz</format>
        <quantity>single</quantity>
        <comment>A gzipped vcf file containing all called cytosines in the genome</comment>
    </filetype>
    <filetype>
        <identifier>{OUTNAME}.cpg.filtered.vcf.gz.tbi</identifier>
        <format>vcf.gz.tbi</format>
        <quantity>single</quantity>
        <comment>tabix index</comment>
    </filetype>
    <filetype>
        <identifier>{OUTNAME}.snp.filtered.vcf.gz</identifier>
        <format>vcf.gz</format>
        <quantity>single</quantity>
        <comment>A gzipped vcf file containing all called SNPs</comment>
    </filetype>
    <filetype>
        <identifier>{OUTNAME}.snp.filtered.vcf.gz.tbi</identifier>
        <format>vcf.gz.tbi</format>
        <quantity>single</quantity>
        <comment>tabix index</comment>
    </filetype>
    <filetype>
        <identifier>{OUTNAME}.amd.tsv</identifier>
        <format>txt</format>
        <quantity>single</quantity>
        <comment>Text file containing the analysis metadata</comment>
    </filetype>
    <!-- bigWig tracks. The identifiers carry the same ".cpg.filtered.GCH" stem as the files written by the two bedGraphToBigWig steps; each track is declared exactly once. -->
    <filetype>
        <identifier>{OUTNAME}.cpg.filtered.GCH.bw</identifier>
        <format>bigwig</format>
        <quantity>single</quantity>
        <comment>Methylation calls for cytosines in GCH context</comment>
    </filetype>
    <filetype>
        <identifier>{OUTNAME}.cpg.filtered.GCH.ct_coverage.bw</identifier>
        <format>bigwig</format>
        <quantity>single</quantity>
        <comment>Coverage of CT-reads for all cytosines in GCH context</comment>
    </filetype>
    <filetype>
        <identifier>{OUTNAME}.cpg.filtered.GCH.peaks.bed</identifier>
        <format>bed</format>
        <quantity>single</quantity>
        <comment>The complete set of NOMe peaks</comment>
    </filetype>
    <filetype>
        <identifier>{OUTNAME}.cpg.filtered.GCH.peaks.fdr001.bed.gz</identifier>
        <format>bed.gz</format>
        <quantity>single</quantity>
        <comment>NOMe peaks with a FDR lower than 0.01</comment>
    </filetype>
    <filetype>
        <identifier>{OUTNAME}.cpg.filtered.GCH.peaks.fdr001.bed.gz.tbi</identifier>
        <format>bed.gz.tbi</format>
        <quantity>single</quantity>
        <comment>tabix index</comment>
    </filetype>
  </outputs>
  <software>
    <tool>
      <name>samtools</name>
      <version>1.2 (using htslib 1.2.1)</version>
      <command_line><![CDATA[ samtools view -F260 -u -b {INPUTFILE} > PIPE1.bam ]]></command_line>
      <loop>no looping</loop>
      <comment>-F260 discards all unmapped and non-primary reads. -u -b writes an uncompressed bam to stdout</comment>
    </tool>
    <tool>
        <name>Picard tools ReorderSam</name>
        <version>1.115(30b1e546cc4dd80c918e151dbfe46b061e63f315_1402927010)</version>
        <command_line><![CDATA[ java -Djava.io.tmpdir={TMPDIR} -jar {JARS}/ReorderSam.jar MAX_RECORDS_IN_RAM=4000000 I=PIPE1.bam O=REORDER.bam R={REFERENCE} CREATE_INDEX=true VALIDATION_STRINGENCY=SILENT ]]></command_line>
        <loop>no looping</loop>
        <comment>Reorders the bam file so that the order mirrors that of the reference files. MAX_RECORDS_IN_RAM is set to limit the number of files written to disk and VALIDATION_STRINGENCY is disabled.</comment>
    </tool>
    <tool>
        <name>bisSNP realignerTargetCreator</name>
        <version>BisSNP-0.82.2</version>
        <!-- The interval file written with -o is the one the BisulfiteIndelRealigner step reads via -targetIntervals, so both steps must use the same file name. -->
        <command_line><![CDATA[ java -Xmx10G -Djava.io.tmpdir={TMPDIR} -jar {BISSNPJAR} -R {REFERENCE} -I REORDER.bam -T BisulfiteRealignerTargetCreator -known {KNOWN_INDELS} -o REGION.indel_target_interval.intervals -nt 7 -S LENIENT -L REGION ]]></command_line>
        <loop>no looping</loop>
        <comment>This step is run once for each REGION (see description) and extracts the regions that should be realigned. Each job uses 7 cores. The control of the bam format is relaxed with -S LENIENT.</comment>
    </tool>
    <tool>
        <name>bisSNP indelRealigner</name>
        <version>BisSNP-0.82.2</version>
        <command_line><![CDATA[ java -Xmx10G -Djava.io.tmpdir={TMPDIR} -jar {BISSNPJAR} -R {REFERENCE} -I REORDER.bam -T BisulfiteIndelRealigner -targetIntervals REGION.indel_target_interval.intervals -known {KNOWN_INDELS} -o REGION.realigned.bam -S LENIENT -cigar -L REGION --maxReadsInMemory 1500000 ]]></command_line>
        <loop>no looping</loop>
        <comment>Runs once for each REGION (see description). Realigns reads in regions with known indels. -cigar and -S LENIENT relax the demands on the bam file. --maxReadsInMemory is increased from 150000 to 1500000 in order to reduce the number of temporary files</comment>
    </tool>
    <tool>
        <name>Picard tools MergeSamFiles</name>
        <version>1.115(30b1e546cc4dd80c918e151dbfe46b061e63f315_1402927010)</version>
        <command_line><![CDATA[ java -Djava.io.tmpdir={TMPDIR} -jar {JARS}/MergeSamFiles.jar (I=REGION.realigned.bam)+ O=PIPE2.bam MAX_RECORDS_IN_RAM=4000000 VALIDATION_STRINGENCY=SILENT COMPRESSION_LEVEL=0 ]]></command_line>
        <loop>no looping</loop>
        <comment>Merges all realigned bamfiles and pipes the result into the next step ((I=REGION.realigned.bam)+ has to be expanded). MAX_RECORDS_IN_RAM is increased to reduce the number of files written to disk. COMPRESSION_LEVEL=0 makes the output file uncompressed. This is done as this step pipes its result to the next step.</comment>
    </tool>
    <tool>
        <name>bamUtil clipOverlap</name>
        <version>1.0.12</version>
        <command_line><![CDATA[ bam clipOverlap --poolSize 2000000 --in PIPE2.bam --out NOOVERLAP.preClean.sam ]]></command_line>
        <loop>no looping</loop>
        <comment>Removal of the overlapping part between two paired reads. poolSize is increased to better parse repeats and other high coverage regions</comment>
    </tool>
    <tool>
        <name>awk cleanup</name>
        <version>missing</version>
        <command_line><![CDATA[awk -vFS='\t' -vOFS='\t' '$0~/^@/ {print;next} $6!~/[MIDN]/ {$6="0M"$6} {print}' NOOVERLAP.preClean.sam > NOOVERLAP.sam]]></command_line>
        <loop>no looping</loop>
        <comment>When the fragment is too short and bamUtil soft trims a complete read, the format is not accepted by Picard tools. To some degree this can be fixed by adding a 0M state to the Cigar string.</comment>
    </tool>
    <tool>
        <name>samtools</name>
        <version>1.2 (using htslib 1.2.1)</version>
        <command_line><![CDATA[ samtools view -u -bS NOOVERLAP.sam > NOOVERLAP.bam ]]></command_line>
        <loop>no looping</loop>
        <comment>converting the corrected output to bam</comment>
    </tool>
    <tool>
        <name>samtools</name>
        <version>1.2 (using htslib 1.2.1)</version>
        <command_line><![CDATA[ samtools index NOOVERLAP.bam ]]></command_line>
        <loop>no looping</loop>
        <comment>indexing of the bam file</comment>
    </tool>
    <tool>
        <name>bisSNP recalibrateCountCovariates</name>
        <version>BisSNP-0.82.2</version>
        <command_line><![CDATA[ java -Xmx10G -Djava.io.tmpdir={TMPDIR} -jar {BISSNPJAR} -R {REFERENCE} -I NOOVERLAP.bam -T BisulfiteCountCovariates -knownSites {KNOWN_SNPS} -cov ReadGroupCovariate -cov QualityScoreCovariate -cov CycleCovariate -recalFile RECALFILE -nt 14 -S LENIENT ]]></command_line>
        <loop>no looping</loop>
        <comment>The first step of the quality recalibration. The -cov flags are set to default values as leaving them out can generate an error. 14 cores are used for this step (-nt 14).</comment>
    </tool>
    <tool>
        <name>bisSNP recalibrate</name>
        <version>BisSNP-0.82.2</version>
        <command_line><![CDATA[ java -Xmx12G -Djava.io.tmpdir={TMPDIR} -jar {BISSNPJAR} -R {REFERENCE} -I NOOVERLAP.bam -T BisulfiteTableRecalibration -o REGION.recal.bam -recalFile RECALFILE -S LENIENT -L REGION ]]></command_line>
        <loop>no looping</loop>
        <comment>For each defined REGION (see description), the alignments are recalibrated.</comment>
    </tool>
    <tool>
        <name>bisSNP methylation calls</name>
        <version>BisSNP-0.82.2</version>
        <command_line><![CDATA[ java -Xmx10G -Djava.io.tmpdir={TMPDIR} -jar {BISSNPJAR} -R {REFERENCE} -T BisulfiteGenotyper -I REGION.recal.bam -D {KNOWN_SNPS} -vfn1 REGION.cpg.raw.vcf -vfn2 REGION.snp.raw.vcf -L REGION -S LENIENT -nt 7 -out_modes EMIT_VARIANT_AND_CYTOSINES -stand_emit_conf 0 -C CG,1 -C CA,1 -C CC,1 -C CT,1 -C CAG,1 -C CHH,1 -C CHG,1 -C GC,2 -C GCH,2 -C GCG,2 -C HCG,2 -C HCH,2 -C HCA,2 -C HCC,2 -C HCT,2 ]]></command_line>
        <loop>no looping</loop>
        <comment>For each defined REGION (see description), call the level of methylation for each cytosine. All cytosines are reported, due to the -out_modes flag. All cytosines in the contexts set with the -C flag are marked in the cpg vcf file. The flag stand_emit_conf must be set to something else than stand_call_conf or the extraction of bed files won't work</comment>
    </tool>
    <tool>
        <name>bisSNP sortByRefAndCor</name>
        <version>missing</version>
        <command_line><![CDATA[ perl {SCRIPTFOLDER}/third-party/bis-SNP_Utils/sortByRefAndCor.pl --k 1 --c 2 --tmp {TMPDIR} REGION.{cpg,snp}.raw.vcf {REFERENCE_FAI} > REGION.{cpg,snp}.raw.sorted.vcf ]]></command_line>
        <loop>no looping</loop>
        <comment>For each defined REGION (see description), sort snp and cpg vcf files</comment>
    </tool>
    <tool>
        <name>bisSNP VCFpostprocess</name>
        <version>BisSNP-0.82.2</version>
        <command_line><![CDATA[ java -Xmx10g -Djava.io.tmpdir={TMPDIR} -jar {BISSNPJAR} -R {REFERENCE} -T VCFpostprocess -oldVcf REGION.snp.raw.sorted.vcf -newVcf REGION.snp.filtered.vcf -snpVcf REGION.snp.raw.sorted.vcf -o REGION.snp.raw.filter.summary.txt  ]]></command_line>
        <loop>no looping</loop>
        <comment>Apply quality filter to SNPs</comment>
    </tool>
    <tool>
        <name>bisSNP VCFpostprocess</name>
        <version>BisSNP-0.82.2</version>
        <command_line><![CDATA[ java -Xmx10g -Djava.io.tmpdir={TMPDIR} -jar {BISSNPJAR} -R {REFERENCE} -T VCFpostprocess -oldVcf REGION.cpg.raw.sorted.vcf -newVcf REGION.cpg.filtered.vcf -snpVcf REGION.snp.raw.sorted.vcf -o REGION.cpg.raw.filter.summary.txt  ]]></command_line>
        <loop>no looping</loop>
        <comment>Apply quality filter to CpGs</comment>
    </tool>
    <tool>
        <name>vcf2bed modified</name>
        <version>missing</version>
        <command_line><![CDATA[ perl {SCRIPTFOLDER}/tools/vcf2bed.NOME.pl REGION.cpg.filtered.vcf GCH > REGION.cpg.filtered.GCH.bed ]]></command_line>
        <loop>no looping</loop>
        <comment>For each REGION (see description), generate a bisSNP bed file for each cytosine in a GCH context</comment>
    </tool>
    <tool>
        <name>samtools</name>
        <version>1.2 (using htslib 1.2.1)</version>
        <!-- (REGION.recal.bam)+ stands for the list of per-REGION recalibrated bam files; a merge needs every region's file as input, not a single one. -->
        <command_line><![CDATA[ samtools merge {OUTNAME}.bisSNP.bam (REGION.recal.bam)+ ]]></command_line>
        <loop>no looping</loop>
        <comment>Merge the recalibrated bam files of all REGIONs ((REGION.recal.bam)+ has to be expanded)</comment>
    </tool>
    <tool>
        <name>samtools</name>
        <version>1.2 (using htslib 1.2.1)</version>
        <command_line><![CDATA[ samtools index {OUTNAME}.bisSNP.bam ]]></command_line>
        <loop>no looping</loop>
        <comment>index the output bam file</comment>
    </tool>
    <tool>
        <name>bash, bgzip and tabix</name>
        <version>missing</version>
        <command_line><![CDATA[ head -10000 `awk -vpre=CALLFOLDER -vsuf=SUFFIX -vORS=' ' 'NR==1 {print pre"/"$2suf}' {CHRSPLIT}` |grep "^#"  > {OUTNAME}SUFFIX.tmp.header && cat  `awk -vpre=CALLFOLDER -vsuf=SUFFIX -vORS=' ' '{print pre"/"$2suf}' {CHRSPLIT}` |grep -v "^#"| cat {OUTNAME}SUFFIX.tmp.header - > {OUTNAME}SUFFIX && bgzip {OUTNAME}SUFFIX && tabix -p vcf -S `cat {OUTNAME}SUFFIX.tmp.header |wc -l` {OUTNAME}SUFFIX.gz && rm {OUTNAME}SUFFIX.tmp.header ]]></command_line>
        <loop>no looping</loop>
        <comment>Merge the vcf files. SUFFIX loops over {.cpg.filtered.vcf, .snp.filtered.vcf}. Begins by extracting the header from the first file. The next step concatenates the files in the order defined by the {CHRSPLIT} file and adds the header. The resulting vcf file is compressed with bgzip and indexed with tabix. CALLFOLDER is the folder with the recalibrated alignments</comment>
    </tool>
    <tool>
        <name>bash, bgzip and tabix</name>
        <version>missing</version>
        <command_line><![CDATA[ cat `awk -vpre=CALLFOLDER -vsuf=".cpg.filtered.GCH.bed" -vORS=' ' '{print pre"/"$2suf}' {CHRSPLIT}` |grep -v "^track" |cat <(echo track name=WGBS.{OUTNAME} type=bedDetail description=\"GCH methylation level\" visibility=3) - > {OUTNAME}.cpg.filtered.GCH.bed && bgzip {OUTNAME}.cpg.filtered.GCH.bed && tabix -p bed -S 1 {OUTNAME}.cpg.filtered.GCH.bed.gz ]]></command_line>
        <loop>no looping</loop>
        <comment>Merge GCH containing bed files in the order defined by the {CHRSPLIT} file, compress with bgzip and index with tabix. CALLFOLDER is the folder with the recalibrated alignments</comment>
    </tool>
    <tool>
        <name>bash, bedGraphToBigWig</name>
        <version>missing</version>
        <command_line><![CDATA[ zcat {OUTNAME}.cpg.filtered.GCH.bed.gz |tail -n +2 |cut -f 1-4 > {OUTNAME}.cpg.filtered.GCH.bed.tmp && bedGraphToBigWig {OUTNAME}.cpg.filtered.GCH.bed.tmp {REFERENCE_LENGTHS} {OUTNAME}.cpg.filtered.GCH.bw && rm {OUTNAME}.cpg.filtered.GCH.bed.tmp ]]></command_line>
        <loop>no looping</loop>
        <comment>Generate a bigWig track for the methylation rates</comment>
    </tool>
    <tool>
        <name>bash, bedGraphToBigWig</name>
        <version>missing</version>
        <command_line><![CDATA[ zcat {OUTNAME}.cpg.filtered.GCH.bed.gz |tail -n +2 |cut -f 1-3,5 > {OUTNAME}.cpg.filtered.GCH.bed.tmp2 && bedGraphToBigWig {OUTNAME}.cpg.filtered.GCH.bed.tmp2 {REFERENCE_LENGTHS} {OUTNAME}.cpg.filtered.GCH.ct_coverage.bw && rm {OUTNAME}.cpg.filtered.GCH.bed.tmp2 ]]></command_line>
        <loop>no looping</loop>
        <comment>Generate a bigWig track for the coverage</comment>
    </tool>
    <tool>
        <name>Metadata: duplication_rate</name>
        <version>missing</version>
        <command_line><![CDATA[ awk -vOFS=$fs -vORS=$rs '$2=="METRICS" {state=1;next} state==1 {colNr=NF;state=2;next} state==2 && NF==colNr {curRead=$2+2*$3;totRead+=curRead;dupRead+=$5+2*$6}  END {printf "%sduplication_rate%s%s",ORS,OFS,dupRead/totRead}' {INPUTFILE_PICARDDUPMETRICS} ]]></command_line>
        <loop>no looping</loop>
        <comment>Parses the output of Picard tools' MarkDuplicates and calculates the average duplication rate for all included libraries. This is done by summing up the total number of reads and the number of duplicated reads, after which the ratio is calculated. In the table, paired reads are counted as read pairs and the given duplication rate for each library is calculated on read basis. Hence, each read pair is counted as two reads while summing them up.</comment>
    </tool>
    <tool>
        <name>Metadata: num_reads</name>
        <version>missing</version>
        <command_line><![CDATA[ awk -vOFS=$fs -vORS=$rs 'NR==1 {printf "%snum_reads%s%s",ORS,OFS, $1+$3;exit}' {INPUTFILE_FLAGSTATS} ]]></command_line>
        <loop>no looping</loop>
        <comment>Extract the total number of reads, mapped and unmapped, from the samtools flagstat file</comment>
    </tool>
    <tool>
        <name>Metadata: num_mapped_reads</name>
        <version>missing</version>
        <command_line><![CDATA[ awk -vOFS=$fs -vORS=$rs 'NR==1 {tot=$1+$3} $4=="mapped" {printf "%snum_mapped_reads%s%s",ORS,OFS,$1+$3}' {INPUTFILE_FLAGSTATS} ]]></command_line>
        <loop>no looping</loop>
        <comment>Extract the number of mapped reads from the samtools flagstat file.</comment>
    </tool>
    <tool>
        <name>Metadata: num_SNPs</name>
        <version>missing</version>
        <command_line><![CDATA[ printf "${rs}num_SNPs${fs}%s" `zcat {OUTFOLDER}/{OUTNAME}.snp.filtered.vcf.gz | grep -v "^#" |cut -f 1,2|sort -u |wc -l` ]]></command_line>
        <loop>no looping</loop>
        <comment>counts the unique number of positions called as a SNP</comment>
    </tool>
    <tool>
        <name>Generation of paired CpG data</name>
        <version>missing</version>
        <command_line><![CDATA[ join -1 1 -2 1 -a 1 -a 2 <(zcat {OUTFOLDER}/{OUTNAME}.cpg.filtered.GCH.bed.gz |awk '$6=="+" {print $1"_"$2,$6,$1,$2,$4,$5}' |sort -k1,1) <(zcat {OUTFOLDER}/{OUTNAME}.cpg.filtered.GCH.bed.gz |awk '$6=="-" {print $1"_"($2-1),$6,$1,$2-1,$4,$5}' |sort -k1,1) |awk -vOFS='\t' 'NF==11 {print $3,$4,$5,$6,$10,$11;next} $2=="-" {print $3,$4,"NA",0,$5,$6;next} {print $3,$4,$5,$6,"NA",0}' > paired.data.csv ]]></command_line>
        <loop>no looping</loop>
        <comment>Creates a temporary file with data from both strands of a CpG joined on a single line. This is done by subtracting one from the site on the negative strand and then doing a full join with the chromosome and position as key. The output has the format (chr,pos,meth+,cov+,meth-,cov-)</comment>
    </tool>
    <tool>
        <name>Metadata: cpg_cov_25, cpg_cov_50, cpg_cov_75, cpg_cov_mean</name>
        <version>missing</version>
        <!-- The final awk must read the sorted coverage values from the pipe. With a file operand awk ignores standard input, so no file name is given after the program. LINENR is the line count of paired.data.csv, which equals the number of sorted values. -->
        <command_line><![CDATA[ awk '{print $4+$6}' paired.data.csv |sort -g |awk -vOFS=$fs -vORS=$rs -vLINENR=`cat paired.data.csv |wc  -l` 'BEGIN {c25=int(0.25*LINENR);c50=int(0.5*LINENR);c75=int(0.75*LINENR)} NR==c25 {v25=$1} NR==c50 {v50=$1} NR==c75 {v75=$1} {c+=1;s+=$1} END{printf "%scpg_cov_25%s%s%scpg_cov_50%s%s%scpg_cov_75%s%s%scpg_cov_mean%s%s",ORS,OFS,v25,ORS,OFS,v50,ORS,OFS,v75,ORS,OFS,s/c}' ]]></command_line>
        <loop>no looping</loop>
        <comment>Extracts mean and 25, 50 and 75% quantiles for the coverage of CpGs (summed over both strands)</comment>
    </tool>
    <tool>
        <name>Metadata: cpg_island_cov_mean</name>
        <version>missing</version>
        <command_line><![CDATA[ awk -vOFS='\t' '{print $1,$2,$2+2,$4+$6}' paired.data.csv |bedtools intersect -u -a - -b {CPG_ISLANDS} |awk -vOFS=$fs -vORS=$rs '{s+=$4;c+=1} END{printf "%scpg_island_cov_mean%s%s",ORS,OFS,s/c}' ]]></command_line>
        <loop>no looping</loop>
        <comment>Extracts all CpGs overlapping with the CpG island annotation and calculates a mean coverage</comment>
    </tool>
    <tool>
        <name>Metadata: num_called_cpgs</name>
        <version>missing</version>
        <command_line><![CDATA[ printf "${rs}num_called_cpgs${fs}%s" `zcat {OUTFOLDER}/{OUTNAME}.cpg.filtered.GCH.bed.gz |tail -n +2 |wc -l` ]]></command_line>
        <loop>no looping</loop>
        <comment>Counts the number of cytosines in GCH context that were called</comment>
    </tool>
    <tool>
        <name>Metadata: nrcpg_both, nrcpg_single</name>
        <version>missing</version>
        <command_line><![CDATA[ awk -vOFS=$fs -vORS=$rs '$4>0 && $6>0 {b++;next} {s++} END{printf "%snrcpg_both%s%s%snrcpg_single%s%s",ORS,OFS,b,ORS,OFS,s}' paired.data.csv ]]></command_line>
        <loop>no looping</loop>
        <comment>Counts the number of CpGs that are called on both or only on one strand</comment>
    </tool>
    <tool>
        <name>Metadata: nr_cpgSplit_count, nr_cpgSplit_chr</name>
        <version>missing</version>
        <!-- Reads the tab separated paired.data.csv written by the "Generation of paired CpG data" step; column 1 is the chromosome. -->
        <command_line><![CDATA[ cut -f 1 paired.data.csv |sort |uniq -c |awk -vOFS=$fs -vORS=$rs 'NR==1 {count=$1;chr=$2;next} {count=count","$1;chr=chr","$2} END {printf "%snr_cpgSplit_count%s%s%snr_cpgSplit_chr%s%s",ORS,OFS,count,ORS,OFS,chr}' ]]></command_line>
        <loop>no looping</loop>
        <comment>Two arrays with the number of CpGs per chromosome and the corresponding chromosomes</comment>
    </tool>
    <tool>
        <name>Metadata: mean_meth_*</name>
        <version>missing</version>
        <!-- {JARS} and {REFERENCE} use the same placeholder notation as the other steps of this process description. -->
        <command_line><![CDATA[ zcat {OUTFOLDER}/{OUTNAME}.cpg.filtered.vcf.gz | java -Xmx12G -jar {JARS}/fastaUtils.jar cContext {REFERENCE} - CG,1 CA,1 CC,1 CT,1 CAG,1 CHH,1 CHG,1 GC,2 GCH,2 GCG,2 HCG,2 HCH,2 HCA,2 HCC,2 HCT,2 |awk -vOFS=$fs -vORS=$rs 'NR>1 {printf "%smean_meth_%s%s%s",ORS,tolower($1),OFS,$3+0}' ]]></command_line>
        <loop>no looping</loop>
        <comment>Parses the vcf file containing calls for all cytosines. For each site, the context is extracted with respect to the given reference genome. This omits polymorphisms, but should be a good estimate of the methylation rates. The contexts investigated are: CG, CA, CC, CT, CAG, CHH, CHG, GC, GCH, GCG, HCG, HCH, HCA, HCC and HCT.</comment>
    </tool>
    <tool>
        <name>Metadata: strandbias_meth</name>
        <version>missing</version>
        <command_line><![CDATA[ printf "${rs}strandbias_meth${fs}%s" "`awk '$0!~/NA/ {print $3"\t"$5}' paired.data.csv | Rscript {SCRIPTFOLDER}/tools/correlation.r`" ]]></command_line>
        <loop>no looping</loop>
        <comment>Calculates the pearson correlation between methylation rates from the two different strands. Loci with only one strand called are omitted.</comment>
    </tool>
    <tool>
        <name>Metadata: strandbias_cov</name>
        <version>missing</version>
        <!-- Same input file and placeholder notation as the strandbias_meth step; only the selected columns differ (4 and 6 hold the coverage on the two strands). -->
        <command_line><![CDATA[ printf "${rs}strandbias_cov${fs}%s" "`awk '$0!~/NA/ {print $4"\t"$6}' paired.data.csv | Rscript {SCRIPTFOLDER}/tools/correlation.r`" ]]></command_line>
        <loop>no looping</loop>
        <comment>Calculates the pearson correlation between coverage of the two different strands. Loci with only one strand called are omitted.</comment>
    </tool>
    <tool>
        <name>Peakcalling</name>
        <version>0.1</version>
        <command_line><![CDATA[ undefined]]></command_line>
        <loop>no looping</loop>
        <comment>This step is currently run manually. It should be improved in version 3 of the pipeline</comment>
    </tool>
  </software>
</process>