ADD: RBA and GAL; syntactically valid, some minor content-related issues

DEEP · Dec 30, 2016 · bd70b14 · bd70b14
1 parent 904a85a
commit bd70b14
Show file tree

Hide file tree

Showing 2 changed files with 430 additions and 0 deletions.
diff --git a/docs/alignment/bisulfite/RBAv0.xml b/docs/alignment/bisulfite/RBAv0.xml
@@ -0,0 +1,219 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<?xml-stylesheet type="text/css" href="http://deep.mpi-inf.mpg.de/DAC/files/style/deep_process_style.css"?>
+<process>
+  <name>RBA</name>
+  <version>0</version>
+  <author>
+    <name>Karl Nordström, Charles Imbusch</name>
+    <email>karl.nordstroem@uni-saarland.de</email>
+  </author>
+  <description>
+  The RBAv0 pipeline is a cloned version of the DEEP BAL process. It trims and aligns RRBS data to a reference genome.
+
+  0. Generation of MethylCtools reference index
+  1. Trim reads with Trim Galore! (Cutadapt)
+  2. Map reads with MethylCtools (BWA)
+  3. Merge bam files with Picard tools
+  4. Generate a flagstat file
+
+  Step 0 is run manually and only once.
+  </description>
+  <inputs>
+    <filetype>
+      <identifier>sampleID_R1.fastq.gz</identifier>
+      <format>FASTQ</format>
+      <quantity>collection</quantity>
+      <comment>The current implementation takes a folder as input and trims and maps all fastq files in the folder</comment>
+    </filetype>
+  </inputs>
+  <references>
+    <filetype>
+      <identifier>{ASSEMBLY}.fa</identifier>
+      <format>FASTA</format>
+      <quantity>single</quantity>
+      <comment>fasta file containing genomic reference sequence</comment>
+    </filetype>
+  </references>
+  <outputs>
+    <filetype>
+      <identifier>{OUTNAME}.bam</identifier>
+      <format>BAM</format>
+      <quantity>single</quantity>
+      <comment>The resulting alignment in BAM format</comment>
+    </filetype>
+    <filetype>
+    	<identifier>{OUTNAME}.bam.bai</identifier>
+    	<format>BAI</format>
+    	<quantity>single</quantity>
+    	<comment>Index file for the alignment</comment>
+    </filetype>
+    <filetype>
+    	<identifier>{OUTNAME}.coverage.bw</identifier>
+    	<format>bigWig</format>
+    	<quantity>single</quantity>
+    	<comment>Coverage track in bigWig format.</comment>
+    </filetype>
+    <filetype>
+    	<identifier>{OUTNAME}.flagstat</identifier>
+    	<format>TXT</format>
+    	<quantity>single</quantity>
+    	<comment>The output from samtools flagstat</comment>
+    </filetype>
+    <filetype>
+    	<identifier>{OUTNAME}.rawCov</identifier>
+    	<format>TXT</format>
+    	<quantity>single</quantity>
+    	<comment>Contains a single value, the average genomic coverage</comment>
+    </filetype>
+  </outputs>
+  <software>
+    <tool>
+    	<name>methylCtools</name>
+    	<version>0.9.2</version>
+    	<command_line><![CDATA[methylCtools faconv {ASSEMBLY}.fa {ASSEMBLY}.conv.fa ]]></command_line>
+    	<loop>no looping</loop>
+    	<comment>Introduce C to T conversions to both strands. Only runs if the converted file does not exist</comment>
+    </tool>
+    <tool>
+    	<name>bwa</name>
+    	<version>0.7.12-r1039</version>
+    	<command_line><![CDATA[bwa index -a bwtsw {ASSEMBLY}.conv.fa]]></command_line>
+    	<loop>no looping</loop>
+    	<comment>generate the bwa index file. This step only runs if the index does not exist</comment>
+    </tool>
+    <tool>
+      <!-- Adapter and quality trimming step, run once for every input fastq file (see loop). -->
+      <!-- NOTE(review): in Trim Galore! the -o option appears to name an output directory, not an output file, yet the value below looks like a file name. Confirm how sampleID_R1.trimmed.fq.gz (the input of the following fqconv step) is actually produced. -->
+      <name>Trim Galore!</name>
+      <version>0.3.3</version>
+      <command_line><![CDATA[trim_galore --rrbs -q 20 --phred33 -o sampleID_R1.trimmed.fq.gz --no_report_file sampleID_R1.fastq.gz]]></command_line>
+      <loop>sampleID_R1.fastq.gz</loop>
+      <comment>This step is a one to one process trimming all fastq files. The reads are filtered for the default adapter (AGATCGGAAGAGC) and quality below 20</comment>
+    </tool>
+    <tool>
+    	<name>methylCtools</name>
+    	<version>0.9.2</version>
+    	<command_line><![CDATA[methylCtools fqconv -1 sampleID_R1.trimmed.fq.gz sampleID_R1.conv.fq]]></command_line>
+    	<loop>sampleID_R1.trimmed.fq.gz</loop>
+    	<comment>A one to one process preparing the trimmed files for mapping by converting C to T and storing converted positions in the header</comment>
+    </tool>
+    <tool>
+    	<name>bwa</name>
+    	<version>0.7.12-r1039</version>
+    	<command_line><![CDATA[bwa aln -q 20 -t 7 {ASSEMBLY}.conv.fa sampleID_R1.conv.fq > PIPE1.sai]]></command_line>
+    	<loop>sampleID_R1.conv.fq</loop>
+    	<comment>A one to one process mapping each file. Again a quality cutoff at 20. This step is piped to the next.</comment>
+    </tool>
+    <tool>
+    	<name>bwa</name>
+    	<version>0.7.12-r1039</version>
+    	<command_line><![CDATA[bwa samse -n 0 {ASSEMBLY}.conv.fa PIPE1.sai sampleID_R1.conv.fq > PIPE2.sam]]></command_line>
+    	<loop>PIPE1.sai</loop>
+    	<comment>A one to one process converting each bwa alignment from sai to sam format</comment>
+    </tool>
+    <tool>
+    	<name>samtools</name>
+    	<version>1.2 (using htslib 1.2.1)</version>
+    	<command_line><![CDATA[samtools view -Sbu PIPE2.sam > PIPE3.bam]]></command_line>
+    	<loop>PIPE2.sam</loop>
+    	<comment>A one to one process converting the alignment to bam format</comment>
+    </tool>
+    <tool>
+    	<name>methylCtools</name>
+    	<version>0.9.2</version>
+    	<command_line><![CDATA[methylCtools bconv --metrics /dev/null PIPE3.bam PIPE4.bam]]></command_line>
+    	<loop>PIPE3.bam</loop>
+    	<comment>A one to one process converting the reads in the alignment files back to their raw format, undoing the C to T conversion.</comment>
+    </tool>
+    <tool>
+    	<name>samtools</name>
+    	<version>1.2 (using htslib 1.2.1)</version>
+    	<command_line><![CDATA[samtools view -h PIPE4.bam > PIPE5.sam]]></command_line>
+    	<loop>PIPE4.bam</loop>
+    	<comment>A one to one process reconverting to sam format in order to correct some peculiarities introduced by bwa</comment>
+    </tool>
+    <tool>
+    	<name>awk</name>
+    	<version>4.0.1</version>
+    	<command_line><![CDATA[ awk -vFS='\t' -vOFS='\t' '$1!~/^@/ && and($2,4)==4 && $3!="*" {$2=4;$3="*";$4=0;$5=0;$6="*"} {print}' PIPE5.sam > PIPE6.sam ]]></command_line>
+    	<loop>PIPE5.sam</loop>
+    	<comment>A one to one process removing all mapping information present for unmapped reads. Sometimes bwa adds this for unmapped reads.</comment>
+    </tool>
+    <tool>
+    	<name>Picardtools</name>
+    	<version>1.115(30b1e546cc4dd80c918e151dbfe46b061e63f315_1402927010)</version>
+    	<command_line><![CDATA[ java -Xmx8G -jar AddOrReplaceReadGroups.jar MAX_RECORDS_IN_RAM=4000000 I=PIPE6.sam RGPL=ILLUMINA RGPU=flowcell RGID=sample_FLOWCELL_LANE RGLB=library RGSM={WGBS_INTERNAL_ID} SORT_ORDER=coordinate O=sampleID_R1.bam VALIDATION_STRINGENCY=LENIENT ]]></command_line>
+    	<loop>PIPE6.sam</loop>
+    	<comment>A one to one process assigning reads to read groups according to FLOWCELL and LANE, which are replaced by the corresponding values.</comment>
+    </tool>
+    <tool>
+    	<!-- Merge step: combines the per-fastq bam files into the single output alignment. -->
+    	<name>Picardtools</name>
+    	<version>1.115(30b1e546cc4dd80c918e151dbfe46b061e63f315_1402927010)</version>
+    	<command_line><![CDATA[java -jar MergeSamFiles.jar I=sampleID_R1.bam O={OUTNAME}.bam]]></command_line>
+    	<loop>no looping</loop>
+    	<comment>Merges all the generated bam files into {OUTNAME}.bam. If multiple fastq files were used as input, the I=sampleID_R1.bam argument has to be repeated once for each generated bam file.</comment>
+    </tool>
+    <tool>
+    	<name>samtools</name>
+    	<version>1.2 (using htslib 1.2.1)</version>
+    	<command_line><![CDATA[ samtools index {OUTNAME}.bam ]]></command_line>
+    	<loop>no looping</loop>
+    	<comment>Generating the index file {OUTNAME}.bam.bai</comment>
+    </tool>
+
+    <tool>
+    	<!-- Dumps the header of the merged alignment; the next step parses it for chromosome lengths. -->
+    	<name>samtools</name>
+    	<version>1.2 (using htslib 1.2.1)</version>
+    	<command_line><![CDATA[samtools view -H {OUTNAME}.bam > PIPE7.txt]]></command_line>
+    	<loop>no looping</loop>
+    	<comment>Extracting the header of the bam file in order to get chromosome lengths for the generation of the coverage file</comment>
+    </tool>
+    <tool>
+    	<!-- Splits header lines on ':' and tab; for lines whose first field is @SQ and second is SN, prints fields 3 and 5 tab-separated. -->
+    	<name>awk</name>
+    	<version>4.0.1</version>
+    	<command_line><![CDATA[awk -vFS='[:\t]' -vOFS='\t' '$1=="@SQ" && $2=="SN" {print $3,$5}' PIPE7.txt > ref.lengths]]></command_line>
+    	<loop>no looping</loop>
+    	<comment>Extracting the chromosome names and lengths from the @SQ lines of the sam header</comment>
+    </tool>
+    <tool>
+    	<name>bedtools</name>
+    	<version>v2.20.1</version>
+    	<command_line><![CDATA[ bedtools genomecov -ibam {OUTNAME}.bam -g ref.lengths -bg > coverage.bw.tmp ]]></command_line>
+    	<loop>no looping</loop>
+    	<comment>Calculating base pair resolution coverage in bed graph format</comment>
+    </tool>
+    <tool>
+    	<name>bedGraphToBigWig</name>
+    	<version>v 4</version>
+    	<command_line><![CDATA[ bedGraphToBigWig coverage.bw.tmp ref.lengths {OUTNAME}.coverage.bw ]]></command_line>
+    	<loop>no looping</loop>
+    	<comment>converting the bedgraph file to bigWig format</comment>
+    </tool>
+    <tool>
+    	<name>samtools</name>
+    	<version>1.2 (using htslib 1.2.1)</version>
+    	<command_line><![CDATA[ samtools view -u -F1280 {OUTNAME}.bam > PIPE8.bam ]]></command_line>
+    	<loop>no looping</loop>
+    	<comment>Filtering out secondary (non-primary) alignments (256) and reads flagged as PCR or optical duplicates (1024) before calculating average coverage</comment>
+    </tool>
+    <tool>
+    	<name>samtools</name>
+    	<version>1.2 (using htslib 1.2.1)</version>
+    	<command_line><![CDATA[ samtools mpileup -d 100000 PIPE8.bam > PIPE9.txt]]></command_line>
+    	<loop>no looping</loop>
+    	<comment>Converting to pileup format; the -d option caps the number of reads considered per position at 100000</comment>
+    </tool>
+    <tool>
+    	<!-- Reads the pileup written by the previous step (PIPE9.txt) and keeps an incremental mean of column 4, printed once at the end. -->
+    	<name>awk</name>
+    	<version>4.0.1</version>
+    	<command_line><![CDATA[ awk '{c+=1; a+= ($4-a)/c} END{print a}' PIPE9.txt > {OUTNAME}.rawCov ]]></command_line>
+    	<loop>no looping</loop>
+    	<comment>Calculating the average coverage as the running mean of the depth column (column 4) over all positions reported in the pileup</comment>
+    </tool>
+    <tool>
+    	<name>samtools</name>
+    	<version>1.2 (using htslib 1.2.1)</version>
+    	<command_line><![CDATA[ samtools flagstat {OUTNAME}.bam > {OUTNAME}.flagstat]]></command_line>
+    	<loop>no looping</loop>
+    	<comment>generating the flagstat file</comment>
+    </tool>
+  </software>
+</process>