From bd70b14e96d6e9e02fe98938204bb77d0a6e1920 Mon Sep 17 00:00:00 2001
From: Peter Ebert <pebert@mpi-inf.mpg.de>
Date: Fri, 30 Dec 2016 15:16:43 +0100
Subject: [PATCH] ADD: RBA and GAL; syntactically valid, some minor
 content-related issues

---
 docs/alignment/bisulfite/RBAv0.xml | 219 +++++++++++++++++++++++++++++
 docs/alignment/genome/GALv1.xml    | 211 +++++++++++++++++++++++++++
 2 files changed, 430 insertions(+)
 create mode 100644 docs/alignment/bisulfite/RBAv0.xml
 create mode 100644 docs/alignment/genome/GALv1.xml
diff --git a/docs/alignment/bisulfite/RBAv0.xml b/docs/alignment/bisulfite/RBAv0.xml
new file mode 100644
index 0000000..a0f53df
--- /dev/null
+++ b/docs/alignment/bisulfite/RBAv0.xml
@@ -0,0 +1,219 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<?xml-stylesheet type="text/css" href="http://deep.mpi-inf.mpg.de/DAC/files/style/deep_process_style.css"?>
+<process>
+  <name>RBA</name>
+  <version>0</version>
+  <author>
+    <name>Karl Nordström, Charles Imbusch</name>
+    <email>karl.nordstroem@uni-saarland.de</email>
+  </author>
+  <description>
+  The RBAv0 pipeline is a cloned version of the DEEP BAL process. It trims and aligns RRBS data to a reference genome.
+  
+  0. Generation of MethylCtools reference index
+  1. trim reads with Trim Galore! (Cutadapt)
+  2. Map reads with MethylCtools (BWA)
+  3. Merge bam files with Picard tools
+  4. Generate a flagstat file
+  
+  Step 0 is run manually and only once.
+  </description>
+  <inputs>
+    <filetype>
+      <identifier>sampleID_R1.fastq.gz</identifier>
+      <format>FASTQ</format>
+      <quantity>collection</quantity>
+      <comment>The current implementation takes a folder as input and trims and maps all fastq files in the folder</comment>
+    </filetype>
+  </inputs>
+  <references>
+    <filetype>
+      <identifier>{ASSEMBLY}.fa</identifier>
+      <format>FASTA</format>
+      <quantity>single</quantity>
+      <comment>fasta file containing genomic reference sequence</comment>
+    </filetype>
+  </references>
+  <outputs>
+    <filetype>
+      <identifier>{OUTNAME}.bam</identifier>
+      <format>BAM</format>
+      <quantity>single</quantity>
+      <comment>The resulting alignment in BAM format</comment>
+    </filetype>
+    <filetype>
+    	<identifier>{OUTNAME}.bam.bai</identifier>
+    	<format>BAI</format>
+    	<quantity>single</quantity>
+    	<comment>Index file for the alignment</comment>
+    </filetype>
+    <filetype>
+    	<identifier>{OUTNAME}.coverage.bw</identifier>
+    	<format>bigWig</format>
+    	<quantity>single</quantity>
+    	<comment>Coverage track in bigWig format.</comment>
+    </filetype>
+    <filetype>
+    	<identifier>{OUTNAME}.flagstat</identifier>
+    	<format>TXT</format>
+    	<quantity>single</quantity>
+    	<comment>The output from samtools flagstat</comment>
+    </filetype>
+    <filetype>
+    	<identifier>{OUTNAME}.rawCov</identifier>
+    	<format>TXT</format>
+    	<quantity>single</quantity>
+    	<comment>Contains a single value, the average genomic coverage</comment>
+    </filetype>
+  </outputs>
+  <software>
+    <tool>
+    	<name>methylCtools</name>
+    	<version>0.9.2</version>
+    	<command_line><![CDATA[methylCtools faconv {ASSEMBLY}.fa {ASSEMBLY}.conv.fa ]]></command_line>
+    	<loop>no looping</loop>
+    	<comment>Introduce C to T conversions to both strands. Only runs if the converted file does not exist</comment>
+    </tool>
+    <tool>
+    	<name>bwa</name>
+    	<version>0.7.12-r1039</version>
+    	<command_line><![CDATA[bwa index -a bwtsw {ASSEMBLY}.conv.fa]]></command_line>
+    	<loop>no looping</loop>
+    	<comment>generate the bwa index file. This step only runs if the index does not exist</comment>
+    </tool>
+    <tool>
+      <name>Trim Galore!</name>
+      <version>0.3.3</version>
+      <command_line><![CDATA[trim_galore --rrbs -q 20 --phred33 -o sampleID_R1.trimmed.fq.gz --no_report_file sampleID_R1.fastq.gz]]></command_line>
+      <loop>sampleID_R1.fastq.gz</loop>
+      <comment>This step is a one to one process trimming all fastq files. The reads are filtered for the default adapter (AGATCGGAAGAGC) and quality below 20</comment>
+    </tool>
+    <tool>
+    	<name>methylCtools</name>
+    	<version>0.9.2</version>
+    	<command_line><![CDATA[methylCtools fqconv -1 sampleID_R1.trimmed.fq.gz sampleID_R1.conv.fq]]></command_line>
+    	<loop>sampleID_R1.trimmed.fq.gz</loop>
+    	<comment>A one to one process preparing the trimmed files for mapping by converting C to T and storing converted positions in the header</comment>
+    </tool>
+    <tool>
+    	<name>bwa</name>
+    	<version>0.7.12-r1039</version>
+    	<command_line><![CDATA[bwa aln -q 20 -t 7 {ASSEMBLY}.conv.fa sampleID_R1.conv.fq > PIPE1.sai]]></command_line>
+    	<loop>sampleID_R1.conv.fq</loop>
+    	<comment>A one to one process mapping each file. Again a quality cutoff at 20. This step is piped to the next.</comment>
+    </tool>
+    <tool>
+    	<name>bwa</name>
+    	<version>0.7.12-r1039</version>
+    	<command_line><![CDATA[bwa samse -n 0 {ASSEMBLY}.conv.fa PIPE1.sai sampleID_R1.conv.fq > PIPE2.sam]]></command_line>
+    	<loop>PIPE1.sai</loop>
+    	<comment>A one to one process converting each bwa alignment from sai to sam format</comment>
+    </tool>
+    <tool>
+    	<name>samtools</name>
+    	<version>1.2 (using htslib 1.2.1)</version>
+    	<command_line><![CDATA[samtools view -Sbu PIPE2.sam > PIPE3.bam]]></command_line>
+    	<loop>PIPE2.sam</loop>
+    	<comment>A one to one process converting the alignment to bam format</comment>
+    </tool>
+    <tool>
+    	<name>methylCtools</name>
+    	<version>0.9.2</version>
+    	<command_line><![CDATA[methylCtools bconv --metrics /dev/null PIPE3.bam PIPE4.bam]]></command_line>
+    	<loop>PIPE3.bam</loop>
+    	<comment>A one to one process converting the reads in the alignment files back to their raw format, undoing the C to T conversion.</comment>
+    </tool>
+    <tool>
+    	<name>samtools</name>
+    	<version>1.2 (using htslib 1.2.1)</version>
+    	<command_line><![CDATA[samtools view -h PIPE4.bam > PIPE5.sam]]></command_line>
+    	<loop>PIPE4.bam</loop>
+    	<comment>A one to one process reconverting to sam format in order to correct some peculiarities introduced by bwa</comment>
+    </tool>
+    <tool>
+    	<name>awk</name>
+    	<version>4.0.1</version>
+    	<command_line><![CDATA[ awk -vFS='\t' -vOFS='\t' '$1!~/^@/ && and($2,4)==4 && $3!="*" {$2=4;$3="*";$4=0;$5=0;$6="*"} {print}' PIPE5.sam > PIPE6.sam ]]></command_line>
+    	<loop>PIPE5.sam</loop>
+    	<comment>A one to one process removing all mapping information present for unmapped reads. Sometimes bwa adds this for unmapped reads.</comment>
+    </tool>
+    <tool>
+    	<name>Picardtools</name>
+    	<version>1.115(30b1e546cc4dd80c918e151dbfe46b061e63f315_1402927010)</version>
+    	<command_line><![CDATA[ java -Xmx8G -jar AddOrReplaceReadGroups.jar MAX_RECORDS_IN_RAM=4000000 I=PIPE6.sam RGPL=ILLUMINA RGPU=flowcell RGID=sample_FLOWCELL_LANE RGLB=library RGSM={WGBS_INTERNAL_ID} SORT_ORDER=coordinate O=sampleID_R1.bam VALIDATION_STRINGENCY=LENIENT ]]></command_line>
+    	<loop>PIPE6.sam</loop>
+    	<comment>A one to one process assigning reads to read groups according to FLOWCELL and LANE, which are replaced by the corresponding values.</comment>
+    </tool>
+    <tool>
+    	<name>Picardtools</name>
+    	<version>1.115(30b1e546cc4dd80c918e151dbfe46b061e63f315_1402927010)</version>
+    	<command_line><![CDATA[java -jar MergeSamFiles.jar I=sampleID_R1.bam O={OUTNAME}.bam]]></command_line>
+    	<loop>no looping</loop>
+    	<comment>Merges all the generated bam files. If multiple fastq files were used as input, I=sampleID_R1.bam has to be multiplied to point to all the generated bam files.</comment>
+    </tool>
+    <tool>
+    	<name>samtools</name>
+    	<version>1.2 (using htslib 1.2.1)</version>
+    	<command_line><![CDATA[ samtools index {OUTNAME}.bam ]]></command_line>
+    	<loop>no looping</loop>
+    	<comment>Generating the index file {OUTNAME}.bam.bai</comment>
+    </tool>
+    
+    <tool>
+    	<name>samtools</name>
+    	<version>1.2 (using htslib 1.2.1)</version>
+    	<command_line><![CDATA[samtools view -H {OUTNAME}.bam > PIPE7.txt]]></command_line>
+    	<loop>no looping</loop>
+    	<comment>Extracting the header of the bam file in order to get chromosome lengths for the generation of the coverage file</comment>
+    </tool>
+    <tool>
+    	<name>awk</name>
+    	<version>4.0.1</version>
+    	<command_line><![CDATA[awk -vFS='[:\t]' -vOFS='\t' '$1=="@SQ" && $2=="SN" {print $3,$5}' PIPE7.txt > ref.lengths]]></command_line>
+    	<loop>no looping</loop>
+    	<comment>Extracting the chromosome lengths from the sam header</comment>
+    </tool>
+    <tool>
+    	<name>bedtools</name>
+    	<version>v2.20.1</version>
+    	<command_line><![CDATA[ bedtools genomecov -ibam {OUTNAME}.bam -g ref.lengths -bg > coverage.bw.tmp ]]></command_line>
+    	<loop>no looping</loop>
+    	<comment>Calculating base pair resolution coverage in bed graph format</comment>
+    </tool>
+    <tool>
+    	<name>bedGraphToBigWig</name>
+    	<version>v 4</version>
+    	<command_line><![CDATA[ bedGraphToBigWig coverage.bw.tmp ref.lengths {OUTNAME}.coverage.bw ]]></command_line>
+    	<loop>no looping</loop>
+    	<comment>converting the bedgraph file to bigWig format</comment>
+    </tool>
+    <tool>
+    	<name>samtools</name>
+    	<version>1.2 (using htslib 1.2.1)</version>
+    	<command_line><![CDATA[ samtools view -u -F1280 {OUTNAME}.bam > PIPE8.bam ]]></command_line>
+    	<loop>no looping</loop>
+    	<comment>Filtering non-primary alignments (256) and reads marked as PCR or optical duplicates (1024) before calculating average coverage</comment>
+    </tool>
+    <tool>
+    	<name>samtools</name>
+    	<version>1.2 (using htslib 1.2.1)</version>
+    	<command_line><![CDATA[ samtools mpileup -d 100000 PIPE8.bam > PIPE9.txt]]></command_line>
+    	<loop>no looping</loop>
+    	<comment>Converting to pileup format; -d caps the number of reads considered per position at 100000</comment>
+    </tool>
+    <tool>
+    	<name>awk</name>
+    	<version>4.0.1</version>
+    	<command_line><![CDATA[ awk '{c+=1; a+= ($4-a)/c} END{print a}' PIPE9.txt > {OUTNAME}.rawCov ]]></command_line>
+    	<loop>no looping</loop>
+    	<comment>calculating the average</comment>
+    </tool>
+    <tool>
+    	<name>samtools</name>
+    	<version>1.2 (using htslib 1.2.1)</version>
+    	<command_line><![CDATA[ samtools flagstat {OUTNAME}.bam > {OUTNAME}.flagstat]]></command_line>
+    	<loop>no looping</loop>
+    	<comment>generating the flagstat file</comment>
+    </tool>
+  </software>
+</process>
diff --git a/docs/alignment/genome/GALv1.xml b/docs/alignment/genome/GALv1.xml
new file mode 100644
index 0000000..bef8a57
--- /dev/null
+++ b/docs/alignment/genome/GALv1.xml
@@ -0,0 +1,211 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<?xml-stylesheet type="text/css" href="http://deep.mpi-inf.mpg.de/DAC/files/style/deep_process_style.css"?>
+<process>
+	<name>GAL</name>
+	<version>1</version>
+	<author>
+		<name>Barbara Hutter</name>
+		<email>b.hutter@dkfz.de</email>
+	</author>
+	<!-- Following section: free text description of process (what, how, why) -->
+	<description>
+		* mapping of raw sequences to the reference genome
+		- N pairs of fastq files that are processed into bam files separately and merged into one at the end
+	</description>
+	<!-- Following section: list input files [samples to be analysed and similar] -->
+	<inputs>
+		<filetype>
+			<identifier>SampleID_R1</identifier>
+			<format>FASTQ</format>
+			<quantity>collection</quantity>
+			<comment>raw input file with forward read of the pair ("read1"), pre-filtered for illumina chastity filter failed reads</comment>
+		</filetype>
+		<filetype>
+			<identifier>SampleID_R2</identifier>
+			<format>FASTQ</format>
+			<quantity>collection</quantity>
+			<comment>raw input file with reverse read of the pair ("read2"), pre-filtered for illumina chastity filter failed reads</comment>
+		</filetype>
+	</inputs>
+	<references>
+		<filetype>
+			<identifier>reference_genome</identifier>
+			<format>FASTA</format>
+			<quantity>single</quantity>
+			<comment>The reference genome file; see aspera.dkfz.de > download > results > references > genomes > human/mouse > WholeGenome</comment>
+		</filetype>
+	</references>
+	<!-- Following section: list output files [results produced by the process] -->
+	<outputs>
+		<filetype>
+			<identifier>DEEPID.PROC.DATE.bam</identifier>
+			<format>BAM</format>
+			<quantity>single</quantity>
+			<comment>the bam file merged from all input fastq files, duplicates are marked</comment>
+		</filetype>
+		<filetype>
+			<identifier>DEEPID.PROC.DATE.bai</identifier>
+			<format>BAI</format>
+			<quantity>single</quantity>
+			<comment>Corresponding BAM index file, produced during merging and duplicate marking</comment>
+		</filetype>
+		<filetype>
+			<identifier>DEEPID.PROC.DATE.flagstats</identifier>
+			<format>text</format>
+			<quantity>single</quantity>
+			<comment>simple alignment statistics of the merged, duplicate marked bam</comment>
+		</filetype>
+		<filetype>
+			<identifier>DEEPID.PROC.DATE.QcSummary</identifier>
+			<format>text</format>
+			<quantity>single</quantity>
+			<comment>A summary of alignment statistics such as number of reads, percent of aligned reads, coverage of the genome, duplication level, etc.</comment>
+		</filetype>
+		<filetype>
+			<identifier>DEEPID.PROC.DATE.PicardMarkDupmetrics</identifier>
+			<format>text</format>
+			<quantity>single</quantity>
+			<comment>Produced by Picard MarkDuplicates (METRICS_FILE)</comment>
+		</filetype>
+		<filetype>
+			<identifier>DEEPID.PROC.DATE.PicardAlignmentSummarymetrics</identifier>
+			<format>text</format>
+			<quantity>single</quantity>
+			<comment>produced by Picard CollectMultipleMetrics</comment>
+		</filetype>
+		<filetype>
+			<identifier>DEEPID.PROC.DATE.PicardInsertSizemetrics</identifier>
+			<format>text</format>
+			<quantity>single</quantity>
+			<comment>produced by Picard CollectMultipleMetrics</comment>
+		</filetype>
+		<filetype>
+			<identifier>DEEPID.PROC.DATE.PicardQualityByCyclemetrics</identifier>
+			<format>text</format>
+			<quantity>single</quantity>
+			<comment>produced by Picard CollectMultipleMetrics</comment>
+		</filetype>
+		<filetype>
+			<identifier>DEEPID.PROC.DATE.PicardQualityDistributionmetrics</identifier>
+			<format>text</format>
+			<quantity>single</quantity>
+			<comment>produced by Picard CollectMultipleMetrics</comment>
+		</filetype>
+		<filetype>
+			<identifier>DEEPID.PROC.DATE.PicardInsertSizeHistogram</identifier>
+			<format>PDF</format>
+			<quantity>single</quantity>
+			<comment>produced by Picard CollectMultipleMetrics</comment>
+		</filetype>
+		<filetype>
+			<identifier>DEEPID.PROC.DATE.PicardQualityByCyclemetrics</identifier>
+			<format>PDF</format>
+			<quantity>single</quantity>
+			<comment>produced by Picard CollectMultipleMetrics</comment>
+		</filetype>
+		<filetype>
+			<identifier>DEEPID.PROC.DATE.PicardQualityDistributionmetrics</identifier>
+			<format>PDF</format>
+			<quantity>single</quantity>
+			<comment>produced by Picard CollectMultipleMetrics</comment>
+		</filetype>
+	</outputs>
+	<software>
+		<tool>
+			<name>bwa</name>
+			<version>cnybwa-0.6.2</version>
+			<command_line><![CDATA[ cnybwa-0.6.2 aln -t 12 -q 20 {reference_genome} {SampleID_R*} > sampleID_R*.sai ]]></command_line>
+            <loop>SampleID_R*</loop>
+			<comment>production of an intermediate .sai file for each read1 and read2 fastq file, performed on convey machines. cnybwa-0.6.2 is a hardware re-implementation of bwa version 0.6.2. -t is the number of threads, -q the parameter for iterative quality trimming of the read down to 35 bp</comment>
+		</tool>
+		<tool>
+			<name>bwa</name>
+			<version>0.6.2-tpx</version>
+			<command_line>
+				<![CDATA[
+					bwa sampe -P -a 1000 -T -t 8 -r readgroupinformation {reference_genome}
+					sampleID_R1.sai sampleID_R2.sai {SampleID_R1} {SampleID_R2} > sampleID_Sampe_output
+				]]>
+			</command_line>
+            <loop>SampleID_R*</loop>
+			<comment>
+				Pairing of reads to SAM format, piped to next step (samtools view).
+				Parameters: -a to set maximum insert size to 1000 bp, -t number of threads, -P pre-load index, -T use original buffer size.
+				The parameter readgroupinformation is initialized in the script as "@RG\tID:$ID\tSM:$SM\tLB:$LB\tPL:ILLUMINA",
+				where $ID is composed of run and lane (e.g. run140918_SN7001180_0145_C451VACXX_44_Mm08_WEAd_Db1_H3K9me3_F_1_ACAGTG_L001),
+				$SM the sampletype (e.g. sample_replicate1-H3K9me3_44_Mm08_WEAd_Db1),
+				and $LB the library (e.g. replicate1-H3K9me3_44_Mm08_WEAd_Db1).
+				These variables are constructed according to the file path of the fastq files.
+			</comment>
+		</tool>
+		<tool>
+			<name>samtools</name>
+			<version>0.1.19</version>
+			<command_line><![CDATA[ cat sampleID_Sampe_output | samtools view -uSbh - | samtools sort -o - > bamfile ]]></command_line>
+            <loop>sampleID_Sampe_output</loop>
+			<comment>Input piped from previous step (bwa sampe), conversion of SAM to BAM and sorting by coordinate</comment>
+		</tool>
+		<tool>
+			<name>Picard</name>
+			<version>1.125</version>
+			<command_line>
+				<![CDATA[
+					java8 -Xmx50G -jar picard-tools-1.125.jar MarkDuplicates I=bamfile*
+					OUTPUT={DEEPID.PROC.DATE.bam} VALIDATION_STRINGENCY=SILENT REMOVE_DUPLICATES=FALSE
+					ASSUME_SORTED=TRUE CREATE_INDEX=TRUE MAX_RECORDS_IN_RAM=12500000
+					METRICS_FILE={DEEPID.PROC.DATE.PicardMarkDupmetrics}
+				]]>
+			</command_line>
+            <loop>no looping</loop>
+			<comment>
+				Merging of per-lane bam files, marking of duplicates and index creation {DEEPID.PROC.DATE.bai}.
+				The Picard commandline gets I=bamfile for each bam file as input, which is simplified above in the command line.
+				Was previously version 1.61, from January 2015 on version 1.125
+			</comment>
+		</tool>
+		<tool>
+			<name>samtools</name>
+			<version>0.1.19</version>
+			<command_line><![CDATA[ samtools flagstat {DEEPID.PROC.DATE.bam} > {DEEPID.PROC.DATE.flagstats} ]]></command_line>
+            <loop>no looping</loop>
+			<comment>simple alignment statistics of the merged, duplicate marked bam</comment>
+		</tool>
+	
+		<tool>
+			<name>Picard</name>
+			<version>1.61</version>
+			<command_line>
+				<![CDATA[
+					java -Xmx4G -cp picard-tools-1.61.jar -jar CollectMultipleMetrics.jar INPUT={DEEPID.PROC.DATE.bam}
+					REFERENCE_SEQUENCE={reference_genome} ASSUME_SORTED=true VALIDATION_STRINGENCY=SILENT
+					OUTPUT={DEEPID.PROC.DATE.Picard*} PROGRAM=CollectAlignmentSummaryMetrics
+					PROGRAM=CollectInsertSizeMetrics PROGRAM=QualityScoreDistribution PROGRAM=MeanQualityByCycle
+				]]>
+			</command_line>
+            <loop>no looping</loop>
+			<comment>
+				creates several output files:
+				DEEPID.PROC.DATE.PicardAlignmentSummarymetrics (text)
+				DEEPID.PROC.DATE.PicardInsertSizemetrics (text) and DEEPID.PROC.DATE.PicardInsertSizeHistogram (PDF)
+				DEEPID.PROC.DATE.PicardQualityByCyclemetrics (text)
+				DEEPID.PROC.DATE.PicardQualityDistributionmetrics (text)
+				DEEPID.PROC.DATE.PicardQualityByCyclemetrics (PDF)
+				DEEPID.PROC.DATE.PicardQualityDistributionmetrics (PDF)
+			</comment>
+		</tool>
+		<tool>
+			<name>QCsummary</name>
+			<version>n/a</version>
+			<command_line>
+				<![CDATA[
+					perl writeQCsummary.pl -c samplesID.coverage.txt -d samplesID.diffchrom.txt
+					-f {DEEPID.PROC.DATE.flagstats} -i samplesID.insertsize.txt
+					-m {DEEPID.PROC.DATE.PicardMarkDupmetrics} -l "genome" -r "all_merged"
+					-p sampleID -s sampletype > {DEEPID.PROC.DATE.QcSummary}
+				]]>
+			</command_line>
+            <loop>no looping</loop>
+			<comment>a custom perl script that also reads in files that are not relevant for DEEP since it is part of the DKFZ whole genome pipeline.</comment>
+		</tool>
+	</software>
+</process>