Skip to content

Commit

Permalink
FIX: reworked most of the process file to make it validate against th…
Browse files Browse the repository at this point in the history
…e schema - still some issues like lacking description and, apparently, unnecessary documentation of all intermediate/temp files
  • Loading branch information
pebert committed Dec 29, 2016
1 parent 6045b58 commit afc6e25
Showing 1 changed file with 237 additions and 0 deletions.
237 changes: 237 additions & 0 deletions docs/alignment/bisulfite/BALv1.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,237 @@
<?xml version="1.0" encoding="UTF-8"?>
<?xml-stylesheet type="text/css" href="http://deep.mpi-inf.mpg.de/DAC/files/style/deep_process_style.css"?>
<!-- The encoding is declared explicitly; UTF-8 is also what an XML parser assumes when no encoding is declared and no byte order mark is present. The stylesheet instruction above only attaches a CSS file for display. -->
<process>
<!-- Process identity: name, version and author contact -->
<name>BAL</name>
<version>1</version>
<author>
  <name>Charles Imbusch</name>
  <email>c.imbusch@dkfz.de</email>
</author>
<!-- Precise description of what this process does, what output is generated and what statistics are computed -->
<!-- The bullet points after the first one are derived from the tool and output sections of this file; keep them in sync when a tool is added or removed. -->
<description>
* trimming, in silico conversion of reads, mapping, re-converting reads, flagstats, QC after mapping
* steps: adaptor trimming of the paired reads (SeqPrep), in silico bisulfite conversion of the reads (methylCtools fqconv), alignment to the in silico converted reference genome (bwa aln, bwa sampe), re-conversion of the aligned reads (methylCtools bconv), sorting by coordinate (samtools), merging of lanes, marking of duplicates and index creation (Picard MarkDuplicates), methylation calling per chromosome (methylCtools, bcall.py)
* output: final BAM file with reconverted Cs and its index, methylation calls per chromosome with tabix index
* statistics: samtools flagstat, Picard MarkDuplicates metrics, Picard CollectMultipleMetrics (alignment summary, insert size, quality score distribution, mean quality by cycle), re-conversion metrics, methylation call metrics per chromosome
</description>
<!-- Input section: one filetype entry per raw FASTQ file (R1 and R2) of the sample to be analysed -->
<inputs>
  <filetype>
    <identifier>SampleID_R1</identifier>
    <format>FASTQ</format>
    <quantity>single</quantity>
    <comment>raw input file, pre-filtered for Illumina chastity filter failed reads</comment>
  </filetype>
  <filetype>
    <identifier>SampleID_R2</identifier>
    <format>FASTQ</format>
    <quantity>single</quantity>
    <comment>raw input file, pre-filtered for Illumina chastity filter failed reads</comment>
  </filetype>
</inputs>

<!-- Following section: list reference files [e.g. reference genomes] used in this process -->
<references>
  <!-- Alignment target, referenced as {reference_genome} by the bwa and Picard command lines -->
  <filetype>
    <identifier>reference_genome</identifier>
    <format>FASTA</format>
    <quantity>single</quantity>
    <comment>The in silico bisulfite converted reference genome file</comment>
  </filetype>
  <!-- Position index, referenced as {reference_genome.pos} by the methylation calling step; the contexts are CG and CH, matching the CG_CH part of the methylation call identifiers -->
  <filetype>
    <identifier>reference_genome.pos</identifier>
    <format>text</format>
    <quantity>single</quantity>
    <comment>CG/CH positions in the reference genome created by 'methylCtools fapos'</comment>
  </filetype>
</references>

<!-- Following section: list output files of process [e.g. bed files, wiggle tracks] -->
<outputs>
  <!-- Final alignment and its index (written by the Picard MarkDuplicates step) -->
  <filetype>
    <identifier>DEEPID.PROC.DATE.aln</identifier>
    <format>BAM</format>
    <quantity>single</quantity>
    <comment>Final Bam file with reconverted Cs</comment>
  </filetype>
  <filetype>
    <identifier>DEEPID.PROC.DATE.idx</identifier>
    <format>BAI</format>
    <quantity>single</quantity>
    <comment>Corresponding BAM index file</comment>
  </filetype>
  <!-- Per-read-file results of bwa aln -->
  <filetype>
    <identifier>DEEPID.PROC.DATE.R1.aln</identifier>
    <format>SAI</format>
    <quantity>single</quantity>
    <comment>aligned reads, output from bwa</comment>
  </filetype>
  <filetype>
    <identifier>DEEPID.PROC.DATE.R2.aln</identifier>
    <format>SAI</format>
    <quantity>single</quantity>
    <comment>aligned reads, output from bwa</comment>
  </filetype>
  <!-- Trimmed and in silico converted reads -->
  <filetype>
    <identifier>DEEPID.PROC.DATE.R1.trim</identifier>
    <format>FASTQ</format>
    <quantity>single</quantity>
    <comment>Adaptor trimmed fastq file</comment>
  </filetype>
  <filetype>
    <identifier>DEEPID.PROC.DATE.R2.trim</identifier>
    <format>FASTQ</format>
    <quantity>single</quantity>
    <comment>Adaptor trimmed fastq file</comment>
  </filetype>
  <filetype>
    <identifier>DEEPID.PROC.DATE.R1.conv</identifier>
    <format>FASTQ</format>
    <quantity>single</quantity>
    <comment>In silico converted fastq file</comment>
  </filetype>
  <filetype>
    <identifier>DEEPID.PROC.DATE.R2.conv</identifier>
    <format>FASTQ</format>
    <quantity>single</quantity>
    <comment>In silico converted fastq file</comment>
  </filetype>
  <!-- Statistics on the final alignment -->
  <filetype>
    <identifier>DEEPID.PROC.DATE.flagstats</identifier>
    <format>text</format>
    <quantity>single</quantity>
    <comment>samtools flagstat</comment>
  </filetype>
  <filetype>
    <identifier>DEEPID.PROC.DATE.PicardMarkDupmetrics</identifier>
    <format>text</format>
    <quantity>single</quantity>
    <comment>Duplication metrics written by Picard MarkDuplicates</comment>
  </filetype>
  <filetype>
    <identifier>DEEPID.PROC.DATE.PicardInsertSizemetrics</identifier>
    <format>text</format>
    <quantity>single</quantity>
    <comment>Output of Picard CollectMultipleMetrics (several files: alignment summary, insert size, quality score distribution, mean quality by cycle)</comment>
  </filetype>
  <!-- Methylation calling results, one set per chromosome -->
  <filetype>
    <identifier>DEEPID.PROC.DATE.CHROM_CG_CH.mcall</identifier>
    <format>methylation calls</format>
    <quantity>collection</quantity>
    <comment>The methylation calls separated per chromosome</comment>
  </filetype>
  <filetype>
    <identifier>DEEPID.PROC.DATE.CHROM_CG_CH.mcall.tbi</identifier>
    <format>tabix index</format>
    <quantity>collection</quantity>
    <comment>The tabix index for the methylation calls of each chromosome</comment>
  </filetype>
  <filetype>
    <identifier>DEEPID.PROC.DATE.CHROM_CG_CH.mcall.metrics</identifier>
    <format>text</format>
    <quantity>collection</quantity>
    <comment>Metrics for methylation calls of each chromosome</comment>
  </filetype>
  <filetype>
    <identifier>DEEPID.PROC.DATE.reconversion.metrics</identifier>
    <format>text</format>
    <quantity>single</quantity>
    <comment>Metrics on re-conversion</comment>
  </filetype>
</outputs>

<software>
<!-- Adaptor trimming of both read files in one SeqPrep call. The adaptor sequence is passed through the -A option (forward read adaptor); a bare positional sequence is not part of the SeqPrep usage. -->
<tool>
  <name>seqprep</name>
  <version>0.4</version>
  <command_line><![CDATA[ SeqPrep -A AGATCGGAAGAGCGGTTCAG -f {SampleID_R1} -r {SampleID_R2} -1 {DEEPID.PROC.DATE.R1.trim} -2 {DEEPID.PROC.DATE.R2.trim} ]]></command_line>
  <loop>SampleID_R*</loop>
  <comment>trim reads by default adaptor</comment>
</tool>
<!-- In silico bisulfite conversion: fqconv is run for each trimmed FASTQ file and produces the matching .conv file -->
<tool>
  <name>methylCtools</name>
  <version>0.9.2</version>
  <command_line><![CDATA[ methylCtools fqconv {DEEPID.PROC.DATE.R*.trim} {DEEPID.PROC.DATE.R*.conv} ]]></command_line>
  <loop>DEEPID.PROC.DATE.R*.trim</loop>
  <comment>bisulfite convert reads in silico</comment>
</tool>
<!-- Alignment of each converted FASTQ file with bwa aln; standard output is redirected into the matching .aln file (declared with format SAI in the outputs section) -->
<tool>
  <name>bwa</name>
  <version>cnybwa-0.6.2</version>
  <command_line><![CDATA[ cnybwa-0.6.2 aln -q 20 -t 8 -I {reference_genome} {DEEPID.PROC.DATE.R*.conv} > {DEEPID.PROC.DATE.R*.aln} ]]></command_line>
  <loop>DEEPID.PROC.DATE.R*.conv</loop>
  <comment>produce two intermediate .sai files per lane, performed on convey machines</comment>
</tool>
<!-- Pairing step: bwa sampe combines the two bwa aln results with the two converted FASTQ files and writes sampe_output, which the following methylCtools step reads. The -r value is a read group header line; ${ID}, ${SM} and ${LB} are placeholders that are not defined anywhere in this file. NOTE(review): the loop element names the R*.aln files although one call consumes R1 and R2 together; confirm the intended loop semantics. -->
<tool>
<name>bwa</name>
<version>0.6.2-tpx</version>
<command_line>
<![CDATA[
bwa sampe -t 8 -T -s -P -n 0 -N 0 -r "@RG\tID:${ID}\tSM:${SM}\tLB:${LB}\tPL:ILLUMINA"
{reference_genome} {DEEPID.PROC.DATE.R1.aln} {DEEPID.PROC.DATE.R2.aln} {DEEPID.PROC.DATE.R1.conv}
{DEEPID.PROC.DATE.R2.conv} > sampe_output
]]>
</command_line>
<loop>DEEPID.PROC.DATE.R*.aln</loop>
<comment>pairing of reads to SAM format; output is piped to next step</comment>
</tool>
<!-- Re-conversion step: bconv reads sampe_output, writes methylCtools_reconverted.bam and stores its metrics in the reconversion metrics output file -->
<tool>
  <name>methylCtools</name>
  <version>0.9.2</version>
  <command_line><![CDATA[ methylCtools bconv --metrics {DEEPID.PROC.DATE.reconversion.metrics} sampe_output methylCtools_reconverted.bam ]]></command_line>
  <loop>sampe_output</loop>
  <comment>Input piped from previous step; reconversion step of methylCtools</comment>
</tool>
<!-- Coordinate sort: samtools view streams the re-converted alignments into samtools sort; this step is declared without looping -->
<tool>
  <name>samtools</name>
  <version>0.1.19</version>
  <command_line><![CDATA[ samtools view -Sbu methylCtools_reconverted.bam | samtools sort -o - sorted_lane_bamfile ]]></command_line>
  <loop>no looping</loop>
  <comment>Sorting BAM by coordinate</comment>
</tool>
<!-- Produces the final BAM: Picard MarkDuplicates reads the sorted lane BAM file(s), keeps duplicates in the output (REMOVE_DUPLICATES=FALSE), requests an index (CREATE_INDEX=TRUE) and writes its metrics to the PicardMarkDupmetrics output. NOTE(review): as written the java call has no -jar option in front of the jar file; the tool comment states the command line is simplified, so confirm against the real invocation before copying it. -->
<tool>
<name>Picard</name>
<version>1.61</version>
<command_line>
<![CDATA[
java -Xmx50G picard-1.61.jar MarkDuplicates I=sorted_lane_bamfile OUTPUT={DEEPID.PROC.DATE.aln}
TMP_DIR={TMP_DIR} VALIDATION_STRINGENCY=SILENT REMOVE_DUPLICATES=FALSE ASSUME_SORTED=TRUE CREATE_INDEX=TRUE
MAX_RECORDS_IN_RAM=12500000 METRICS_FILE={DEEPID.PROC.DATE.PicardMarkDupmetrics}
]]>
</command_line>
<loop>no looping</loop>
<comment>Merging lanes, marking duplicates and index creation. The Picard commandline gets I=bamfile for each bam file as input which is simplified above in the command line.</comment>
</tool>
<!-- Alignment statistics: samtools flagstat is run once on the final BAM file and its standard output is redirected into the flagstats output file -->
<tool>
  <name>samtools</name>
  <version>0.1.19</version>
  <command_line><![CDATA[ samtools flagstat {DEEPID.PROC.DATE.aln} > {DEEPID.PROC.DATE.flagstats} ]]></command_line>
  <loop>no looping</loop>
  <comment>flagstat statistics of the final BAM file</comment>
</tool>
<!-- QC after mapping: Picard CollectMultipleMetrics is run on the final BAM file with four PROGRAM entries (alignment summary, insert size, quality score distribution, mean quality by cycle); OUTPUT is set to the PicardInsertSizemetrics identifier. NOTE(review): as written the java call has no -jar option in front of the jar file; confirm against the real invocation. -->
<tool>
<name>Picard</name>
<version>1.61</version>
<command_line>
<![CDATA[
java -Xmx4G picard-1.61.jar CollectMultipleMetrics INPUT={DEEPID.PROC.DATE.aln}
REFERENCE_SEQUENCE={reference_genome} ASSUME_SORTED=true VALIDATION_STRINGENCY=SILENT
OUTPUT={DEEPID.PROC.DATE.PicardInsertSizemetrics} PROGRAM=CollectAlignmentSummaryMetrics
PROGRAM=CollectInsertSizeMetrics PROGRAM=QualityScoreDistribution PROGRAM=MeanQualityByCycle
]]>
</command_line>
<loop>no looping</loop>
<comment>creates several output files</comment>
</tool>
<!-- Methylation calling: the command is repeated for every CHROM value (see the loop element); it reads the position index {reference_genome.pos} and the final BAM file and writes the methylation calls plus a metrics file for that chromosome. NOTE(review): the tool is named methylCtools while the command invokes bcall.py through python; confirm both refer to the same tool version. -->
<tool>
<name>methylCtools</name>
<version>0.9.2</version>
<command_line>
<![CDATA[
python bcall.py -r {CHROM} --trimPE --snv --zero -e 5 {reference_genome.pos}
-m {DEEPID.PROC.DATE.CHROM_CG_CH.mcall.metrics} {DEEPID.PROC.DATE.aln}
{DEEPID.PROC.DATE.CHROM_CG_CH.mcall}
]]>
</command_line>
<loop>CHROM</loop>
<comment>Creates methylation calls for each chromosome</comment>
</tool>
</software>
</process>

0 comments on commit afc6e25

Please sign in to comment.