<?xml version="1.0" encoding="UTF-8"?>
<!-- docs/interpretation/CSSv1.xml -->
<?xml-stylesheet type="text/css" href="http://deep.mpi-inf.mpg.de/DAC/files/style/deep_process_style.css"?>
<process>
    <name>CSS</name>
	<version>1</version>
	<author>
		<name>Peter Ebert</name>
		<email>pebert@mpi-inf.mpg.de</email>
	</author>
    <description>
		This process file is in draft status.
        The CSS process is designed to generate basic chromatin state segmentation BED files based on histone modifications only.
        This process implements a dual strategy to produce the state segmentation: ChromHMM (by Jason Ernst) is used to generate a segmentation
        comparable to the reference segmentations provided by the ROADMAP project and thus forms the basis for the automated
        state labelling in this process. Additionally, EpiCSeg (by Alessandro Mammana) is used as a more sophisticated method for state
        segmentation. Since there is no reference state labelling available for EpiCSeg, the state labels for EpiCSeg are produced via a simple
        overlap analysis followed by a majority vote based on the labelled ChromHMM segmentation.
	</description>
	<!-- Input file types: a collection of histone mark BAM files and a single Input BAM file. -->
	<inputs>
		<filetype>
			<identifier>GALvX_Histone</identifier>
			<format>BAM</format>
			<quantity>collection</quantity>
			<comment>To run this process as a default, all six marks need to be available for a sample</comment>
		</filetype>
		<filetype>
			<identifier>GALvX_Input</identifier>
			<format>BAM</format>
			<quantity>single</quantity>
			<comment></comment>
		</filetype>
	</inputs>
	<!-- Reference files: assembly chromosome lengths, ROADMAP state labels and colors,
	     blacklist regions, and annotation files for the state overlap report. -->
	<references>
		<filetype>
			<identifier>chrom_lengths</identifier>
			<format>TXT</format>
			<quantity>single</quantity>
			<comment>Common 2 column file listing chromosomes (name and length) for assembly</comment>
		</filetype>
		<filetype>
			<identifier>state_labels</identifier>
			<format>TXT</format>
			<quantity>single</quantity>
			<comment>ROADMAP reference state labels (18 states, 6 histone marks)</comment>
		</filetype>
		<filetype>
			<identifier>state_colors</identifier>
			<format>TXT</format>
			<quantity>single</quantity>
			<comment>ROADMAP reference state colors</comment>
		</filetype>
		<filetype>
			<identifier>blacklist_regions</identifier>
			<format>BED</format>
			<quantity>single</quantity>
			<comment>Common blacklist regions, same as for CHP or DHS</comment>
		</filetype>
		<filetype>
			<identifier>var_ref_files</identifier>
			<format>BED</format>
			<quantity>collection</quantity>
			<comment>It is still undecided if a fixed set of annotation files should be used for the
            reporting feature of the tools (state overlap); due to the automated state labelling procedure,
            this is not necessary</comment>
		</filetype>
	</references>
	<!-- Output file types: the labelled segmentations, the EpiCSeg model and the packaged report files. -->
	<outputs>
		<filetype>
			<identifier>state_segmentation</identifier>
			<format>BED</format>
			<quantity>collection</quantity>
			<comment>segmentations produced by ChromHMM and EpiCSeg and post-processed to be properly labelled</comment>
		</filetype>
		<filetype>
			<identifier>model</identifier>
			<format>TXT</format>
			<quantity>collection</quantity>
			<comment>EpiCSeg model</comment>
		</filetype>
		<filetype>
			<identifier>reports</identifier>
			<format>tar.gz</format>
			<quantity>collection</quantity>
			<comment>combined set of all other output files that are part of the tool's report</comment>
		</filetype>
	</outputs>
	<software>
		<!-- Duplicate removal: -F 1024 excludes reads carrying the SAM duplicate flag (0x400). -->
		<tool>
			<name>samtools</name>
			<version>1.2</version>
			<command_line><![CDATA[ samtools view -b -F 1024 {GALvX_*} > tmp_nodup.bam ]]></command_line>
			<loop>GALvX_Histone, GALvX_Input</loop>
			<comment>Remove duplicates</comment>
		</tool>
        <!-- Name-sort (-n) the duplicate-free BAM; no output file is given, so the result
             goes to stdout for the blacklist filter. -->
        <tool>
            <name>samtools</name>
            <version>1.2</version>
            <command_line><![CDATA[ samtools sort -n -T tmpsrt -O bam tmp_nodup.bam ]]></command_line>
            <loop>GALvX_Histone, GALvX_Input</loop>
            <comment>Sort prior to filtering blacklist, output is piped to next command</comment>
        </tool>
        <!-- Blacklist filter: "-type neither" keeps only pairs where neither end overlaps
             {blacklist_regions}; "-abam -" reads the BAM from stdin and -ubam writes
             uncompressed BAM. -->
        <tool>
            <name>bedtools</name>
            <version>2.20.1</version>
            <command_line><![CDATA[ pairToBed -ubam -type neither -abam - -b {blacklist_regions} ]]></command_line>
            <loop></loop>
            <comment>Filter blacklist regions, output is piped to next command</comment>
        </tool>
        <!-- Sort the filtered BAM read from stdin ("-") into tmp_nodup_blfilt.bam, then index it. -->
        <tool>
            <name>samtools</name>
            <version>1.2</version>
            <command_line><![CDATA[ samtools sort -o tmp_nodup_blfilt.bam -T tmpsrt2 -O bam - && samtools index tmp_nodup_blfilt.bam ]]></command_line>
            <loop></loop>
            <comment>Create filtered and indexed BAM file</comment>
        </tool>
        <!-- BAM to BED conversion; the egrep pattern keeps only records whose chromosome name
             consists of digits, X or Y (with an optional "chr" prefix). -->
        <tool>
            <name>bedtools</name>
            <version>2.20.1</version>
            <command_line><![CDATA[ bedtools bamtobed -i tmp_nodup_blfilt.bam | egrep "^(chr)?[0-9XY]+\s" > tmp_nodup_blfilt.bed ]]></command_line>
            <loop></loop>
            <comment>Make simple BED file as input for ChromHMM</comment>
        </tool>
        <!-- ChromHMM binarization; positional arguments are the chromosome length file, the
             input BED directory, the cell-mark table and the output directory (input and
             output both use wrk_tmp_dir). -->
        <tool>
            <name>java, ChromHMM.jar</name>
            <version>1.7.0_65, 1.10</version>
            <command_line><![CDATA[ ChromHMM.jar BinarizeBed {chrom_lengths} wrk_tmp_dir cellmarktable wrk_tmp_dir ]]></command_line>
            <loop></loop>
            <comment>The cellmarktable info file is built on-the-fly by the pipeline and discarded after the run</comment>
        </tool>
		<!-- ChromHMM segmentation with an existing model; positional arguments are the model
		     file, the directory holding the binarized data and the output directory.
		     NOTE(review): {bin_size} and {remc_model} are not declared under inputs or
		     references; confirm where the pipeline defines them. -->
		<tool>
			<name>java, ChromHMM.jar</name>
			<version>1.7.0_65, 1.10</version>
			<command_line><![CDATA[ ChromHMM.jar MakeSegmentation -printposterior -b {bin_size} {remc_model} wrk_tmp_dir wrk_out_dir ]]></command_line>
			<loop></loop>
			<comment>Apply the existing model ({remc_model}) to the binarized data in wrk_tmp_dir; the segmentation and the state posteriors (-printposterior) are written to wrk_out_dir</comment>
		</tool>
        <!-- EpiCSeg read counting; the -m argument is shown once here but is repeated for each
             mark and the Input when the command is assembled. -->
        <tool>
            <name>R, EpiCSeg</name>
            <version>3.2.0, 2016-04-04</version>
            <command_line><![CDATA[ Rscript epicseg.R getcounts --nthreads {ecs_parallel} --pairedend TRUE --regions tmp_regions --target {DEEPID.counts} -m tmp_nodup_blfilt.bam ]]></command_line>
            <loop>no looping</loop>
            <comment>The command is assembled via repeating the -m parameter (-m Mark:Filepath) for the marks/Input</comment>
        </tool>
        <!-- EpiCSeg normalization of the count file DEEPID.counts. -->
        <tool>
            <name>R, EpiCSeg</name>
            <version>3.2.0, 2016-04-04</version>
            <command_line><![CDATA[ Rscript epicseg.R normalizecounts --nthreads {ecs_parallel} -c DEEPID.counts ]]></command_line>
            <loop>no looping</loop>
            <comment>This normalization step is mandatory if a joined segmentation is done for several samples (e.g. for a whole sub-project);
            in the default case of a single sample, this has no relevant effect</comment>
        </tool>
        <!-- EpiCSeg segmentation with 18 states, the same number of states as the ROADMAP
             reference state labels (18 states, 6 histone marks). -->
        <tool>
            <name>R, EpiCSeg</name>
            <version>3.2.0, 2016-04-04</version>
            <command_line><![CDATA[ Rscript epicseg.R segment --nthreads {ecs_parallel} --nstates 18 --outdir wrk_out_dir --regions tmp_regions -c DEEPID.counts.norm ]]></command_line>
            <loop>no looping</loop>
            <comment></comment>
        </tool>
		<!-- Overlap of the EpiCSeg segmentation (-a) with the ChromHMM segmentation (-b);
		     -wo reports both original records plus the number of overlapping base pairs.
		     The result ecs_cmm_isect.tmp is the input of the "corres" task of process_css.py. -->
		<tool>
			<name>bedtools</name>
			<version>2.20.1</version>
			<command_line><![CDATA[ bedtools intersect -wo -a {ecs_segmentation} -b {cmm_segmentation} > ecs_cmm_isect.tmp ]]></command_line>
			<loop>EpiCSeg and ChromHMM segmentations</loop>
			<comment></comment>
		</tool>
        <!-- State correspondence: reads the intersect output ecs_cmm_isect.tmp and writes the
             EpiCSeg-to-ChromHMM label mapping file ecs_cmm_state_corres.tsv. -->
        <tool>
            <name>Python3</name>
            <version>3.2.3</version>
            <command_line><![CDATA[ process_css.py --task corres -i ecs_cmm_isect.tmp -o ecs_cmm_state_corres.tsv]]></command_line>
            <loop>no looping</loop>
            <comment>Assign labels for the EpiCSeg segmentation based on overlap with ChromHMM segmentation. This command outputs a label mapping file</comment>
        </tool>
        <!-- EpiCSeg browser track: applies the label mapping file ecs_cmm_state_corres.tsv
             to the EpiCSeg segmentation. -->
        <tool>
            <name>Python3</name>
            <version>3.2.3</version>
            <command_line><![CDATA[ process_css.py --task track -i ecs_segment.tmp -o {state_segmentation} --state-corres ecs_cmm_state_corres.tsv --track-name "SAMPLEID (ECS)" --track-desc "SAMPLEID CSSvX EpiCSeg"]]></command_line>
            <loop>no looping</loop>
            <comment>Assign labels for the EpiCSeg segmentation based on overlap with ChromHMM segmentation. Produces a browsertrack BED file to be loaded into, e.g., IGV</comment>
        </tool>
		<!-- ChromHMM browser track: unlike the EpiCSeg track, labels and colors are taken
		     directly from the reference files {state_labels} and {state_colors}; no state
		     correspondence file is involved. -->
		<tool>
			<name>Python3</name>
			<version>3.2.3</version>
			<command_line><![CDATA[ process_css.py --task track -i cmm_segment.tmp -o {state_segmentation} --color-map {state_colors} --label-map {state_labels} --track-name "SAMPLEID (CMM)" --track-desc "SAMPLEID CSSvX ChromHMM"]]></command_line>
			<loop>no looping</loop>
			<comment>Assign labels and colors for the ChromHMM segmentation based on the ROADMAP reference state labels and state colors. Produces a browsertrack BED file to be loaded into, e.g., IGV</comment>
		</tool>
        <!-- Placeholder entry: name, version and command line are not defined yet. -->
        <tool>
            <name>Unspec</name>
            <version>0.0</version>
            <command_line><![CDATA[ package model outputs ]]></command_line>
            <loop>no looping</loop>
            <comment>TODO: define the command for packaging the various output files into 2 gzip files</comment>
        </tool>
	</software>
</process>