THBv2.xml

<?xml version="1.0"?>
<?xml-stylesheet type="text/css" href="http://deep.mpi-inf.mpg.de/DAC/files/style/deep_process_style.css"?>
<process>
    <!-- Identity of this process description: name, version and author contact. -->
    <name>THB</name>
    <version>2</version>
    <author>
        <name>Peter Ebert</name>
        <email>pebert@mpi-inf.mpg.de</email>
    </author>
    <description>
		This process merely describes the conversion - not production - of DEEP data files into an IHEC compatible format.
        If you have any questions about the actual data, please refer to the process XML files describing
        the data production and contact the author named in the respective file. The trackhub conversion process
        describes the conversion of standardized DEEP process output files into one of the BIG formats
        needed to submit the data as IHEC track hub. Since the reference assemblies used by IHEC are different
        to the ones used by DEEP, the conversion consists of the following steps:
        (i) filter data files for chromosomes 1-22 (hsa)/1-19 (mmu) and X,
        (ii) add &quot;chr&quot; prefix to chromosome names and
        (iii) for all BED or BED-like files, ensure that these represent a regular BED6+ file; in particular,
        the &quot;score&quot; column is adjusted by default to be in the range 0-1000 (for details about the
        formats used, please refer to https://genome.ucsc.edu/FAQ/FAQformat.html).
        The adjustment works as follows:
        select one meaningful column (e.g. coverage, signal enrichment or similar), bin the data according
        to the gray shading schema used by the UCSC genome browser (see link above) and then assign fixed score
        values according to the binning.
	</description>
	<!-- Input file collections; each identifier is used as a {placeholder} in the software commands. -->
	<inputs>
		<!-- Signal tracks, consumed by bigWigToBedGraph. -->
		<filetype>
			<identifier>DEEP_bigwig</identifier>
			<format>bigWig</format>
			<quantity>collection</quantity>
			<comment>bigWig output of a standardized DEEP process (libraries: histone, DNase, NOMe, WGBS; only raw/unfiltered signal tracks for histone and DNase libs)</comment>
		</filetype>
		<!-- Region files, consumed by the egrep / gunzip filter commands. -->
		<filetype>
			<identifier>DEEP_bed</identifier>
			<format>BED or BED-like</format>
			<quantity>collection</quantity>
			<comment>BED or BED-like output of a standardized DEEP process; comprises histone, DNase and NOMe peaks and expressed small/long RNAs</comment>
		</filetype>
	</inputs>
	<!-- Auxiliary reference files; each identifier is used as a {placeholder} in the software commands. -->
	<references>
		<!-- Passed to both bedGraphToBigWig and bedToBigBed. -->
		<filetype>
			<identifier>chrom_sizes</identifier>
			<format>table</format>
			<quantity>collection</quantity>
			<comment>Common files containing information about the chromosome sizes for the respective assemblies</comment>
		</filetype>
		<!-- Passed to bedToBigBed through its -as= option. -->
		<filetype>
			<identifier>field_names</identifier>
			<format>AutoSQL</format>
			<quantity>collection</quantity>
			<comment>AutoSQL files describing the different BED files: narrowPeak, broadPeak, gNOMePeak, snRNAexpr, longRNAexpr</comment>
		</filetype>
	</references>
	<!-- Final products of the conversion; each identifier is the output argument of one software command. -->
	<outputs>
		<!-- Written by bedToBigBed. -->
		<filetype>
			<identifier>DEEPID.PROC.DATE.bigBed</identifier>
			<format>bigBed</format>
			<quantity>collection</quantity>
			<comment>Converted BED or BED-like files</comment>
		</filetype>
		<!-- Written by bedGraphToBigWig. -->
		<filetype>
			<identifier>DEEPID.PROC.DATE.bigWig</identifier>
			<format>bigWig</format>
			<quantity>collection</quantity>
			<comment>Converted bigWig files</comment>
		</filetype>
	</outputs>
	<!-- Commands run by the conversion; {NAME} placeholders refer to identifiers declared under inputs, references and outputs. -->
	<software>
		<!-- Signal tracks, step 1: dump the bigWig as bedGraph, keep lines whose first field consists of digits and/or X, sort, prepend "chr". -->
		<tool>
			<name>bigWigToBedGraph, egrep, sort, sed</name>
			<version>4, 2.12, 8.13, 4.2.1</version>
			<command_line>
				<![CDATA[ bigWigToBedGraph {DEEP_bigwig} stdout | egrep "^[0-9X]+\s" | sort -V -k 1,2 | sed 's/^/chr/' > temp_signal.bg ]]>
			</command_line>
			<loop>DEEP_bigwig</loop>
			<comment>Filter all signal tracks and add prefix, make sure that output is sorted (should be by construction)</comment>
		</tool>
		<!-- Signal tracks, step 2: rebuild a bigWig from the filtered bedGraph using the chromosome sizes. -->
		<tool>
			<name>bedGraphToBigWig</name>
			<version>4</version>
			<command_line>
				<![CDATA[ bedGraphToBigWig temp_signal.bg {chrom_sizes} {DEEPID.PROC.DATE.bigWig} ]]>
			</command_line>
			<loop>temp_signal.bg</loop>
			<comment>Create final signal tracks</comment>
		</tool>
		<!-- Region files, step 1 (uncompressed input): same filter, sort and prefix as for the signal tracks. -->
		<tool>
			<name>egrep, sort, sed</name>
			<version>2.12, 8.13, 4.2.1</version>
			<command_line>
				<![CDATA[ egrep "^[0-9X]+\s" {DEEP_bed} | sort -V -k 1,2 | sed 's/^/chr/' > temp_region.bed ]]>
			</command_line>
			<loop>DEEP_bed</loop>
			<comment>Filter all uncompressed BED files and add prefix, make sure that output is sorted</comment>
		</tool>
		<!-- Region files, step 1 (gzipped input): decompress to stdout first, then the same filter, sort and prefix. -->
		<tool>
			<name>gzip, egrep, sort, sed</name>
			<version>1.5, 2.12, 8.13, 4.2.1</version>
			<command_line>
				<![CDATA[ gunzip -c {DEEP_bed} | egrep "^[0-9X]+\s" | sort -V -k 1,2 | sed 's/^/chr/' > temp_region.bed ]]>
			</command_line>
			<loop>DEEP_bed (gzipped)</loop>
			<comment>Filter all gzipped BED files and add prefix, make sure that output is sorted</comment>
		</tool>
		<!-- Region files, step 2: adjust the score column of the filtered BED file (binning rule is given in the description). -->
		<tool>
			<name>python3, numpy</name>
			<version>3.2.3, 1.6.2</version>
			<command_line>
				<![CDATA[ python3 adjust_score_column temp_region.bed ]]>
			</command_line>
			<loop>temp_region.bed</loop>
			<comment>Python3 function to adjust score column is implemented as part of the pipeline code and executed for all BED files by default</comment>
		</tool>
		<!-- Region files, step 3: build the bigBed; the value of n in bed6+n depends on the file type (listed in the comment element). -->
		<tool>
			<name>bedToBigBed</name>
			<version>2.6</version>
			<command_line>
				<![CDATA[ bedToBigBed -tab -type=bed6+n -as={field_names} temp_region.bed {chrom_sizes} {DEEPID.PROC.DATE.bigBed} ]]>
			</command_line>
			<loop>temp_region.bed</loop>
			<comment>Create final region files. n==1 for snRNA; n==3 for NOMe and broad peaks; n==4 for narrow peaks and long RNAs</comment>
		</tool>
	</software>
</process>
	<?xml version="1.0"?>
	<?xml-stylesheet type="text/css" href="http://deep.mpi-inf.mpg.de/DAC/files/style/deep_process_style.css"?>
	<process>
	<!-- Identity of this process description: name, version and author contact. -->
	<name>THB</name>
	<version>2</version>
	<author>
		<name>Peter Ebert</name>
		<email>pebert@mpi-inf.mpg.de</email>
	</author>
	<description>
	This process merely describes the conversion - not production - of DEEP data files into an IHEC compatible format.
	If you have any questions about the actual data, please refer to the process XML files describing
	the data production and contact the author named in the respective file. The trackhub conversion process
	describes the conversion of standardized DEEP process output files into one of the BIG formats
	needed to submit the data as IHEC track hub. Since the reference assemblies used by IHEC are different
	to the ones used by DEEP, the conversion consists of the following steps:
	(i) filter data files for chromosomes 1-22 (hsa)/1-19 (mmu) and X,
	(ii) add "chr" prefix to chromosome names and
	(iii) for all BED or BED-like files, ensure that these represent a regular BED6+ file; in particular,
	the "score" column is adjusted by default to be in the range 0-1000 (for details about the
	formats used, please refer to https://genome.ucsc.edu/FAQ/FAQformat.html).
	The adjustment works as follows:
	select one meaningful column (e.g. coverage, signal enrichment or similar), bin the data according
	to the gray shading schema used by the UCSC genome browser (see link above) and then assign fixed score
	values according to the binning.
	</description>
	<!-- Input file collections; each identifier is used as a {placeholder} in the software commands. -->
	<inputs>
		<!-- Signal tracks, consumed by bigWigToBedGraph. -->
		<filetype>
			<identifier>DEEP_bigwig</identifier>
			<format>bigWig</format>
			<quantity>collection</quantity>
			<comment>bigWig output of a standardized DEEP process (libraries: histone, DNase, NOMe, WGBS; only raw/unfiltered signal tracks for histone and DNase libs)</comment>
		</filetype>
		<!-- Region files, consumed by the egrep / gunzip filter commands. -->
		<filetype>
			<identifier>DEEP_bed</identifier>
			<format>BED or BED-like</format>
			<quantity>collection</quantity>
			<comment>BED or BED-like output of a standardized DEEP process; comprises histone, DNase and NOMe peaks and expressed small/long RNAs</comment>
		</filetype>
	</inputs>
	<!-- Auxiliary reference files; each identifier is used as a {placeholder} in the software commands. -->
	<references>
		<!-- Passed to both bedGraphToBigWig and bedToBigBed. -->
		<filetype>
			<identifier>chrom_sizes</identifier>
			<format>table</format>
			<quantity>collection</quantity>
			<comment>Common files containing information about the chromosome sizes for the respective assemblies</comment>
		</filetype>
		<!-- Passed to bedToBigBed through its -as= option. -->
		<filetype>
			<identifier>field_names</identifier>
			<format>AutoSQL</format>
			<quantity>collection</quantity>
			<comment>AutoSQL files describing the different BED files: narrowPeak, broadPeak, gNOMePeak, snRNAexpr, longRNAexpr</comment>
		</filetype>
	</references>
	<!-- Final products of the conversion; each identifier is the output argument of one software command. -->
	<outputs>
		<!-- Written by bedToBigBed. -->
		<filetype>
			<identifier>DEEPID.PROC.DATE.bigBed</identifier>
			<format>bigBed</format>
			<quantity>collection</quantity>
			<comment>Converted BED or BED-like files</comment>
		</filetype>
		<!-- Written by bedGraphToBigWig. -->
		<filetype>
			<identifier>DEEPID.PROC.DATE.bigWig</identifier>
			<format>bigWig</format>
			<quantity>collection</quantity>
			<comment>Converted bigWig files</comment>
		</filetype>
	</outputs>
	<!-- Commands run by the conversion; {NAME} placeholders refer to identifiers declared under inputs, references and outputs. -->
	<software>
		<!-- Signal tracks, step 1: dump the bigWig as bedGraph, keep lines whose first field consists of digits and/or X, sort, prepend "chr". -->
		<tool>
			<name>bigWigToBedGraph, egrep, sort, sed</name>
			<version>4, 2.12, 8.13, 4.2.1</version>
			<command_line>
				<![CDATA[ bigWigToBedGraph {DEEP_bigwig} stdout | egrep "^[0-9X]+\s" | sort -V -k 1,2 | sed 's/^/chr/' > temp_signal.bg ]]>
			</command_line>
			<loop>DEEP_bigwig</loop>
			<comment>Filter all signal tracks and add prefix, make sure that output is sorted (should be by construction)</comment>
		</tool>
		<!-- Signal tracks, step 2: rebuild a bigWig from the filtered bedGraph using the chromosome sizes. -->
		<tool>
			<name>bedGraphToBigWig</name>
			<version>4</version>
			<command_line>
				<![CDATA[ bedGraphToBigWig temp_signal.bg {chrom_sizes} {DEEPID.PROC.DATE.bigWig} ]]>
			</command_line>
			<loop>temp_signal.bg</loop>
			<comment>Create final signal tracks</comment>
		</tool>
		<!-- Region files, step 1 (uncompressed input): same filter, sort and prefix as for the signal tracks. -->
		<tool>
			<name>egrep, sort, sed</name>
			<version>2.12, 8.13, 4.2.1</version>
			<command_line>
				<![CDATA[ egrep "^[0-9X]+\s" {DEEP_bed} | sort -V -k 1,2 | sed 's/^/chr/' > temp_region.bed ]]>
			</command_line>
			<loop>DEEP_bed</loop>
			<comment>Filter all uncompressed BED files and add prefix, make sure that output is sorted</comment>
		</tool>
		<!-- Region files, step 1 (gzipped input): decompress to stdout first, then the same filter, sort and prefix. -->
		<tool>
			<name>gzip, egrep, sort, sed</name>
			<version>1.5, 2.12, 8.13, 4.2.1</version>
			<command_line>
				<![CDATA[ gunzip -c {DEEP_bed} | egrep "^[0-9X]+\s" | sort -V -k 1,2 | sed 's/^/chr/' > temp_region.bed ]]>
			</command_line>
			<loop>DEEP_bed (gzipped)</loop>
			<comment>Filter all gzipped BED files and add prefix, make sure that output is sorted</comment>
		</tool>
		<!-- Region files, step 2: adjust the score column of the filtered BED file (binning rule is given in the description). -->
		<tool>
			<name>python3, numpy</name>
			<version>3.2.3, 1.6.2</version>
			<command_line>
				<![CDATA[ python3 adjust_score_column temp_region.bed ]]>
			</command_line>
			<loop>temp_region.bed</loop>
			<comment>Python3 function to adjust score column is implemented as part of the pipeline code and executed for all BED files by default</comment>
		</tool>
		<!-- Region files, step 3: build the bigBed; the value of n in bed6+n depends on the file type (listed in the comment element). -->
		<tool>
			<name>bedToBigBed</name>
			<version>2.6</version>
			<command_line>
				<![CDATA[ bedToBigBed -tab -type=bed6+n -as={field_names} temp_region.bed {chrom_sizes} {DEEPID.PROC.DATE.bigBed} ]]>
			</command_line>
			<loop>temp_region.bed</loop>
			<comment>Create final region files. n==1 for snRNA; n==3 for NOMe and broad peaks; n==4 for narrow peaks and long RNAs</comment>
		</tool>
	</software>
	</process>