From 9094b4eb44120ef7c5df0804feb853b3f6d9006e Mon Sep 17 00:00:00 2001
From: Peter Ebert <pebert@mpi-inf.mpg.de>
Date: Fri, 30 Dec 2016 15:52:40 +0100
Subject: [PATCH] ADD: track hub conversion processes (THB)

---
 docs/misc/THBv1.xml |  81 +++++++++++++++++++++++++
 docs/misc/THBv2.xml | 125 ++++++++++++++++++++++++++++++++++++++
 docs/misc/THBv3.xml | 142 ++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 348 insertions(+)
 create mode 100644 docs/misc/THBv1.xml
 create mode 100644 docs/misc/THBv2.xml
 create mode 100644 docs/misc/THBv3.xml
diff --git a/docs/misc/THBv1.xml b/docs/misc/THBv1.xml
new file mode 100644
index 0000000..213c675
--- /dev/null
+++ b/docs/misc/THBv1.xml
@@ -0,0 +1,81 @@
+<?xml version="1.0"?>
+<?xml-stylesheet type="text/css" href="http://deep.mpi-inf.mpg.de/DAC/files/style/deep_process_style.css"?>
+<process>
+    <name>THB</name>
+	<version>1</version>
+	<author>
+		<name>Peter Ebert</name>
+		<email>pebert@mpi-inf.mpg.de</email>
+	</author>
+	<description>
+		The trackhub_conv.py Python3 script adds the 'chr' prefix to the chromosome names and filters
+		for the chromosomes 1-22 / 1-19 and X,Y for reasons of compatibility of genomic coordinates between assemblies.
+		Note that the script just reads the folder contents and converts every file in the folder that appears
+		to be output of a DEEP process and to be a peak or bigwig file (based on file naming).
+		The converted files are put in the same folder.
+		Important: MACS2 outputs narrowPeak/broadPeak files that are not fully compliant with ENCODE standards:
+		the score column (index 5) has to be in the range 0-1000, so the conversion script rescales these values.
+		Please note that the peak name still refers to the original (unconverted) file.
+		Approximately 1 out of 10 files is chosen at random and checked for consistency by reversing the conversion
+		(except for scaling of the score column in case of peak files) and computing the MD5 checksum,
+		which is then compared to the MD5 checksum of the original file after filtering
+		for the appropriate chromosomes as explained above.
+	</description>
+	<inputs>
+		<filetype>
+			<identifier>CHP_peaks</identifier>
+			<format>narrowPeak</format>
+			<quantity>collection</quantity>
+			<comment>Standard output of MACS2 in ENCODE narrowPeak format</comment>
+		</filetype>
+		<filetype>
+			<identifier>CHP_peaks</identifier>
+			<format>broadPeak</format>
+			<quantity>collection</quantity>
+			<comment>Standard output of MACS2 in ENCODE broadPeak format</comment>
+		</filetype>
+		<filetype>
+			<identifier>DEEP_bigwig</identifier>
+			<format>bigwig</format>
+			<quantity>collection</quantity>
+			<comment>Any bigwig output of a standardized DEEP process</comment>
+		</filetype>
+	</inputs>
+	<references>
+		<filetype>
+			<identifier>chrom_sizes</identifier>
+			<format>table</format>
+			<quantity>single</quantity>
+			<comment>File holding information on chromosome sizes for UCSC assembly (i.e. hg19, mm10)</comment>
+		</filetype>
+		<filetype>
+			<identifier>field_names</identifier>
+			<format>AutoSQL</format>
+			<quantity>collection</quantity>
+			<comment>Field_names is a folder containing files in AutoSQL format necessary for conversion of narrowPeak and broadPeak format into bigbed</comment>
+		</filetype>
+	</references>
+	<outputs>
+		<filetype>
+			<identifier>THB_peaks</identifier>
+			<format>bigbed</format>
+			<quantity>collection</quantity>
+			<comment>Converted peak files</comment>
+		</filetype>
+		<filetype>
+			<identifier>THB_bigwig</identifier>
+			<format>bigwig</format>
+			<quantity>collection</quantity>
+			<comment>Converted bigwig files</comment>
+		</filetype>
+	</outputs>
+	<software>
+		<tool>
+			<name>trackhub_conv.py</name>
+			<version>0.1</version>
+			<command_line><![CDATA[ trackhub_conv.py --folder $PWD --process THBv1 --chrom-table {chrom_sizes} --field-names {field_names} ]]></command_line>
+			<loop>CHP_peaks, DEEP_bigwig</loop>
+			<comment>Simple Python3 script to handle the batch conversion of files</comment>
+		</tool>
+	</software>
+</process>	
diff --git a/docs/misc/THBv2.xml b/docs/misc/THBv2.xml
new file mode 100644
index 0000000..1770803
--- /dev/null
+++ b/docs/misc/THBv2.xml
@@ -0,0 +1,125 @@
+<?xml version="1.0"?>
+<?xml-stylesheet type="text/css" href="http://deep.mpi-inf.mpg.de/DAC/files/style/deep_process_style.css"?>
+<process>
+    <name>THB</name>
+	<version>2</version>
+	<author>
+		<name>Peter Ebert</name>
+		<email>pebert@mpi-inf.mpg.de</email>
+	</author>
+    <description>
+		This process merely describes the conversion - not production - of DEEP data files into an IHEC compatible format.
+        If you have any questions about the actual data, please refer to the process XML files describing
+        the data production and contact the author named in the respective file. The trackhub conversion process
+        describes the conversion of standardized DEEP process output files into one of the BIG formats
+        needed to submit the data as IHEC track hub. Since the reference assemblies used by IHEC are different
+        to the ones used by DEEP, the conversion consists of the following steps:
+        (i) filter data files for chromosomes 1-22 (hsa)/1-19 (mmu) and X,
+        (ii) add &quot;chr&quot; prefix to chromosome names and
+        (iii) for all BED or BED-like files, ensure that these represent a regular BED6+ file; in particular,
+        the &quot;score&quot; column is adjusted by default to be in the range 0-1000 (for details about the
+        formats used, please refer to https://genome.ucsc.edu/FAQ/FAQformat.html).
+        The adjustment works as follows:
+        select one meaningful column (e.g. coverage, signal enrichment or similar), bin the data according
+        to the gray shading schema used by the UCSC genome browser (see link above) and then assign fixed score
+        values according to the binning.
+	</description>
+	<inputs>
+		<filetype>
+			<identifier>DEEP_bigwig</identifier>
+			<format>bigWig</format>
+			<quantity>collection</quantity>
+			<comment>bigWig output of a standardized DEEP process (libraries: histone, DNase, NOMe, WGBS; only raw/unfiltered signal tracks for histone and DNase libs)</comment>
+		</filetype>
+        <filetype>
+            <identifier>DEEP_bed</identifier>
+            <format>BED or BED-like</format>
+            <quantity>collection</quantity>
+            <comment>BED or BED-like output of a standardized DEEP process; comprises histone, DNase and NOMe peaks and expressed small/long RNAs</comment>
+        </filetype>
+    </inputs>
+	<references>
+		<filetype>
+			<identifier>chrom_sizes</identifier>
+			<format>table</format>
+			<quantity>collection</quantity>
+			<comment>Common files containing information about the chromosome sizes for the respective assemblies</comment>
+		</filetype>
+		<filetype>
+			<identifier>field_names</identifier>
+			<format>AutoSQL</format>
+			<quantity>collection</quantity>
+			<comment>AutoSQL files describing the different BED files: narrowPeak, broadPeak, gNOMePeak, snRNAexpr, longRNAexpr</comment>
+		</filetype>
+	</references>
+	<outputs>
+		<filetype>
+			<identifier>DEEPID.PROC.DATE.bigBed</identifier>
+			<format>bigBed</format>
+			<quantity>collection</quantity>
+			<comment>Converted BED or BED-like files</comment>
+		</filetype>
+		<filetype>
+			<identifier>DEEPID.PROC.DATE.bigWig</identifier>
+			<format>bigWig</format>
+			<quantity>collection</quantity>
+			<comment>Converted bigWig files</comment>
+		</filetype>
+	</outputs>
+	<software>
+		<tool>
+			<name>bigWigToBedGraph, egrep, sort, sed</name>
+			<version>4, 2.12, 8.13, 4.2.1</version>
+			<command_line>
+                <![CDATA[ bigWigToBedGraph {DEEP_bigwig} stdout | egrep "^[0-9X]+\s" | sort -V -k 1,2 | sed 's/^/chr/' > temp_signal.bg ]]>
+            </command_line>
+            <loop>DEEP_bigwig</loop>
+			<comment>Filter all signal tracks and add prefix, make sure that output is sorted (should be by construction)</comment>
+		</tool>
+        <tool>
+            <name>bedGraphToBigWig</name>
+			<version>4</version>
+			<command_line>
+                <![CDATA[ bedGraphToBigWig temp_signal.bg {chrom_sizes} {DEEPID.PROC.DATE.bigWig} ]]>
+            </command_line>
+            <loop>temp_signal.bg</loop>
+			<comment>Create final signal tracks</comment>
+        </tool>
+        <tool>
+            <name>egrep, sort, sed</name>
+            <version>2.12, 8.13, 4.2.1</version>
+            <command_line>
+                <![CDATA[ egrep "^[0-9X]+\s" {DEEP_bed} | sort -V -k 1,2 | sed 's/^/chr/' > temp_region.bed ]]>
+            </command_line>
+            <loop>DEEP_bed</loop>
+            <comment>Filter all uncompressed BED files and add prefix, make sure that output is sorted</comment>
+        </tool>
+        <tool>
+            <name>gzip, egrep, sort, sed</name>
+            <version>1.5, 2.12, 8.13, 4.2.1</version>
+            <command_line>
+                <![CDATA[ gunzip -c {DEEP_bed} | egrep "^[0-9X]+\s" | sort -V -k 1,2 | sed 's/^/chr/' > temp_region.bed ]]>
+            </command_line>
+            <loop>DEEP_bed (gzipped)</loop>
+            <comment>Filter all gzipped BED files and add prefix, make sure that output is sorted</comment>
+        </tool>
+        <tool>
+            <name>python3, numpy</name>
+            <version>3.2.3, 1.6.2</version>
+            <command_line>
+                <![CDATA[ python3 adjust_score_column temp_region.bed ]]>
+            </command_line>
+            <loop>temp_region.bed</loop>
+            <comment>Python3 function to adjust score column is implemented as part of the pipeline code and executed for all BED files by default</comment>
+        </tool>
+        <tool>
+            <name>bedToBigBed</name>
+            <version>2.6</version>
+            <command_line>
+                <![CDATA[ bedToBigBed -tab -type=bed6+n -as={field_names} temp_region.bed {chrom_sizes} {DEEPID.PROC.DATE.bigBed} ]]>
+            </command_line>
+            <loop>temp_region.bed</loop>
+            <comment>Create final region files. n==1 for snRNA; n==3 for NOMe and broad peaks; n==4 for narrow peaks and long RNAs</comment>
+        </tool>
+	</software>
+</process>	
diff --git a/docs/misc/THBv3.xml b/docs/misc/THBv3.xml
new file mode 100644
index 0000000..19ccf30
--- /dev/null
+++ b/docs/misc/THBv3.xml
@@ -0,0 +1,142 @@
+<?xml version="1.0"?>
+<?xml-stylesheet type="text/css" href="http://deep.mpi-inf.mpg.de/DAC/files/style/deep_process_style.css"?>
+<process>
+    <name>THB</name>
+	<version>3</version>
+	<author>
+		<name>Peter Ebert</name>
+		<email>pebert@mpi-inf.mpg.de</email>
+	</author>
+    <description>
+        This process merely describes the conversion - not production - of DEEP data files into an IHEC compatible format.
+        If you have any questions about the actual data, please refer to the process XML files describing
+        the data production and contact the author named in the respective file. The trackhub conversion process
+        describes the conversion of standardized DEEP process output files into one of the BIG formats
+        needed to submit the data as IHEC track hub. Since the reference assemblies used by IHEC are different
+        to the ones used by DEEP, the conversion consists of the following steps:
+        (i) filter data files for chromosomes 1-22 (hsa)/1-19 (mmu) and X,
+        (ii) add &quot;chr&quot; prefix to chromosome names and
+        (iii) for all BED or BED-like files, ensure that these represent a regular BED6+ file; in particular,
+        the &quot;score&quot; column is adjusted by default to be in the range 0-1000 (for details about the
+        formats used, please refer to https://genome.ucsc.edu/FAQ/FAQformat.html).
+        The adjustment works as follows:
+        select one meaningful column (e.g. coverage, signal enrichment or similar), bin the data according
+        to the gray shading schema used by the UCSC genome browser (see link above) and then assign fixed score
+        values according to the binning.
+        Version 3 of the THB process also creates a mapping between filename and track property
+        (~ what does this data represent?) as required by the updated IHEC trackhub specification (JSON format).
+	</description>
+	<inputs>
+		<filetype>
+			<identifier>DEEP_signal</identifier>
+			<format>bigWig or bedGraph</format>
+			<quantity>collection</quantity>
+			<comment>bigWig or bedGraph output of a standardized DEEP process (libraries: histone, DNase, NOMe, WGBS; only raw/unfiltered signal tracks for histone and DNase libs)</comment>
+		</filetype>
+        <filetype>
+            <identifier>DEEP_bed</identifier>
+            <format>BED or BED-like</format>
+            <quantity>collection</quantity>
+            <comment>BED or BED-like output of a standardized DEEP process; comprises histone, DNase and NOMe peaks and expressed small/long RNAs</comment>
+        </filetype>
+    </inputs>
+	<references>
+		<filetype>
+			<identifier>chrom_sizes</identifier>
+			<format>table</format>
+			<quantity>collection</quantity>
+			<comment>Common files containing information about the chromosome sizes for the respective assemblies</comment>
+		</filetype>
+		<filetype>
+			<identifier>field_names</identifier>
+			<format>AutoSQL</format>
+			<quantity>collection</quantity>
+			<comment>AutoSQL files describing the different BED files: narrowPeak, broadPeak, gNOMePeak, snRNAexpr, longRNAexpr</comment>
+		</filetype>
+	</references>
+	<outputs>
+		<filetype>
+			<identifier>DEEPID.PROC.DATE.bigBed</identifier>
+			<format>bigBed</format>
+			<quantity>collection</quantity>
+			<comment>Converted BED or BED-like files</comment>
+		</filetype>
+		<filetype>
+			<identifier>DEEPID.PROC.DATE.bigWig</identifier>
+			<format>bigWig</format>
+			<quantity>collection</quantity>
+			<comment>Converted bigWig files</comment>
+		</filetype>
+        <filetype>
+			<identifier>DACID.PROC.DATE.prop.tsv</identifier>
+			<format>tab separated table</format>
+			<quantity>single</quantity>
+			<comment>trackhub property mapping</comment>
+		</filetype>
+	</outputs>
+	<software>
+		<tool>
+			<name>bigWigToBedGraph, egrep, sort, sed</name>
+			<version>4, 2.12, 8.13, 4.2.1</version>
+			<command_line>
+                <![CDATA[ bigWigToBedGraph {DEEP_signal} stdout | egrep "^[0-9X]+\s" | sort -V -k 1,2 | sed 's/^/chr/' > temp_signal.bg ]]>
+            </command_line>
+            <loop>DEEP_signal</loop>
+			<comment>Filter all signal tracks and add prefix, make sure that output is sorted (should be by construction)</comment>
+		</tool>
+        <tool>
+            <name>bedGraphToBigWig</name>
+			<version>4</version>
+			<command_line>
+                <![CDATA[ bedGraphToBigWig temp_signal.bg {chrom_sizes} {DEEPID.PROC.DATE.bigWig} ]]>
+            </command_line>
+            <loop>temp_signal.bg</loop>
+			<comment>Create final signal tracks</comment>
+        </tool>
+        <tool>
+            <name>egrep, sort, sed</name>
+            <version>2.12, 8.13, 4.2.1</version>
+            <command_line>
+                <![CDATA[ egrep "^[0-9X]+\s" {DEEP_bed} | sort -V -k 1,2 | sed 's/^/chr/' > temp_region.bed ]]>
+            </command_line>
+            <loop>DEEP_bed</loop>
+            <comment>Filter all uncompressed BED files and add prefix, make sure that output is sorted</comment>
+        </tool>
+        <tool>
+            <name>gzip, egrep, sort, sed</name>
+            <version>1.5, 2.12, 8.13, 4.2.1</version>
+            <command_line>
+                <![CDATA[ gunzip -c {DEEP_bed} | egrep "^[0-9X]+\s" | sort -V -k 1,2 | sed 's/^/chr/' > temp_region.bed ]]>
+            </command_line>
+            <loop>DEEP_bed (gzipped)</loop>
+            <comment>Filter all gzipped BED files and add prefix, make sure that output is sorted</comment>
+        </tool>
+        <tool>
+            <name>python3, numpy</name>
+            <version>3.2.3, 1.6.2</version>
+            <command_line>
+                <![CDATA[ python3 adjust_score_column temp_region.bed ]]>
+            </command_line>
+            <loop>temp_region.bed</loop>
+            <comment>Python3 function to adjust score column is implemented as part of the pipeline code and executed for all BED files by default</comment>
+        </tool>
+        <tool>
+            <name>bedToBigBed</name>
+            <version>2.6</version>
+            <command_line>
+                <![CDATA[ bedToBigBed -tab -type=bed6+{N} -as={field_names} temp_region.bed {chrom_sizes} {DEEPID.PROC.DATE.bigBed} ]]>
+            </command_line>
+            <loop>temp_region.bed</loop>
+            <comment>Create final region files. N==1 for snRNA; N==3 for NOMe and broad peaks; N==4 for narrow peaks and long RNAs</comment>
+        </tool>
+        <tool>
+            <name>python3</name>
+            <version>3.2.3</version>
+            <command_line>
+                <![CDATA[ python3 write_track_propmap {DACID.PROC.DATE.prop.tsv} ]]>
+            </command_line>
+            <loop>no looping</loop>
+            <comment>Python3 function to write the track property mapping (filename to property) to a text file</comment>
+        </tool>
+	</software>
+</process>