docs/quantification/transcriptome/LXPv1.xml

<?xml version="1.0"?>
<process>
	<name>LXP</name>
	<version>1</version>
	<author>
		<name>Anupam Sinha</name>
		<email>a.sinha@ikmb.uni-kiel.de</email>
	</author>
	<!-- Precise description of what this process does, what output is generated and what statistics are computed -->
	<description>
		* htseq-count: Generates read counts on the gene level.
		* cufflinks: Generates FPKM values for genes and transcript isoforms.
		* StringTie: Generates FPKM values for genes and transcript isoforms. Also generates .ctab files for analysis using Ballgown.
	</description>
	<!-- Following section: list input files [samples to be analysed and similar] -->
	<inputs>
		<!-- One BAM file per sample; the command lines in the software section refer to it as ${_sample}.bam. -->
		<filetype>
			<identifier>.bam</identifier>
			<format>BAM</format>
			<quantity>single</quantity>
			<comment>Unfiltered aligned reads</comment>
		</filetype>
	</inputs>
	<!-- Following section: list reference files [e.g. reference genomes] used in this process -->
	<references>
		<!-- Annotation used by the htseq-count command line. -->
		<filetype>
			<identifier>gencode.v19.annotation.gtf</identifier>
			<format>GTF</format>
			<quantity>single</quantity>
			<comment>Gencode gene annotation file in gene transfer format.</comment>
		</filetype>
		<!-- The cufflinks and StringTie command lines pass this file via -G, so it is listed as a reference too. -->
		<filetype>
			<identifier>gencode.v19.annotation_transcripts_only.gtf</identifier>
			<format>GTF</format>
			<quantity>single</quantity>
			<comment>Annotation file passed to cufflinks and StringTie via -G; presumably the transcript records of gencode.v19.annotation.gtf (to be confirmed).</comment>
		</filetype>
		<!-- Genome sequence named in the cufflinks command line. -->
		<filetype>
			<identifier>reference.fa</identifier>
			<format>multi fasta</format>
			<quantity>single</quantity>
			<comment>The reference genome file; see aspera.dkfz.de > download > results > references > genomes > human > WholeGenome</comment>
		</filetype>
	</references>
	<!-- Following section: list output files of process [e.g. fpkm files, read counts files from htseq etc.] -->
	<outputs>
		<!-- Gene-level read counts. -->
		<filetype>
			<identifier>[sampleID].LXPv1.[DATE].readcounts.txt</identifier>
			<format>text file</format>
			<quantity>single</quantity>
			<comment>This file contains the read counts on the gene level.</comment>
		</filetype>
		<!-- FPKM tables on gene and isoform level. -->
		<filetype>
			<identifier>[sampleID].LXPv1.[DATE].genes.fpkm.tracking</identifier>
			<format>text file</format>
			<quantity>single</quantity>
			<comment>Output file containing the FPKM counts on the gene level.</comment>
		</filetype>
		<filetype>
			<identifier>[sampleID].LXPv1.[DATE].isoforms.fpkm.tracking</identifier>
			<format>text file</format>
			<quantity>single</quantity>
			<comment>Output file containing the FPKM counts on the isoform level.</comment>
		</filetype>
		<filetype>
			<identifier>[sampleID].LXPv1.[DATE].transcripts.gtf</identifier>
			<format>gene transfer format</format>
			<quantity>single</quantity>
			<comment>This file contains assembled transcripts.</comment>
		</filetype>
		<!-- StringTie results: the GTF given to -o and the folder given to -b. -->
		<filetype>
			<identifier>[sampleID].LXPv1.[DATE].stringtie.gtf</identifier>
			<format>gene transfer format</format>
			<quantity>single</quantity>
			<comment>This file contains the assembled transcripts written by StringTie (-o option).</comment>
		</filetype>
		<filetype>
			<identifier>[sampleID].LXPv1.[DATE].ballgown</identifier>
			<format>tab separated fields (.ctab) format</format>
			<quantity>five</quantity>
			<comment>This is a folder containing 5 .ctab files. These .ctab files contain the expression values of exons, introns and transcripts. Two files list the internal (generated by ballgown) association ids between exons, introns, and transcripts.</comment>
		</filetype>
	</outputs>

	<software>
		<tool>
			<!-- Interpreter entry: name and version are recorded; no loop and no comment text. -->
			<name>Python</name>
			<version>2.7</version>
			<!-- NOTE(review): CMDLINE appears to be an unfilled template placeholder rather than a real command; confirm whether a command belongs here. -->
			<command_line><![CDATA[ CMDLINE ]]></command_line>
			<loop>no looping</loop>
			<comment></comment>
		</tool>
		<tool>
			<!-- Version record for samtools. The samtools invocations themselves (sort, view) are listed in the command line of the htseq-count tool entry. -->
			<name>Samtools</name>
			<version>0.1.19-44428cd</version>
			<!-- NOTE(review): CMDLINE appears to be an unfilled template placeholder rather than a real command; confirm whether a command belongs here. -->
			<command_line><![CDATA[ CMDLINE ]]></command_line>
			<loop>no looping</loop>
			<comment></comment>
		</tool>
		<tool>
			<name>htseq-count</name>
			<version>0.6.1p1</version>
			<!-- Three steps: (1) name-sort the input BAM, (2) convert it to SAM while dropping records with flag 256, (3) count reads per gene with htseq-count. -->
			<!-- The commands sit in a CDATA section, like the other tool entries, so the shell redirections are stored verbatim. -->
			<command_line><![CDATA[samtools sort -n -@ 8 -m 4G ${_sample}.bam ${_sample}_sorted
			samtools view -F 256 ${_sample}_sorted.bam > ${_sample}.sam
			htseq-count -s reverse -m union -a 20 ${_sample}.sam gencode.v19.annotation.gtf > ${_sample}_htseq.txt
			]]></command_line>
			<loop>no looping</loop>
			<comment>htseq-count expects bam files sorted by read name by default (step 1). After sorting, all non-primary alignments are removed during the bam to sam conversion. \
			Invoking htseq-count counts the number of reads per gene.  \
			Please see http://www-huber.embl.de/users/anders/HTSeq/doc/count.html#count for further information.
			</comment>
		</tool>
		<tool>
			<!-- Runs cufflinks on ${_sample}.bam with the annotation given via -G, reference.fa as the sequence for fragment bias correction, and 16 threads (-p). -->
			<name>cufflinks</name>
			<version>v2.0.2</version>		
			<!-- A single command spanning two lines; the CDATA section stores it verbatim. -->
			<command_line>
				<![CDATA[
					cufflinks -p 16 --frag-bias-correct reference.fa --multi-read-correct --library-type fr-firststrand
					--compatible-hits-norm -G gencode.v19.annotation_transcripts_only.gtf ${_sample}.bam
				]]>
			</command_line>
			<loop>no looping</loop>
			<comment>Please see http://cufflinks.cbcb.umd.edu/manual.html for further information.</comment>
		</tool>
		<tool>
			<name>StringTie</name>
			<version>v1.0.3</version>
			<!-- Runs stringtie with the annotation given via -G, writes the GTF named by -o and the Ballgown folder named by -b. -->
			<!-- NOTE(review): the recorded command had no input alignment file, which stringtie requires as its positional argument. ${_sample}.bam (the file the cufflinks command also reads) is assumed here; confirm against the pipeline script. -->
			<command_line>
				<![CDATA[
					stringtie ${_sample}.bam -p 16 -e -b ${_sample}.ballgown -o ${_sample}_stringtie.gtf -G gencode.v19.annotation_transcripts_only.gtf
				]]>
			</command_line>
			<loop>no looping</loop>
			<comment>Please see http://ccb.jhu.edu/software/stringtie/ for further information. \
			"-b" option creates a folder which contains the .ctab files for analysis using Ballgown. \
			Please see https://github.com/alyssafrazee/ballgown for further information.
			</comment>
		</tool>

	</software>
</process>