pipeline.nf

#!/usr/bin/env nextflow

//setting default values
	params.bigwig=""
	params.bed=""
	params.genome_fasta=""
	params.motif_db=""
	params.config=""
	params.tfbs_path=""
	params.create_known_tfbs_path = "./"
	params.help = 0
	params.gtf_path=""
	params.out = "./out/"

//peak_calling
	params.window_length = 200
	params.step = 100
	params.percentage = 0

//filter_unknown_motifs
	params.min_size_fp=10
	params.max_size_fp=100

//clustering
  	//reduce_sequence
  	params.kmer=10
  	params.aprox_motif_len=10
  	params.motif_occurence=1
  	params.min_seq_length=10

  	//cdhit_wrapper
  	params.global=0
  	params.identity=0.8
  	params.sequence_coverage=8
  	params.memory=800
 	params.throw_away_seq=9
  	params.strand=0

//motif_estimation
	//bed_to_clustered_fasta
	params.min_seq = 100  // Minimum number of sequences in the fasta-files for glam2

	//glam2
	params.motif_min_key = 8 // Minimum number of key positions (aligned columns)
	params.motif_max_key = 20 // Maximum number of key positions (aligned columns)
	params.iteration = 10000 // Number of Iterations done by glam2. A high iteration number equals a more accurate result but with an higher runtime.

	//tomtom
	params.tomtom_treshold = 0.01 // threshold for similarity score.

	//cluster motifs
	params.cluster_motif = 0 // Boolean if 1 motifs are clustered else they are not
	params.edge_weight = 50 // Minimum weight of edges in motif-cluster-graph
	motif_similarity_thresh = 0.00001 // threshold for motif similarity score

	params.best_motif = 3 // Top n motifs per cluster

//creating_gtf
	params.organism=""
	params.tissue=""

if (params.bigwig == "" || params.bed == "" || params.organism == "" || params.genome_fasta == "" || params.motif_db == "" || params.config == "" || "${params.help}" != "0"){
log.info """
Usage: nextflow run pipeline.nf --bigwig [BigWig-file] --bed [BED-file] --genome_fasta [FASTA-file] --motif_db [MEME-file] --config [UROPA-config-file]

Required arguments:
	--bigwig		 Path to BigWig-file
	--bed			 Path to BED-file
	--genome_fasta		 Path to genome in FASTA-format
	--motif_db		 Path to motif-database in MEME-format
	--config		 Path to UROPA configuration file
 	--organism 		 Input organism [hg38 | hg19 | mm9 | mm10]
	--out			 Output Directory (Default: './out/')

Optional arguments:

	--help [0|1]		1 to show this help message. (Default: 0)
	--tfbs_path 		Path to directory with output from tfbsscan. If given tfbsscan will not be run.
	--create_known_tfbs_path Path to directory where output from tfbsscan (known motifs) are stored.
				 Path can be set as tfbs_path in next run. (Default: './')
	--gtf_path			Path to gtf-file. If path is set the process which creats a gtf-file is skipped.

	Footprint extraction:
	--window_length INT	This parameter sets the length of a sliding window. (Default: 200)
	--step INT		This parameter sets the number of positions to slide the window forward. (Default: 100)
	--percentage INT	Threshold in percent (Default: 0)

	Filter unknown motifs:
	--min_size_fp INT	Minimum sequence length threshold. Smaller sequences are discarded. (Default: 10)
	--max_size_fp INT	Maximum sequence length threshold. Discards all sequences longer than this value. (Default: 100)

	Clustering:
	Sequence preparation/ reduction:
	--kmer INT		Kmer length (Default: 10)
	--aprox_motif_len INT	Motif length (Default: 10)
	--motif_occurence FLOAT	Percentage of motifs over all sequences. Use 1 (Default) to assume every sequence contains a motif.
	--min_seq_length Interations	Remove all sequences below this value. (Default: 10)

	Clustering:
	--global INT		Global (=1) or local (=0) alignment. (Default: 0)
	--identity FLOAT	Identity threshold. (Default: 0.8)
	--sequence_coverage INT	Minimum aligned nucleotides on both sequences. (Default: 8)
	--memory INT		Memory limit in MB. 0 for unlimited. (Default: 800)
	--throw_away_seq INT	Remove all sequences equal or below this length before clustering. (Default: 9)
	--strand INT		Align +/+ & +/- (= 1). Or align only +/+ (= 0). (Default: 0)

	Motif estimation:
	--min_seq INT 		Sets the minimum number of sequences required for the FASTA-files given to GLAM2. (Default: 100)
	--motif_min_key INT	Minimum number of key positions (aligned columns) in the alignment done by GLAM2. (Default: 8)
	--motif_max_key INT	Maximum number of key positions (aligned columns) in the alignment done by GLAM2.f (Default: 20)
	--iteration INT		Number of iterations done by glam2. More Iterations: better results, higher runtime. (Default: 10000)
	--tomtom_treshold float	Threshold for similarity score. (Default: 0.01)
	--best_motif INT	Get the best X motifs per cluster. (Default: 3)

	Moitf clustering:
	--cluster_motif	Boolean	If 1 pipeline clusters motifs. If its 0 it does not. (Defaul: 0)
	--edge_weight INT	Minimum weight of edges in motif-cluster-graph (Default: 5)
	--motif_similarity_thresh FLOAT	Threshold for motif similarity score (Default: 0.00001)

	Creating GTF:
	--tissues List/String 	List of one or more keywords for tissue-/category-activity, categories must be specified as in JSON
				config
All arguments can be set in the configuration files
 ```
"""
System.exit(2)
} else {
	Channel.fromPath(params.bigwig).map {it -> [it.simpleName, it]}.set {bigwig_input}
	Channel.fromPath(params.bed).into {bed_input; bed_for_tfbsscan}
	Channel.fromPath(params.genome_fasta).into {fa_overlap; fa_scan; fa_overlap_2}
	Channel.fromPath(params.motif_db).into {db_for_motivscan; db_for_tomtom}
	Channel.fromPath(params.config).set {config}
}

/*
Checking for parameter input!
*/
int_params = ["window_length", "step", "min_size_fp", "max_size_fp", "kmer",
             "aprox_motif_len", "motif_occurence", "min_seq_length", "global",
             "sequence_coverage", "memory", "throw_away_seq", "strand",
             "min_seq", "motif_min_key", "motif_max_key", "iteration",
             "edge_weight", "best_motif"]
req_params = ["bigwig", "bed", "genome_fasta", "motif_db", "config"]

valid_organism = ["hg38", "hg19", "mm9", "mm10"]

params.each { key, value ->
  if(int_params.contains(key)) {
    if (!("${value}" ==~ /\d+/ )){
      println("ERROR: $key needs to be an Integer")
      System.exit(2)
    }
  }
  if(req_params.contains(key)) {
    if(!file(value).exists()) {
      println("ERROR: $key not found. Please check the given path.")
      System.exit(2)
    }
  }
}
if (!("${params.identity}" ==~ /^0\.[8-9][[0-9]*]?|^1(\.0)?/ )){
  println("ERROR: --identity needs to be float in range 0.8 to 1.0")
  System.exit(2)
}

if (!valid_organism.contains(params.organism)) {
  println("ERROR: Invalid organism! Valid organisms: " + valid_organism)
  System.exit(2)
}
if (!("${params.percentage}" ==~ /\d+/ ) || params.percentage < 0 || params.percentage > 100 ){
  println("ERROR: --percentage needs to be an Integer between 0 and 100")
  System.exit(2)
}

if (!("${params.tomtom_treshold}" ==~ /([0-9]*\.[0-9]+|[0-9]+)/)) {
  println("ERROR: --tomtom_treshold needs to be an Integer or float, e.g. 0.01")
  System.exit(2)
}
if (!("${params.motif_similarity_thresh}" ==~ /([0-9]*\.[0-9]+|[0-9]+)/)) {
  println("ERROR: --motif_similarity_thresh needs to be an Integer or float, e.g. 0.01")
  System.exit(2)
}


path_bin = path_bin?.endsWith('/') ? path_bin.substring( 0, path_bin.length() -1 ) : path_bin

bigwig_input.combine(bed_input).set{footprint_in}
/*
This process uses the uncontinuous score from a bigWig file to estimate footpints within peaks of interest
*/
process footprint_extraction {
	conda "${path_env}"

	tag{name}
	publishDir "${params.out}/1.1_footprint_extraction/", mode: 'copy', pattern: '*.bed'
	publishDir "${params.out}/1.1_footprint_extraction/log", mode: 'copy', pattern: '*.log'

	input:
	set name, file (bigWig), file (bed) from footprint_in

	output:
	set name, file ('*.bed') into bed_for_overlap_with_TFBS

	script:
	"""
	python ${path_bin}/1.1_footprint_extraction/footprints_extraction.py --bigwig ${bigWig} --bed ${bed} --output_file ${name}_called_peaks.bed --window_length ${params.window_length} --step ${params.step} --percentage ${params.percentage}
	"""
}

for_tfbs = fa_overlap.combine(db_for_motivscan).combine(bed_for_tfbsscan)

/*

*/
process extract_known_TFBS {

	conda "${path_env}"

	publishDir "${params.out}/1.2_filter_motifs/TFBSscan/", mode: 'copy', pattern: '*.bed'

	input:
	set file (fasta), file (db), file (bed) from for_tfbs

	output:
	val ('done') into known_TFBS_for_overlap

	when:
	params.tfbs_path == ""

	script:
	"""
	python ${path_bin}/1.2_filter_motifs/tfbsscan.py --use moods --core ${params.threads} -m ${db} -g ${fasta} -o ${params.create_known_tfbs_path} -b ${bed}
	"""
}

/*
Sets path to known tfbs BED-files. If tfbs_path is set the process extract_known_TFBS is skipped, so the paths are differnt.
*/
if(params.tfbs_path == "") {
	bed_for_overlap_with_TFBS.combine(known_TFBS_for_overlap.toList()).combine(fa_overlap_2).set {for_overlap}
	motif_path = params.create_known_tfbs_path
} else {
	Channel.from('skipped').set {workaround}
	bed_for_overlap_with_TFBS.combine(workaround).combine(fa_overlap_2).set {for_overlap}
	tfbs = params.tfbs_path?.endsWith('/') ? params.tfbs_path: params.tfbs_path.substring( 0, params.tfbs_path.length() -1 )
	motif_path = tfbs
}


/*
*/
process overlap_with_known_TFBS {
	conda "${path_env}"
	publishDir "${params.out}/1.2_filter_motifs/compareBed/", mode :'copy'

	input:
	set name, file (bed_footprints), val (bed_motifs), file (fasta) from for_overlap

	output:
	set name, file ("${name}_unknown.bed") into bed_for_reducing
	file ('*.stats')

	script:
	motif_list = bed_motifs.toString().replaceAll(/\s|\[|\]/,"")
	"""
	${path_bin}/1.2_filter_motifs/compareBed.sh --data ${bed_footprints} --motifs ${motif_path} --fasta ${fasta} -o ${name}_unknown.bed -min ${params.min_size_fp} -max ${params.max_size_fp}
	"""
}


/*
Reduce each sequence to its most conserved region.
*/
process reduce_sequence {
	conda "${path_env}"
	echo true
	publishDir "${params.out}/2.1_clustering/reduced_bed/", mode: 'copy'

	input:
	set name, file (bed) from bed_for_reducing

	output:
	set name, file ('*.bed') into bed_for_clustering

	script:
	"""
	Rscript ${path_bin}/2.1_clustering/reduce_sequence.R -i ${bed} -k ${params.kmer} -m ${params.aprox_motif_len} -o ${name}_reduced.bed -t ${params.threads} -f ${params.motif_occurence} -s ${params.min_seq_length}
	"""
}


/*
Cluster all sequences. Appends a column with cluster numbers to the bed-file.
*/
process clustering {
	conda "${path_env}"
	echo true

	publishDir "${params.out}/2.1_clustering/", mode: 'copy', pattern: '*.bed'

	input:
	set name, file (bed) from bed_for_clustering

	output:
	set name, file ('*.bed') into bed_for_motif_esitmation

	script:
	"""
	Rscript ${path_bin}/2.1_clustering/cdhit_wrapper.R -i ${bed} -A ${params.sequence_coverage} -o ${name}_clusterd.bed -c ${params.identity} -G ${params.global} -M ${params.memory} -l ${params.throw_away_seq} -r ${params.strand} -T ${params.threads}
	"""
}


/*
Converting BED-File to one FASTA-File per cluster
*/
process bed_to_clustered_fasta {

	conda "${path_env}"
	publishDir "${params.out}/2.2_motif_estimation/fasta/", mode: 'copy'
    tag{name}

    input:
    set name, file (bed) from bed_for_motif_esitmation

    output:
    file ('*.FASTA') into fasta_for_glam2
    file ('*.FASTA') into fasta_for_motif_cluster

    script:
    """
    Rscript ${path_bin}/2.2_motif_estimation/bed_to_fasta.R -i ${bed} -p ${name} -m ${params.min_seq}
    """
}


//flatten list and adding name of file to channel value
fasta_for_glam2 = fasta_for_glam2.flatten().map {it -> [it.simpleName, it]}
//combine fasta files in one list
fasta_for_motif_cluster = fasta_for_motif_cluster.toList()

/*
Running GLAM2 on FASTA-files.
Generating Motifs through alignment and scoring best local matches.
*/
process glam2 {

    tag{name}
	publishDir "${params.out}/2.2_motif_estimation/glam2/", mode: 'copy'
	conda "${path_env}"

    input:
    set name, file (fasta) from fasta_for_glam2

	output:
    file("${name}/*.meme") into meme_to_merge
	set name, file("${name}/*.meme") into meme_for_tomtom
	set name, file("${name}/*.meme") into meme_to_search_in_merged
	set name, file("${name}/*.meme") into meme_for_filter
	file ('*')

    script:
    """
    glam2 n ${fasta} -O ./${name}/ -a ${params.motif_min_key} -b ${params.motif_max_key} -z 5 -n ${params.iteration}
    """
}

/*
Combining all MEME-files to one big MEME-file.
The paths are sorted numerically depending on the cluster ID.
*/
process merge_meme {

	publishDir "${params.out}/2.2_motif_estimation/cluster_motifs/merged_meme/", mode: 'copy'
	conda "${path_env}"

	input:
	val (memelist) from meme_to_merge.toList()

	output:
	file ('merged_meme.meme') into merged_meme

	when:
	params.cluster_motif == 1

	script:
	//sorting
	memes = memelist.collect{it.toString().replaceAll(/\/glam2.meme/,"") }
	meme_sorted = memes.sort(false) { it.toString().tokenize('_')[-1] as Integer }
	meme_sorted_full = meme_sorted.collect {it.toString() + "/glam2.meme"}
	//create list for bash
	meme_list = meme_sorted_full.toString().replaceAll(/\,|\[|\]/,"")
	"""
	meme2meme ${meme_list} > merged_meme.meme
	"""
}

to_find_similar_motifs = meme_to_search_in_merged.combine(merged_meme)

/*
Running Tomtom on merged meme-files.
Output table has the information which clusters are similar to each other.
*/
process find_similar_motifs {

	tag{name}
	publishDir "${params.out}/2.2_motif_estimation/cluster_motifs/cluster_similarity/", mode: 'copy'
	conda "${path_env}"

	input:
	set name, file (meme) ,file (merged_meme) from to_find_similar_motifs

	output:
	set name, file ("${name}.tsv") into motif_similarity

	when:
	params.cluster_motif == 1

	script:
	"""
	tomtom ${meme} ${merged_meme} -thresh ${params.motif_similarity_thresh} -text --norc | sed '/^#/ d' | sed '/^\$/d' > ${name}.tsv
	"""
}

/*
Label first column of TSV-file with Cluster ID
*/
process label_cluster {

	tag{name}
	conda "${path_env}"

	input:
	set name, file (tsv) from motif_similarity

	output:
	file ("${name}_labeled.tsv") into labeled_tsv

	when:
	params.cluster_motif == 1

	script:
	"""
	Rscript ${path_bin}/2.2_motif_estimation/label_cluster.R -i ${tsv} -o ${name}_labeled.tsv
	"""
}

/*
Merging tsv files_for_merge_fasta
*/
process merge_labeled_tsv {

	publishDir "${params.out}/2.2_motif_estimation/cluster_motifs/", mode: 'copy'

	input:
	val (tsvlist) from labeled_tsv.toSortedList { it.toString().tokenize('_')[-2] as Integer }

	output:
	file ('*.tsv') into merged_labeled_tsv

	when:
	params.cluster_motif == 1

	script:
	tsvs = tsvlist.toString().replaceAll(/\,|\[|\]/,"")
	"""
	echo -e 'Query_ID\tTarget_ID\tOptimal_offset\tp-value\tE-value\tq-value\tOverlap\tQuery_consensus\tTarget_consensus\tOrientation'> merged_labeled.tsv
	for i in $tsvs; do
		cat \$i >> merged_labeled.tsv
	done
	"""
}

files_for_merge_fasta = merged_labeled_tsv.combine(fasta_for_motif_cluster)


/*
Merging FASTA-files of similar clusters
*/
process merge_fasta {

	conda "${path_env}"
	publishDir "${params.out}/2.2_motif_estimation/cluster_motifs/merged_fasta/", mode: 'copy'

	input:
	set file (motiv_sim), val (fasta_list) from files_for_merge_fasta

	output:
	file ('*.fasta') into motif_clustered_fasta_list
  	file('*.png')

  	when:
  	params.cluster_motif == 1

	script:
  	fa_sorted = fasta_list.sort(false) { it.getBaseName().tokenize('_')[-1] as Integer }
  	fastalist = fa_sorted.toString().replaceAll(/\s|\[|\]/,"")
	"""
	Rscript ${path_bin}/2.2_motif_estimation/merge_similar_clusters.R ${motiv_sim} ${fastalist} ${params.edge_weight}
	"""
}


motif_clustered_fasta_flat = motif_clustered_fasta_list.flatten()

/*
Run GLAM2 on emrged FASTA-files
*/
process clustered_glam2 {

	publishDir "${params.out}/2.2_motif_estimation/cluster_motifs/glam2/${name}/", mode: 'copy'
	tag{name}
	conda "${path_env}"

	input:
	file (fasta) from motif_clustered_fasta_flat

	output:
	set name, file ('*.meme') into clustered_meme_for_tomtom
  	set name, file ('*.meme') into clustered_meme_for_filter
  	file('*')

	when:
	params.cluster_motif == 1

	script:
  	name = fasta.getBaseName()
	"""
	glam2 n ${fasta} -O . -a ${params.motif_min_key} -b ${params.motif_max_key} -z 5 -n ${params.iteration}
	"""
}

/*
Forward files depending on set parameter
If motif clustering is activated or not.
*/
if(params.cluster_motif == 1){
	for_tomtom = clustered_meme_for_tomtom
	for_filter = clustered_meme_for_filter
} else {
	for_tomtom = meme_for_tomtom
	for_filter = meme_for_filter
}


/*
Running Tomtom on meme-files generated by GLAM2.
Tomtom searches motifs in databases.
*/
process tomtom {

    tag{name}
	publishDir "${params.out}/2.2_motif_estimation/tomtom/", mode: 'copy'
	conda "${path_env}"

    input:
    set name, file (meme) from for_tomtom

    output:
    set name, file ('*.tsv') into tsv_for_filter

    script:
    """
    tomtom ${meme} ${params.motif_db} -thresh ${params.tomtom_treshold} -mi 1 -text | sed '/^#/ d' | sed '/^\$/d' > ${name}_known_motif.tsv
    """
}


//Joining channels with meme and tsv files. Filter joined channel on line count.
//Only meme-files which corresponding tsv files have linecount <= 1 are writen to next channel.
for_filter2 = for_filter.join( tsv_for_filter )
for_filter2
    .filter { name, meme, tsv ->
        long count = tsv.readLines().size()
        count <= 1
        }
    .into { meme_for_scan; check }


//If channel 'check' is empty print errormessage
process check_for_unknown_motifs {
    echo true

    input:
    val x from check.ifEmpty('EMPTY')

    when:
    x == 'EMPTY'

    """
    echo '>>> STOPPED: No unknown Motifs were found.'
    """

}


//Get the best(first) Motif from each MEME-file
process get_best_motif {

	conda "${path_env}"
	tag{name}
	publishDir "${params.out}/2.2_motif_estimation/best_unknown_motifs/", mode: 'copy'

	input:
	set name, file(meme), file(tsv) from meme_for_scan

	output:
	set name, file('*_best.meme') into best_motif

	script:
	"""
	python ${path_bin}/2.2_motif_estimation/get_best_motif.py ${meme} ${name}_best.meme ${params.best_motif}
	"""
}


best_motif.combine(fa_scan).set {files_for_genome_scan}

/*
process genome_scan {
	conda "${path_env}"

	input:
	set name, file(meme), file(fasta) from files_for_genome_scan

	output:
	file ('.bed') into bed_for_uropa, bed_for_cluster_quality

	script:
	"""
	"""
}


process cluster_quality {

	input:
	file (bed) from bed_for_cluster_quality

	output:
	file ('*.bed') into bed_for_final_filter

	script:
	"""
	"""
} */


process create_GTF {
	conda "${path_env}"

	publishDir "${params.out}/3.1_create_gtf/", mode: 'copy'

	output:
	file ('*.gtf') into gtf

	when:
	params.gtf_path == ""

	script:
	"""
	python ${path_bin}/3.1_create_gtf/RegGTFExtractor.py ${params.organism} --tissue ${params.tissues} --wd ${path_bin}/3.1_create_gtf
	"""
}

if (params.gtf_path == "") {
	gtf_for_uropa = gtf
} else {
	gtf_for_uropa = Channel.fromPath(params.gtf_path)
}

/*
bed_for_final_filter.combine(gtf_for_uropa).set {uropa_in}


// Create configuration file for UROPA.
// Takes template and replaces bed- and gtf-placeholders with actual paths.
process create_uropa_config {

	publishDir '/mnt/agnerds/Rene.Wiegandt/10_Master/', mode: 'copy'

	input:
	set val(bed), val(gtf) from uropa_in.toList()
	file (conf) from config

	output:
	file ('uropa.config') into uropa_config

	script:
	"""
  	sed -- 's/placeholder_gtf/${gtf}/g; s/placeholder_bed/${bed}/g' ${conf} > uropa.config.final
	"""
}


process UROPA {

	input:
	file (config) from uropa_config

	output:
	set file ("*_allhits.txt"), file ("*_finalhits.txt") into uropa_for_filter

	script:
	"""
	"""
}


process filter {

	input:

	output:

	script:
	"""
	"""
} */

workflow.onComplete {
log.info"""
Pipeline execution summary
---------------------------
Completed at: ${workflow.complete}
Duration    : ${workflow.duration}
Success     : ${workflow.success}
workDir     : ${workflow.workDir}
exit status : ${workflow.exitStatus}
Error report: ${workflow.errorReport ?: '-'}
"""
}