pipeline.nf

//!/usr/bin/env nextflow

// ---------------------------------------------------------------------------
// Parameter defaults
//
// Defaults are declared BEFORE anything reads `params.*`: a parameter that is
// read before its default is assigned evaluates to null instead of "".
// Values given on the command line or in a configuration file take precedence
// over the assignments below.
//
// NOTE(review): `params.threads` is read by several processes in this file but
// no default is declared here - confirm it is provided by the configuration.
// ---------------------------------------------------------------------------

//setting default values
params.input=""
params.bed=""
params.genome_fasta=""
params.jaspar_db=""
params.config=""

//peak_calling
	params.window_length = 200
	params.step = 100
	params.percentage = 0

//filter_unknown_motifs
	params.min_size_fp=10
	params.max_size_fp=100

//clustering
  //reduce_bed
  params.kmer=10
  params.aprox_motif_len=10
  params.motif_occurence=1
  params.min_seq_length=10

  //cdhit_wrapper
  params.global=0
  params.identity=0.8
  params.sequence_coverage=8
  params.memory=800
  params.throw_away_seq=9
  params.strand=0

//motif_estimation
	//bed_to_clustered_fasta
	params.min_seq = 10  // Minimum number of sequences in the fasta-files for glam2

	//glam2
	params.motif_min_len = 8 // Minimum length of Motifs
	params.motif_max_len = 20 // Maximum length of Motifs
	params.interation = 10000 // Number of Iterations done by glam2. A high iteration number equals a more accurate result but with an higher runtime.

	//tomtom
	params.tomtom_treshold = 0.01 // threshold for similarity score.

//creating_gtf
	params.organism="homo_sapiens"
	// NOTE(review): the create_GTF process and the usage text use the spelling
	// `tissues`, while the default is declared as `tissue` - confirm which one is intended.
	params.tissue=""


// ---------------------------------------------------------------------------
// Validation of required arguments
//
// Runs before any channel is created so that a missing argument produces the
// usage text, and the run is aborted instead of continuing without inputs.
// ---------------------------------------------------------------------------
if (params.input == "" || params.bed == "" || params.genome_fasta == "" || params.jaspar_db == "" || params.config == ""){
log.info """
Usage: nextflow run pipeline.nf --input [BigWig-file] --bed [BED-file] --genome_fasta [FASTA-file] --jaspar_db [MEME-file] --config [UROPA-config-file]

Required arguments:
	--input Path to BigWig-file
	--bed Path to BED-file
	--genome_fasta Path to genome in FASTA-format
	--jaspar_db Path to motif-database in MEME-format
	--config Path to UROPA configuration template


Optional arguments:
	Footprint extraction:
	--window_length INT (Default: 200)
	--step INT (Default: 100)
	--percentage INT(Default: 0)

	Filter unknown motifs:
	--min_size_fp INT (Default: 10)
	--max_size_fp INT (Default: 100)

	Clustering:
	  Sequence preparation/ reduction:
	  --kmer INT Kmer length (Default: 10)
	  --aprox_motif_len INT Motif length (Default: 10)
	  --motif_occurence FLOAT Percentage of motifs over all sequences. Use 1 (Default) to assume every sequence contains a motif.
	  --min_seq_length INT Remove all sequences below this value. (Default: 10)

	  Clustering:
	  --global INT Global (=1) or local (=0) alignment. (Default: 0)
	  --identity FLOAT Identity threshold. (Default: 0.8)
	  --sequence_coverage INT Minimum aligned nucleotides on both sequences. (Default: 8)
	  --memory INT Memory limit in MB. 0 for unlimited. (Default: 800)
	  --throw_away_seq INT Remove all sequences equal or below this length before clustering. (Default: 9)
	  --strand INT Align +/+ & +/- (= 1). Or align only +/+ (= 0). (Default: 0)

	Motif estimation:
	--motif_min_len INT	Minimum length of Motif (Default: 8)
	--motif_max_len INT	Maximum length of Motif (Default: 20)
	--interation INT	Number of iterations done by glam2. More Interations: better results, higher runtime. (Default: 10000)
	--tomtom_treshold float	Threshold for similarity score. (Default: 0.01)

	Creating GTF:
	--organism [homo_sapiens | mus_musculus]
	--tissues

All arguments can be set in the configuration files.
"""
// A required argument is missing: stop here with a non-zero exit status.
exit 1
}


// ---------------------------------------------------------------------------
// Input channels (created only after the required arguments were validated)
// ---------------------------------------------------------------------------

// Each BigWig file is emitted as [simpleName, file]; the name is used as sample key downstream.
Channel.fromPath(params.input).map {it -> [it.simpleName, it]}.set {bigwig_input}
Channel.fromPath(params.bed).set {bed_input}
// The genome FASTA and the motif database are consumed by several processes,
// therefore they are split into one channel per consumer.
Channel.fromPath(params.genome_fasta).into {fa_overlap; fa_scan; fa_overlap_2}
Channel.fromPath(params.jaspar_db).into {db_for_motivscan; db_for_tomtom}
Channel.fromPath(params.config).set {config}


// Pair every [name, bigWig] item with every BED file -> [name, bigWig, bed].
bigwig_input
	.combine(bed_input)
	.set { footprint_in }
/*
Runs call_peaks.py on every [name, bigWig, bed] item of `footprint_in`.
The script is asked to write `${name}_called_peaks.bed`; window length, step
and percentage are taken from the pipeline parameters.
Emits [name, BED file(s)] into `bed_for_overlap_with_TFBS`.
NOTE(review): `path_env`, `path_bin` and `out` are not defined in this file -
presumably they come from the configuration; confirm.
*/
process footprint_extraction {
	conda "${path_env}"
	tag { name }

	// BED results go to the output directory, log files to its `log` sub-directory.
	publishDir "${out}", mode: 'copy', pattern: '*.bed'
	publishDir "${out}/log", mode: 'copy', pattern: '*.log'

	input:
	set val(name), file(bigWig), file(bed) from footprint_in

	output:
	set val(name), file('*.bed') into bed_for_overlap_with_TFBS

	script:
	"""
	python ${path_bin}/call_peaks.py --bigwig ${bigWig} --bed ${bed} --output_file ${name}_called_peaks.bed --window_length ${params.window_length} --step ${params.step} --percentage ${params.percentage}
	"""
}

//Check whether this process has to be executed.
/*
Runs tfbsscan.py (in `moods` mode) with the motif database and the genome FASTA,
writing its results into the task directory.
All produced BED files are emitted into `known_TFBS_for_overlap`.
NOTE(review): `params.threads` has no default in this file - confirm it is configured.
*/
process extract_known_TFBS {
	conda "${path_env}"

	input:
	file(fasta) from fa_overlap
	file(db) from db_for_motivscan

	output:
	file('*.bed') into known_TFBS_for_overlap

	script:
	"""
	python ${path_bin}/tfbsscan.py --use moods --core ${params.threads} -m ${db} -g ${fasta} -o ./
	"""
}


// Attach the known-TFBS BED output and the genome FASTA to every [name, footprint BED] item.
// NOTE(review): the consumer expects exactly four elements per item - confirm how
// `combine` treats a multi-file output of extract_known_TFBS.
bed_for_overlap_with_TFBS
	.combine(known_TFBS_for_overlap)
	.combine(fa_overlap_2)
	.set { for_overlap }


/*
Runs compareBed.sh on the footprint BED of each sample together with the
known-motif BED file(s) and the genome FASTA.
The size limits come from `params.min_size_fp` / `params.max_size_fp`.
Emits [name, BED file(s)] into `bed_for_reducing`.
*/
process overlap_with_known_TFBS {
	conda "${path_env}"

	input:
	set name, file (bed_footprints), val (bed_motifs), file (fasta) from for_overlap

	output:
	set name, file ('*.bed') into bed_for_reducing

	script:
	// Turn the string form of the motif value (e.g. "[a, b]") into "a,b" by
	// removing whitespace and square brackets.
	// `def` keeps the variable local to the task; without it the variable is
	// script-global and shared between tasks that run in parallel.
	def motif_list = bed_motifs.toString().replaceAll(/\s|\[|\]/,"")
	"""
	${path_bin}/compareBed.sh --data ${bed_footprints} --motifs ${motif_list} --fasta ${fasta} -o ${name}.bed -min ${params.min_size_fp} -max ${params.max_size_fp}
	"""
}


/*
Runs reduce_bed.R on the BED file of each sample and asks it to write
`${name}_reduced.bed`. K-mer length, approximate motif length, motif occurrence
and minimum sequence length are taken from the pipeline parameters.
Emits [name, BED file(s)] into `bed_for_clustering`.
NOTE(review): `params.threads` has no default in this file - confirm it is configured.
*/
process reduce_bed {
	conda "${path_env}"

	input:
	set val(name), file(bed) from bed_for_reducing

	output:
	set val(name), file('*.bed') into bed_for_clustering

	script:
	"""
	Rscript ${path_bin}/reduce_bed.R -i ${bed} -k ${params.kmer} -m ${params.aprox_motif_len} -o ${name}_reduced.bed -t ${params.threads} -f ${params.motif_occurence} -s ${params.min_seq_length}
	"""
}


/*
Runs cdhit_wrapper.R on the reduced BED file of each sample and asks it to
write `${name}_clusterd.bed`. All clustering options (coverage, identity,
global/local, memory, minimum length, strand) come from the pipeline parameters.
Emits [name, BED file(s)] into `bed_for_motif_esitmation`.
NOTE(review): `params.threads` has no default in this file - confirm it is configured.
*/
process clustering {
	conda "${path_env}"

	input:
	set val(name), file(bed) from bed_for_clustering

	output:
	set val(name), file('*.bed') into bed_for_motif_esitmation

	script:
	"""
	Rscript ${path_bin}/cdhit_wrapper.R -i ${bed} -A ${params.sequence_coverage} -o ${name}_clusterd.bed -c ${params.identity} -G ${params.global} -M ${params.memory} -l ${params.throw_away_seq} -r ${params.strand} -T ${params.threads}
	"""
}


/*
Converts the clustered BED file of each sample into FASTA files
(one FASTA file per cluster) by calling bed_to_fasta.R with the BED file,
the sample name and `params.min_seq`.
All produced `*.FASTA` files are emitted into `fasta_for_glam2`.
*/
process bed_to_clustered_fasta {
    conda "${path_env}"
    tag { name }

    // NOTE(review): hard-coded, user-specific publish directory.
    publishDir '/mnt/agnerds/Rene.Wiegandt/10_Master/tmp/', mode: 'copy'

    input:
    set val(name), file(bed) from bed_for_motif_esitmation

    output:
    file('*.FASTA') into fasta_for_glam2

    script:
    """
    Rscript ${path_bin}/bed_to_fasta.R ${bed} ${name} ${params.min_seq}
    """
}


// Emit every FASTA file as its own item and key it by its simple file name:
// [f1, f2, ...] -> [f1.simpleName, f1], [f2.simpleName, f2], ...
fasta_for_glam2 = fasta_for_glam2
    .flatten()
    .map { fasta -> [fasta.simpleName, fasta] }


/*
Runs GLAM2 on each FASTA file.
GLAM2 generates motifs through alignment and scoring of the best local matches.
Motif length bounds and the iteration count come from the pipeline parameters;
results are written into the task directory.
Emits [name, MEME file(s)] into both `meme_for_tomtom` and `meme_for_filter`.
*/
process glam2 {
    conda "${path_env}"
    tag { name }

    input:
    set val(name), file(fasta) from fasta_for_glam2

    output:
    set val(name), file('*.meme') into meme_for_tomtom, meme_for_filter

    script:
    """
    glam2 n ${fasta} -O . -a ${params.motif_min_len} -b ${params.motif_max_len} -z 5 -n ${params.interation}
    """
}

/*
Runs Tomtom on the MEME files generated by GLAM2.
Tomtom searches the motifs in the given motif database.
Comment lines (starting with '#') and empty lines are removed from the text
output before it is stored as `${name}_known_motif.tsv`.
Emits [name, TSV file(s)] into `tsv_for_filter`.
*/
process tomtom {
    conda "${path_env}"
    tag { name }

    // NOTE(review): hard-coded, user-specific publish directory.
    publishDir '/mnt/agnerds/Rene.Wiegandt/10_Master/tmp/', mode: 'copy'

    input:
    // every [name, meme] item is paired with the motif database
    set val(name), file(meme), file(jaspar_db) from meme_for_tomtom.combine(db_for_tomtom)

    output:
    set val(name), file('*.tsv') into tsv_for_filter

    script:
    """
    tomtom ${meme} ${jaspar_db} -thresh ${params.tomtom_treshold} -text --norc | sed '/^#/ d' | sed '/^\$/d' > ${name}_known_motif.tsv
    """
}


// Join the MEME and TSV channels on their shared name -> [name, meme, tsv].
for_filter = meme_for_filter.join(tsv_for_filter)

// Keep only items whose TSV file has at most one line; these are forwarded
// to both `meme_for_scan` and `check`.
for_filter
    .filter { id, motif_file, hits -> hits.readLines().size() <= 1 }
    .into { meme_for_scan; check }


// Prints a message when the channel `check` is empty.
// An empty channel is replaced by the single value 'EMPTY'; the task only runs for that value.
process check_for_unknown_motifs {
    echo true

    input:
    val status from check.ifEmpty('EMPTY')

    when:
    status == 'EMPTY'

    script:
    """
    echo '>>> STOPPED: No unknown Motifs were found.'
    """
}


// Extract the best (first) motif from each MEME file by calling get_best_motif.py,
// which is asked to write `${name}_best.meme`.
// Emits [name, *_best.meme] into `best_motif`; the TSV file of the item is staged but not used by the command.
process get_best_motif {
	conda "${path_env}"

	input:
	set val(name), file(meme), file(tsv) from meme_for_scan

	output:
	set val(name), file('*_best.meme') into best_motif

	script:
	"""
	python ${path_bin}/get_best_motif.py ${meme} ${name}_best.meme
	"""
}


// Pair every [name, best motif] item with the genome FASTA -> [name, meme, fasta].
best_motif
	.combine(fa_scan)
	.set { files_for_genome_scan }

/*
process genome_scan {
	conda "${path_env}"

	input:
	set name, file(meme), file(fasta) from files_for_genome_scan

	output:
	file ('.bed') into bed_for_uropa, bed_for_cluster_quality

	script:
	"""
	"""
}


process cluster_quality {

	input:
	file (bed) from bed_for_cluster_quality

	output:
	file ('*.bed') into bed_for_final_filter

	script:
	"""
	"""
} */


// Runs RegGTFExtractor.py for the configured organism and emits the produced
// GTF file(s) into `gtf_for_uropa`.
// The `--tissue` option is only passed when a tissue has been configured.
process create_GTF {
	conda "${path_env}"

	// NOTE(review): 'Path' looks like a placeholder publish directory - confirm.
	publishDir 'Path', mode:'copy'

	output:
	file ('*.gtf') into gtf_for_uropa

	script:
	// The usage text documents `--tissues` while the default is declared as
	// `params.tissue`; accept either spelling, preferring `tissues`.
	def tissue = params.tissues ?: params.tissue
	// Omit the flag when no tissue is set, instead of passing a literal
	// "null" or a flag without a value to the script.
	def tissue_arg = tissue ? "--tissue ${tissue}" : ""
	"""
	python ${path_bin}/RegGTFExtractor.py ${params.organism} ${tissue_arg} --wd ${path_bin}
	"""
}

/*
bed_for_final_filter.combine(gtf_for_uropa).set {uropa_in}


// Create configuration file for UROPA.
// Takes template and replaces bed- and gtf-placeholders with actual paths.
process create_uropa_config {

	publishDir '/mnt/agnerds/Rene.Wiegandt/10_Master/', mode: 'copy'

	input:
	set val(bed), val(gtf) from uropa_in.toList()
	file (conf) from config

	output:
	file ('uropa.config') into uropa_config

	script:
	"""
  	sed -- 's/placeholder_gtf/${gtf}/g; s/placeholder_bed/${bed}/g' ${conf} > uropa.config.final
	"""
}


process UROPA {

	input:
	file (config) from uropa_config

	output:
	set file ("*_allhits.txt"), file ("*_finalhits.txt") into uropa_for_filter

	script:
	"""
	"""
}


process filter {

	input:

	output:

	script:
	"""
	"""
} */