diff --git a/README.md b/README.md index 3df7827..ce23475 100644 --- a/README.md +++ b/README.md @@ -16,7 +16,7 @@ T2C Analysis: * raw/normalized * bed file with all interactions inside the target region * with and without uropa annotation -* easy to read viewpoint format for annotated interactions (see Results for explanation) +* easy to read viewpoint format for annotated interactions (see Results for explanation) * restriction maps for different runs on the genome HiC Analysis: @@ -31,7 +31,7 @@ Note: Green shows a value higher than the scale. ## Installation and Command-line usage ### Dependencies * Linux -* Nextflow version 0.27.6.4775 +* Nextflow version 0.30 * R version version 3.4.4 * ggplot2 * plyr @@ -45,12 +45,15 @@ Note: Green shows a value higher than the scale. * Python version 2.7.8 * pysam * getopt -* bowtie2 version 2.3.3.1 -* bwa version 0.7 -* SAMtools version 1.3.1 -* BEDtools version 2.27 -* uropa version 2.0.2 alpha * HiCExplorer version 2.1 +* Conda: + * bowtie2 version 2.3.3.1* + * bwa version 0.7* + * SAMtools version 1.3.1* + * BEDtools version 2.27* + * uropa version 2.0.2 alpha* + +``* Will be implemented automatically through conda enviroment.`` ### Installation To install Nextflow, follow the instructions on the offical Nextflow [website](https://www.nextflow.io/). @@ -148,7 +151,3 @@ nextflow run TOuCAN.nf --mode T2C --in /example/fastq/ --out /out/ --chr chr1 -- ## Results For a detailed explanation of all Results, follow this [link](https://github.molgen.mpg.de/loosolab/TOuCAN/wiki/TOuCAN-Results). - - - - diff --git a/TOuCAN.nf b/TOuCAN.nf index 4b294f7..a2a346b 100644 --- a/TOuCAN.nf +++ b/TOuCAN.nf @@ -9,6 +9,7 @@ params.safe_all_files = 0 params.check_res_maps = 0 params.organism = "mm10" params.pn = "Project" +params.reassin = 0 // parameter for T2C plots params.chr = "" @@ -103,10 +104,12 @@ Usage: nextflow run TOuCAN.nf --in [Input Path] --out [Output Path] --mode [Modi --score_min [INT] - Score range: minimum. [default: 0] --score_max [INT] - Score range: maximium. [default: autoscale] --pn [STRING] - Name of the Project [default: 'Project'] + --organsim [mm10,mm9,hg19] - Type of the genome --mode HiC - Full HiC analysis parameters: --in [PATH] - Path to directory with fastq / fastq.gz files. + --bam [PATH] - Path to directory with bam files. Two bam files (forward + reversed) are required per sample. [if given --in [PATH] is ignored] --out [PATH] - Path to output directory. --aln [bwa|bowtie2] - Choose alignment tool. [default: bwa] --bin [INT] - Binsize [default: 10000] @@ -194,8 +197,10 @@ multiplot_bed_1 = Channel.empty() matrix_to_plot = Channel.empty() res_map = Channel.empty() bam_f = Channel.empty() +bam_f2 = Channel.empty() interaction_uropa = Channel.empty() fastqGzFiles = Channel.empty() +hic_matrix_f = Channel.empty() if("${params.path_matrix}" != "" && params.mode == "plot"){ matrix_to_plot = Channel @@ -203,7 +208,6 @@ if("${params.path_matrix}" != "" && params.mode == "plot"){ .map {it -> [it.simpleName, it]} } - if(params.mode == "uropa"){ interaction_uropa = Channel .fromPath("${params.in}/*.normalized.bed") @@ -235,18 +239,23 @@ if (params.mode != "plot"){ } if ( "${params.bam}" != "" ){ - bam_f = Channel + Channel .fromPath("${params.bam}/*.bam") .map {it -> [it.simpleName, it]} + .into {bam_f; bam_f2} create_bam = false - } else { + } else if ("${params.mode}" == "HiC" & "${params.mode}" == "h5") { + hic_matrix_f = Channel + .fromPath("${params.in}/*.h5") + .map {it -> [it.simpleName, it]} + }else { create_bam = true fastqGzFiles = Channel.fromPath("${params.in}/*.fastq*") } } -modeList = ["T2C", "HiC", "help", "h", "plot", "uropa", "multiplot"] -alnList = ["bowtie2", "bwa", "histat"] +modeList = ["T2C", "HiC", "help", "h", "plot", "uropa", "multiplot", "h5"] +alnList = ["bowtie2", "bwa"] organismList = ["mm10", "mm9", "hg19"] @@ -347,45 +356,6 @@ process check_parameters { """ } -/*Checks if given Output directory exsists. -*If it exists it will be deleted or die Pipeline stops. (HiC Mode only) -*/ -process check_output_directory { - - when: - params.mode == 'not_used' - - script: - valIn = false - if(outfile.exists()){ - while(!valIn){ - answer = System.console().readLine 'Output directory "' + outfile + '" already exists: delete directory? (Y/n) \n' - if(answer == 'n'){ - println("WARNING: It is required to delete given directory or choose a differnt directory!\nExiting...") - System.exit(0) - } else if (answer == 'y' | answer == '') { - valIn = true - println("Deleting directory...") - } else { - println("Error: Invalid input!") - } - } - } - - if(valIn){ - """ - if [ -d $outpath ]; then - rm -r ${outpath} - fi - echo 'Check of output directory done' - """ - } else { - """ - echo 'Check of output directory done' - """ - } -} - /* * T2C Step 1 *Creates restriction maps needed for T2C analysis @@ -398,7 +368,7 @@ process create_restriction_maps_for_T2C { file ("*.bed") into restriction_maps when: - params.mode == 'T2C' + params.mode == 'T2C' || params.mode == 'multiplot' create_res_map == true script: @@ -432,7 +402,7 @@ process create_restriction_maps_for_T2C { """ } -res_map.concat(restriction_maps).into {final_res_map_for_check; final_res_map; final_res_map_for_intersection} +res_map.concat(restriction_maps).into {final_res_map_for_check; final_res_map; final_res_map_for_intersection; res_map_for_ifm} process check_restriction_maps { @@ -870,6 +840,7 @@ process finish_raw_matrix { output: //set name_raw, file ("${name}.matrix.bed") into raw_matrix_bed_for_plots set name, file ("${name}.matrix.bed") into raw_matrix_bed_to_reassign + set name, file ("${name}.matrix.bed") into raw_matrix_for_norm when: params.mode == "T2C" @@ -904,6 +875,7 @@ process split_matrix_for_re_evaluation { when: params.mode == "T2C" + params.reassin == 1 script: """ @@ -920,6 +892,7 @@ process split_matrix_for_re_evaluation { process intersect_beds_to_fragments { tag{name} + conda "$workflow.projectDir/envtoucan.yaml" if(params.safe_all_files == 1){ publishDir "${outpath}/${params.pn}_additional_files/bed/", mode: 'copy' @@ -933,11 +906,12 @@ process intersect_beds_to_fragments { when: params.mode == "T2C" + params.reassin == 1 script: """ - sed 's/[[:blank:]]*\$\$//p' ${a_bed} | ${path_bedtools}/bedtools intersect -a ${a_bed} -b ${res_frag} -wao > ${name}.frga.bed - sed 's/[[:blank:]]*\$\$//p' ${b_bed} | ${path_bedtools}/bedtools intersect -a ${b_bed} -b ${res_frag} -wao > ${name}.frgb.bed + sed 's/[[:blank:]]*\$\$//p' ${a_bed} | bedtools intersect -a ${a_bed} -b ${res_frag} -wao > ${name}.frga.bed + sed 's/[[:blank:]]*\$\$//p' ${b_bed} | bedtools intersect -a ${b_bed} -b ${res_frag} -wao > ${name}.frgb.bed """ } @@ -961,6 +935,7 @@ process reassign_matrix { when: params.mode == "T2C" + params.reassin == 1 script: """ @@ -990,6 +965,7 @@ process finalize_matrix { when: params.mode == "T2C" + params.reassin == 1 script: name_raw = "${name}_raw" @@ -1000,6 +976,11 @@ process finalize_matrix { """ } +if (params.reassin == 1){ + matrix = reassigned_matrix +} else { + matrix = raw_matrix_for_norm +} /* * T2C step 17 * Normalize Matrix @@ -1011,7 +992,7 @@ process normalize_matrix { publishDir "${outpath}/${params.pn}_Interactions/02_all_interactions_bed_norm/", mode: 'copy' input: - set name, file (matrix) from reassigned_matrix + set name, file (matrix) from matrix output: set name, file ("${name}.normalized.bed") into normalized_matrix, multiplot_bed_2, filtered_interaction_bed_for_multiplot_2 @@ -1157,8 +1138,8 @@ process create_uropa_config_1 { \\"feature\\" : ${conf_list_final[0]}, \ \\"feature.anchor\\" : ${conf_list_final[1]}, \ \\"distance\\" : [ \ - ${uropa_dist_1}, \ - ${uropa_dist_2} \ + ${params.uropa_dist_1}, \ + ${params.uropa_dist_2} \ ], \ \\"strand\\" : ${conf_list_final[2]}, \ \\"direction\\" : ${conf_list_final[3]}, \ @@ -1255,8 +1236,8 @@ process create_uropa_config_2 { \\"feature\\" : ${conf_list_final[0]}, \ \\"feature.anchor\\" : ${conf_list_final[1]}, \ \\"distance\\" : [ \ - ${uropa_dist_1}, \ - ${uropa_dist_2} \ + ${params.uropa_dist_1}, \ + ${params.uropa_dist_2} \ ], \ \\"strand\\" : ${conf_list_final[2]}, \ \\"direction\\" : ${conf_list_final[3]}, \ @@ -1378,13 +1359,12 @@ process create_viewpoint { process convert_to_IFMatrix { tag{name} + echo true - if(params.safe_all_files == 1){ - publishDir "${outpath}/${params.pn}_additional_files/IFMatrix/", mode: 'copy' - } + publishDir "${outpath}/${params.pn}_additional_files/IFMatrix/", mode: 'copy' input: - set name, file (int_bed) from to_ifmatrix + set name, file (int_bed), file (res) from to_ifmatrix.combine(res_map_for_ifm.filter{ it.simpleName == "restriction_targets"}) output: set name, file ("${name}.ifm.txt") into ifmatrix @@ -1394,7 +1374,7 @@ process convert_to_IFMatrix { script: """ - ${path_R}/Rscript ${path_bin}/bin/convert_to_IFMatrix.R ${int_bed} ${name}.ifm.txt + ${path_R}/Rscript ${path_bin}/bin/convert_to_IFMatrix.R ${int_bed} ${res} ${name}.ifm.txt ${params.chr} ${params.start} ${params.end} """ } @@ -1415,20 +1395,23 @@ process run_robusTAD { params.mode == "T2C" || params.mode == "multiplot" script: - """ - ${path_R}/Rscript ${path_bin}/bin/RobusTAD.R -i ${tfm} --header FALSE -n norm - """ + """ + ${path_R}/Rscript ${path_bin}/bin/RobusTAD.R -i ${tfm} --header FALSE -n norm --minWin 1 --maxWin 500 + """ } +//${path_R}/Rscript ${path_bin}/bin/RobusTAD.R -i ${tfm} --header FALSE -n norm --minWin 1 --maxWin 500 process x_axis_scaling { tag{name} + publishDir "${outpath}/${params.pn}_TAD/scaled/", mode: 'copy' + input: set name, file (tad) from tadbc output: - set name, file ("${bn}.txt") into tadx, multiplot_TAD + set name, file ("${bn}_scaled.txt") into tadx, multiplot_TAD when: params.mode == "T2C" || params.mode == "multiplot" @@ -1436,7 +1419,7 @@ process x_axis_scaling { script: bn = tad.baseName """ - ${path_R}/Rscript ${path_bin}/bin/xaxis_scaling.R ${tad} ${params.start} ${params.end} ${bn}.txt + ${path_R}/Rscript ${path_bin}/bin/xaxis_scaling.R ${tad} ${params.start} ${params.end} ${bn}_scaled.txt """ } @@ -1525,39 +1508,10 @@ process pptx_writer { //================================= HiC ======================================== -/* -process create_restriction_maps_for_HiC { - - - output: - file ("rest_site_${params.enzyme_a_name}.bed") into restriction_map_for_hic - - when: - params.mode == "HiC" - - script: - """ - findRestSite --fasta ${path_genome} --searchPattern ${params.enzyme_a_sequence} -o rest_site_${params.enzyme_a_name}.bed - """ -} */ - -/* -if(params.aln == "bwa"){ - bowtie2_alignment = Channel.empty() - //histat_alignment = Channel.empty() -} else if (params.aln == "bowtie2") { - //histat_alignment = Channel.empty() - bwa_alignment = Channel.empty() -} else if (params.mode == "histat") { - bowtie2_alignment = Channel.empty() - bwa_alignment = Channel.empty() -} */ - - process alignment_with_bwa { tag{fastq} - publishDir "${outpath}/bam/", mode: 'copy' + publishDir "${outpath}/${params.pn}_bam/", mode: 'copy' conda "$workflow.projectDir/envtoucan.yaml" input: @@ -1569,13 +1523,14 @@ process alignment_with_bwa { when: params.mode == "HiC" && params.aln == "bwa" + create_bam == true script: """ bwa mem \ ${params.bwa_HiC_options} \ ${path_genome} \ - $fastq | ${path_samtools}/samtools view -Shb > ${fastqName}.bam + $fastq | samtools view -Shb > ${fastqName}.bam """ } @@ -1586,7 +1541,7 @@ process bowtie2_index { conda "$workflow.projectDir/envtoucan.yaml" when: - params.mode == "HiC" && params.aln == "bowtie2" + params.mode == "HiC" && params.aln == "bowtie2" && create_bam == true output: val ("${genome_name}") into index_bt2 @@ -1603,7 +1558,7 @@ process bowtie2_index { process alignment_with_bowtie2 { tag{fastq} - publishDir "${outpath}/bam/", mode: 'copy' + publishDir "${outpath}/${params.pn}_bam/", mode: 'copy' conda "$workflow.projectDir/envtoucan.yaml" input: @@ -1614,7 +1569,7 @@ process alignment_with_bowtie2 { set basisname, file ("${fastqName}.bam") into bowtie2_alignment when: - params.mode == "HiC" && params.aln == "bowtie2" + params.mode == "HiC" && params.aln == "bowtie2" && create_bam == true script: """ @@ -1623,12 +1578,12 @@ process alignment_with_bowtie2 { """ } -hic_alignment = bwa_alignment.concat(bowtie2_alignment) //.concat(histat_alignment) +hic_alignment = bam_f2.concat(bwa_alignment.concat(bowtie2_alignment)) //.concat(histat_alignment) process create_HiC_matrix { tag{name} - publishDir "${outpath}/h5_matrix/", mode: 'copy' + publishDir "${outpath}/${params.pn}_h5_matrix/", mode: 'copy' input: set name, bams from hic_alignment.groupTuple() @@ -1664,7 +1619,8 @@ process create_HiC_matrix { process diagnostic_plot_of_HiC_Matrix { - publishDir "${outpath}/diagnostic_plots/", mode: 'copy' + publishDir "${outpath}/${params.pn}_diagnostic_plots/", mode: 'copy' + tag{name} input: set name, file (matrix) from hic_matrix_for_diagnostic @@ -1681,26 +1637,34 @@ process diagnostic_plot_of_HiC_Matrix { """ } +hic_matrix_final = hic_matrix_f.concat(hic_matrix) + process correct_matrix { - publishDir "${outpath}/matrix/", mode: 'copy' + tag{name} + publishDir "${outpath}/${params.pn}_h5_matrix/", mode: 'copy' input: - set name, file (matrix) from hic_matrix + set name, file (matrix) from hic_matrix_final output: set name, file("${name}_corrected.h5") into hic_matrix_corrected + when: + params.mode == "HiC" || params.mode == "h5" + script: """ - hicCorrectMatrix correct -m ${matrix} --filterThreshold ${params.hic_thresh_min} ${params.hic_thresh_max} -o ${name}_corrected.h5 + hicCorrectMatrix correct -m ${matrix} --filterThreshold \ + ${params.hic_thresh_min} ${params.hic_thresh_max} -o ${name}_corrected.h5 """ } process plot_matrix { - publishDir "${outpath}/plot/", mode: 'copy' + tag{name} + publishDir "${outpath}/${params.pn}_plot/", mode: 'copy' input: set name, file (corrected_matrix) from hic_matrix_corrected @@ -1712,8 +1676,14 @@ process plot_matrix { params.mode == "HiC" script: - """ - hicPlotMatrix -m ${corrected_matrix} -o ${name}.hic.matrix.png \ - --dpi 400 --region ${params.chr}:${params.start}-${params.end} - """ + if ((params.chr != "") && (params.start != 0) && p(arams.end != 0)) + """ + hicPlotMatrix -m ${corrected_matrix} -o ${name}.hic.matrix.png \ + --dpi 400 --region ${params.chr}:${params.start}-${params.end} + """ + else + """ + hicPlotMatrix -m ${corrected_matrix} -o ${name}.hic.matrix.png \ + --dpi 400 + """ } diff --git a/bin/convert_to_IFMatrix.R b/bin/convert_to_IFMatrix.R index 4e9a9cd..b16cfeb 100644 --- a/bin/convert_to_IFMatrix.R +++ b/bin/convert_to_IFMatrix.R @@ -2,23 +2,33 @@ args = commandArgs(trailingOnly=TRUE) path_interaction.bed = args[1] #"G://Rene.Wiegandt/01_CHiC/04_results/Pipeline_final_results/MyoG_interaction_bed/I17-1299-02-C2C12_Undiff_MyoG_S10_L001_LN405_001.interactions.bed" -out = args[2] #"G://Rene.Wiegandt/larp.txt" +res_map = args[2] #"G://Rene.Wiegandt/01_restriction_maps/restriction_targets.bed" +out = args[3] #"G://Rene.Wiegandt/larp.txt" +chr <- args[4] +start <- args[5] +end <- args[6] interaction.bed <- data.table::fread(path_interaction.bed, header = FALSE, sep = "\t") +map <- data.table::fread(res_map, header = FALSE, sep = "\t") + +region_pairs <- subset(map, (V1 == chr & ((start <= V2 & V2 <= end) ))) x <- unique(interaction.bed[,V3]) y <- unique(interaction.bed[,V6]) -bindvec <- sort(unique(as.vector(append(x,y)))) -sizeMatrix <- length(bindvec) +sizeMatrix <- length(region_pairs$V3) TADmatrix <- matrix(nrow = sizeMatrix,ncol = sizeMatrix) -dimnames(TADmatrix) = list(bindvec, bindvec) +dimnames(TADmatrix) = list(region_pairs$V3 - 1 , region_pairs$V3 - 1) for (i in 1:nrow(interaction.bed)) { x_name <- as.character(interaction.bed[i,3]) y_name <- as.character(interaction.bed[i,6]) - TADmatrix[x_name, y_name ] <- as.double(interaction.bed[i,8]) + if ((as.numeric(x_name) %in% region_pairs$V3 -1) & (as.numeric(y_name) %in% region_pairs$V3 - 1 )) { + TADmatrix[x_name, y_name ] <- as.double(interaction.bed[i,8]) + } else { + TADmatrix[as.character(as.numeric(x_name) + 1), as.character(as.numeric(y_name) + 1) ] <- as.double(interaction.bed[i,8]) + } } tadFrame <- data.table::data.table(TADmatrix)