From 16110f50709239705bd36e76f31575bed37f1eea Mon Sep 17 00:00:00 2001
From: JannikHamp
Date: Tue, 20 Nov 2018 15:28:07 +0100
Subject: [PATCH 001/158] Add files via upload

---
 compareBed.sh | 183 ++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 183 insertions(+)
 create mode 100644 compareBed.sh

diff --git a/compareBed.sh b/compareBed.sh
new file mode 100644
index 0000000..8e81434
--- /dev/null
+++ b/compareBed.sh
@@ -0,0 +1,183 @@
+#!/bin/bash
+#data
+#motifs
+#workdir
+#fasta
+#min
+#max
+#output
+wrong=()
+da=false
+mo=false
+wo=false
+fa=false
+mi=false
+ma=false
+ou=false
+he=false
+
+if [ $# -eq 0 ]
+then
+ he=true
+fi
+
+while [[ $# -gt 0 ]]
+do
+key="$1"
+
+case $key in
+ -d|--data)
+ data="$2"
+ da=true
+ shift
+ shift
+ ;;
+ -m|--motifs)
+ motifs="$2"
+ mo=true
+ shift
+ shift
+ ;;
+ -w|--workdir)
+ workdir="$2"
+ wo=true
+ shift
+ shift
+ ;;
+ -f|--fasta)
+ fasta="$2"
+ fa=true
+ shift
+ shift
+ ;;
+ -min|--min)
+ min=$2
+ mi=true
+ shift
+ shift
+ ;;
+ -max|--max)
+ max=$2
+ ma=true
+ shift
+ shift
+ ;;
+ -o|--output)
+ output="$2"
+ ou=true
+ shift
+ shift
+ ;;
+ -h|--help)
+ help=true
+ he=true
+ shift
+ ;;
+ *)
+ wrong+=("$1")
+ shift
+ ;;
+esac
+done
+
+count=${#wrong[@]}
+if [ $count -gt 0 ]
+then
+ for i in ${wrong[@]}
+ do
+  echo wrong parameter $i
+  echo call the script without parameters or with --help for usage information
+  echo exit
+ done
+exit 1
+fi
+
+if [ $he == true ]
+then
+ echo "This script utilizes bedtools to select new footprints from data."
+ echo "For this, the data are compared with known footprint motifs."
+ echo "The output is a new .bed file with added sequence information."
+ echo "Required arguments are data and motifs, both in bed-format, and the fasta genome sequences."
+ echo "If a parameter is chosen, a value must be provided or an error will occur."
+ echo "--------------------"
+ echo "usage: compareBed.sh --data --motifs --fasta [optional_parameter ] ..."
+ echo "--------------------"
+ echo " required parameter:"
+ echo " -d --data the path to the .bed file of the footprints"
+ echo " -m --motifs the path to the .bed file of the scanned motifs"
+ echo " -f --fasta the path to the .fasta file of the genome"
+ echo " "
+ echo " optional parameter:"
+ echo " -w --workdir the path to the directory where temporary files will be created"
+ echo " default is a new directory that is created in the current directory"
+ echo " -min --min minimum size of footprints; default is 10"
+ echo " -max --max maximum size of footprints; default is 60"
+ echo " -o --output output path/file ; default dir is workdir and filename is newFootprints.bed and newFootprints.bed.fasta"
+ echo " -h --help shows this help message"
+exit 0
+fi
+
+echo selected parameters
+echo -------------------
+echo data: $da \(required\)
+echo motifs: $mo \(required\)
+echo workdir: $wo
+echo fasta: $fa \(required\)
+echo min: $mi
+echo max: $ma
+echo output: $ou
+echo help: $he
+
+if [ $da == false ] || [ $mo == false ] || [ $fa == false ]
+then
+ echo required parameters not given.
+ echo required are: --data \ --motifs \ --fasta \
+ exit 1
+fi
+
+if [ $wo == false ]
+then
+ if [ ! -d workdir1337 ]
+ then
+  mkdir workdir1337
+ fi
+ wo=true
+ workdir="workdir1337"
+fi
+
+if [ $ou == false ]
+then
+ output="newFootprints.bed"
+ ou=true
+fi
+
+if [ $mi == false ]
+then
+ min=10
+ mi=true
+fi
+
+if [ $ma == false ]
+then
+ max=60
+ ma=true
+fi
+#1. first filter. no overlap vs. overlap
+bedtools intersect -v -a $data -b $motifs > "$workdir"/pass1Tr.bed
+bedtools intersect -wa -a $data -b $motifs > "$workdir"/pass1Fa.bed
+
+#2. compute absolute maxscore position
+Rscript --vanilla maxScore.R "$workdir"/pass1Fa.bed
+
+#3. subtract overlapping regions for additional motifs
+bedtools subtract -a "$workdir"/pass1Fa.bed -b $motifs > "$workdir"/pass2Tr.bed
+
+#4. remove short/long motifs, make unique ids (relevant for some split TFBS from subtract) and handle maxScorePosition
+Rscript --vanilla merge.R $min $max "$workdir"
+
+#5. add fasta sequences to bed and create fasta file
+bedtools getfasta -fi $fasta -bed "$workdir"/merged.bed -bedOut > "$workdir"/"$output"
+bedtools getfasta -name -fi $fasta -bed "$workdir"/"$output" -fo "$workdir"/"$output".fasta
+
+#6 clean up
+rm "$workdir"/pass1Fa.bed "$workdir"/pass1Tr.bed "$workdir"/pass2Tr.bed "$workdir"/merged.bed
\ No newline at end of file

From a9d0070e3141f273360608fa4d2cdd7264f75684 Mon Sep 17 00:00:00 2001
From: JannikHamp
Date: Tue, 20 Nov 2018 15:30:56 +0100
Subject: [PATCH 002/158] Add files via upload

---
 maxScore.R |  9 +++++++++
 merge.R    | 23 +++++++++++++++++++++++
 2 files changed, 32 insertions(+)
 create mode 100644 maxScore.R
 create mode 100644 merge.R

diff --git a/maxScore.R b/maxScore.R
new file mode 100644
index 0000000..4ea545b
--- /dev/null
+++ b/maxScore.R
@@ -0,0 +1,9 @@
+#!/home/jhamp/.conda/envs/tfbs/bin/Rscript
+args = commandArgs(TRUE)
+file = args[1]
+
+tab = read.table(file, header=FALSE)
+colnames(tab) = c("chromosome", "start", "stop", "id", "score", "maxpos", "length")
+tab$maxpos = tab$start + tab$maxpos
+
+write.table(tab, file, row.names=FALSE, col.names=FALSE, quote=FALSE, sep='\t')

diff --git a/merge.R b/merge.R
new file mode 100644
index 0000000..bbe3c9a
--- /dev/null
+++ b/merge.R
@@ -0,0 +1,23 @@
+#!/home/jhamp/.conda/envs/tfbs/bin/Rscript
+args=commandArgs(TRUE)
+min=as.numeric(args[1])
+max=as.numeric(args[2])
+folder=args[3]
+splitted = read.table(paste(folder, "/pass2Tr.bed", sep=''), header=FALSE)
+colnames(splitted) = c("chromosome", "start", "stop", "id", "score", "maxpos", "length")
+p1 = read.table(paste(folder, "/pass1Tr.bed", sep=''), header=FALSE)
+colnames(p1) = c("chromosome", "start", "stop", "id", "score", "maxpos", "length")
+p1$maxpos = p1$start + p1$maxpos
+
+splitted=rbind(splitted, p1)
+
+splitted=splitted[which(splitted$stop - splitted$start >= min),]
+splitted=splitted[which(splitted$stop - splitted$start <= max),]
+splitted$id=make.unique(as.character(splitted$id))
+splitted$length=splitted$stop - splitted$start
+
+splitted=cbind(splitted, containsMaxpos=0)
+splitted$containsMaxpos[intersect(which(splitted$start <= splitted$maxpos), which(splitted$stop > splitted$maxpos))] = 1
+splitted$maxpos = splitted$maxpos - splitted$start
+write.table(splitted, paste(folder, "/merged.bed", sep=''), row.names=FALSE, col.names=FALSE, quote=FALSE, sep='\t')
+

From 26d276428a2d34c56ec135a7cc226710293081f7 Mon Sep 17 00:00:00 2001
From: JannikHamp
Date: Tue, 20 Nov 2018 15:49:28 +0100
Subject: [PATCH 003/158] Update README.md

---
 README.md | 16 ++++++++++++++--
 1 file changed, 14 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index 811fa89..fcb5bb2 100644
--- a/README.md
+++ b/README.md
@@ -1,3 +1,15 @@
-# masterJLU2018
+# compareBed.sh for filtering of new, yet unannotated motifs

-de novo motif discovery and evaluation based on footprints identified by TOBIAS
+The script mainly utilizes bedtools for comparison.
+The sequences in the bed-file from the experiment are compared to a bed-file of sequences of known motifs.
+Only sequences with no overlap with the known-motif sequences are selected in the first step.
+Then the non-overlapping parts of overlapping sequences are also selected as new sequences.
+The resulting bed-file contains the sequences selected in the two steps. The file has two extra columns appended: 1st the DNA sequence of the entry and 2nd a flag with possible values '1' or '0' for 'no_overlap' or 'overlap' of the maximum score region of the entity with any known-motif sequence.
+
+Apart from the experiment data (bed-format) of sequences with possible motifs, another bed-file of the sequences with known motifs and a genome file in fasta format are required.
+
+For usage, run ./compareBed.sh
+
+dependencies:
+- R
+- 

From 6fd9948f4e81d7180033ec7ada7630efe6e3d6c4 Mon Sep 17 00:00:00 2001
From: JannikHamp
Date: Tue, 20 Nov 2018 15:57:22 +0100
Subject: [PATCH 004/158] Update README.md

---
 README.md | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/README.md b/README.md
index fcb5bb2..4dc88e2 100644
--- a/README.md
+++ b/README.md
@@ -1,15 +1,15 @@
 # compareBed.sh for filtering of new, yet unannotated motifs
 
 The script mainly utilizes bedtools for comparison.
-The sequences in the bed-file from the experiment are compared to a bed-file of sequences of known motifs.
+The sequences in the bed-file of the experiment are compared with sequences of known motifs, also in bed-format.
 Only sequences with no overlap with the known-motif sequences are selected in the first step.
 Then the non-overlapping parts of overlapping sequences are also selected as new sequences.
-The resulting bed-file contains the sequences selected in the two steps. The file has two extra columns appended: 1st the DNA sequence of the entry and 2nd a flag with possible values '1' or '0' for 'no_overlap' or 'overlap' of the maximum score region of the entity with any known-motif sequence.
+The resulting bed-file contains the sequences selected in both steps. The output file has two extra columns appended: first, the DNA sequence of the entry, and second, a flag that is '1' if the maximum-score region of the entry does not overlap any known-motif sequence and '0' if it does.
 
-Apart from the experiment data (bed-format) of sequences with possible motifs, another bed-file of the sequences with known motifs and a genome file in fasta format are required.
+Apart from the experiment data (bed-format) of sequences with possible motifs, another bed-file of the sequences with known motifs and a genome file in fasta format are required. The bed-file with known motif sequences can be generated with the tool motifscan.py.
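+
+A minimal example call might look like the following (all file names are placeholders for your own data):
+
+    bash compareBed.sh --data footprints.bed --motifs motifs.bed --fasta genome.fa --min 10 --max 60 --output newFootprints.bed
+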
 For usage, run ./compareBed.sh
 
 dependencies:
-- R
-- 
+- R ?version
+- bedtools ?version

From 2fdbf0da858b2fd20c10511c2de052c9db10bcdb Mon Sep 17 00:00:00 2001
From: JannikHamp
Date: Wed, 28 Nov 2018 15:34:45 +0100
Subject: [PATCH 005/158] update, working with unmerged motif files now

---
 compareBed.sh | 84 ++++++++++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 76 insertions(+), 8 deletions(-)

diff --git a/compareBed.sh b/compareBed.sh
index 8e81434..92c55d7 100644
--- a/compareBed.sh
+++ b/compareBed.sh
@@ -104,14 +104,14 @@ then
 echo "--------------------"
 echo " required parameter:"
 echo " -d --data the path to the .bed file of the footprints"
- echo " -m --motifs the path to the .bed file of the scanned motifs"
+ echo " -m --motifs the path to the directory where the .bed files of the scanned motifs are stored"
 echo " -f --fasta the path to the .fasta file of the genome"
 echo " "
 echo " optional parameter:"
 echo " -w --workdir the path to the directory where temporary files will be created"
 echo " default is a new directory that is created in the current directory"
 echo " -min --min minimum size of footprints; default is 10"
- echo " -max --max maximum size of footprints; default is 60"
+ echo " -max --max maximum size of footprints; default is 80"
 echo " -o --output output path/file ; default dir is workdir and filename is newFootprints.bed and newFootprints.bed.fasta"
 echo " -h --help shows this help message"
 exit 0
 fi

@@ -159,18 +159,65 @@ fi
 if [ $ma == false ]
 then
- max=60
+ max=80
 ma=true
 fi
+
+if [ ! -d $workdir ]
+then
+ mkdir $workdir
+fi
+
 #1. first filter. no overlap vs. overlap
-bedtools intersect -v -a $data -b $motifs > "$workdir"/pass1Tr.bed
-bedtools intersect -wa -a $data -b $motifs > "$workdir"/pass1Fa.bed
+echo get sequences with no overlap
+cat $data > "$workdir"/pass1Tr.bed
+help=true
+for i in "$motifs"/*.bed
+do
+ if [ $help == true ]
+ then
+  help=false
+  bedtools intersect -v -a "$workdir"/pass1Tr.bed -b $i > "$workdir"/pass1TrHelp.bed
+ else
+  help=true
+  bedtools intersect -v -a "$workdir"/pass1TrHelp.bed -b $i > "$workdir"/pass1Tr.bed
+ fi
+ echo $i
+done
+if [ $help == false ]
+then
+ cat "$workdir"/pass1TrHelp.bed > "$workdir"/pass1Tr.bed
+fi
+echo get overlapping sequences
+bedtools intersect -v -a $data -b "$workdir"/pass1Tr.bed > "$workdir"/pass1Fa.bed
+cat "$workdir"/pass1Fa.bed | wc -l
+echo calc maxScore
 
 #2. compute absolute maxscore position
 Rscript --vanilla maxScore.R "$workdir"/pass1Fa.bed
 
-#3. subtract overlapping regions for additional motifs
-bedtools subtract -a "$workdir"/pass1Fa.bed -b $motifs > "$workdir"/pass2Tr.bed
+#3 subtract overlapping regions for additional motifs
+echo "also get subsequences with no overlap of overlapping sequences"
+help=true
+for i in "$motifs"/*.bed
+do
+ if [ $help == true ]
+ then
+  help=false
+  bedtools subtract -a "$workdir"/pass1Fa.bed -b $i > "$workdir"/pass1FaHelp.bed
+ else
+  help=true
+  bedtools subtract -a "$workdir"/pass1FaHelp.bed -b $i > "$workdir"/pass1Fa.bed
+ fi
+ echo $i
+done
+
+if [ $help == false ]
+then
+ cat "$workdir"/pass1FaHelp.bed > "$workdir"/pass2Tr.bed
+else
+ cat "$workdir"/pass1Fa.bed > "$workdir"/pass2Tr.bed
+fi
 
 #4. remove short/long motifs, make unique ids (relevant for some split TFBS from subtract) and handle maxScorePosition
 Rscript --vanilla merge.R $min $max "$workdir"
 
 #5. add fasta sequences to bed and create fasta file
 bedtools getfasta -fi $fasta -bed "$workdir"/merged.bed -bedOut > "$workdir"/"$output"
 bedtools getfasta -name -fi $fasta -bed "$workdir"/"$output" -fo "$workdir"/"$output".fasta
 
 #6 clean up
 rm "$workdir"/pass1Fa.bed "$workdir"/pass1Tr.bed "$workdir"/pass2Tr.bed "$workdir"/merged.bed
+
+
+#1. first filter. no overlap vs. overlap
+#bedtools intersect -v -a $data -b $motifs > "$workdir"/pass1Tr.bed
+#bedtools intersect -wa -a $data -b $motifs > "$workdir"/pass1Fa.bed
+
+#2. compute absolute maxscore position
+#Rscript --vanilla maxScore.R "$workdir"/pass1Fa.bed
+
+#3. subtract overlapping regions for additional motifs
+#bedtools subtract -a "$workdir"/pass1Fa.bed -b $motifs > "$workdir"/pass2Tr.bed
+
+#4. remove short/long motifs, make unique ids (relevant for some split TFBS from subtract) and handle maxScorePosition
+#Rscript --vanilla merge.R $min $max "$workdir"
+
+#5. add fasta sequences to bed and create fasta file
+#bedtools getfasta -fi $fasta -bed "$workdir"/merged.bed -bedOut > "$workdir"/"$output"
+#bedtools getfasta -name -fi $fasta -bed "$workdir"/"$output" -fo "$workdir"/"$output".fasta
+
+#6 clean up
+#rm "$workdir"/pass1Fa.bed "$workdir"/pass1Tr.bed "$workdir"/pass2Tr.bed "$workdir"/merged.bed

From 902882004278c0029ff9c15b6a0dd1761e2a4256 Mon Sep 17 00:00:00 2001
From: JannikHamp
Date: Thu, 29 Nov 2018 13:46:28 +0100
Subject: [PATCH 006/158] update, bugfix output parameter error

---
 compareBed.sh | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/compareBed.sh b/compareBed.sh
index 92c55d7..279c9e3 100644
--- a/compareBed.sh
+++ b/compareBed.sh
@@ -147,7 +147,7 @@ fi
 if [ $ou == false ]
 then
- output="newFootprints.bed"
+ output=${workdir}/"newFootprints.bed"
 ou=true
 fi
 
@@ -194,7 +194,7 @@
 cat "$workdir"/pass1Fa.bed | wc -l
 echo calc maxScore
 
 #2. compute absolute maxscore position
-Rscript --vanilla maxScore.R "$workdir"/pass1Fa.bed
+Rscript --vanilla /mnt/agnerds/Jannik.Hamp/maxScore.R "$workdir"/pass1Fa.bed
 
 #3 subtract overlapping regions for additional motifs
 echo "also get subsequences with no overlap of overlapping sequences"
 
@@ -220,14 +220,14 @@ else
 cat "$workdir"/pass1Fa.bed > "$workdir"/pass2Tr.bed
 fi
 
 #4. remove short/long motifs, make unique ids (relevant for some split TFBS from subtract) and handle maxScorePosition
-Rscript --vanilla merge.R $min $max "$workdir"
+Rscript --vanilla /mnt/agnerds/Jannik.Hamp/merge.R $min $max "$workdir"
 
 #5. add fasta sequences to bed and create fasta file
-bedtools getfasta -fi $fasta -bed "$workdir"/merged.bed -bedOut > "$workdir"/"$output"
-bedtools getfasta -name -fi $fasta -bed "$workdir"/"$output" -fo "$workdir"/"$output".fasta
+bedtools getfasta -fi $fasta -bed "$workdir"/merged.bed -bedOut > "$output"
+bedtools getfasta -name -fi $fasta -bed "$output" -fo "$output".fasta
 
 #6 clean up
-rm "$workdir"/pass1Fa.bed "$workdir"/pass1Tr.bed "$workdir"/pass2Tr.bed "$workdir"/merged.bed
+rm "$workdir"/pass1Fa.bed "$workdir"/pass1Tr.bed "$workdir"/pass2Tr.bed "$workdir"/merged.bed "$workdir"/pass1FaHelp.bed "$workdir"/pass1TrHelp.bed
 
 
 #1. first filter. no overlap vs. overlap
 #bedtools intersect -v -a $data -b $motifs > "$workdir"/pass1Tr.bed
 #bedtools intersect -wa -a $data -b $motifs > "$workdir"/pass1Fa.bed
 
 #2. compute absolute maxscore position
 #Rscript --vanilla maxScore.R "$workdir"/pass1Fa.bed
 
 #3. subtract overlapping regions for additional motifs
 #bedtools subtract -a "$workdir"/pass1Fa.bed -b $motifs > "$workdir"/pass2Tr.bed
 
 #4. remove short/long motifs, make unique ids (relevant for some split TFBS from subtract) and handle maxScorePosition
 #Rscript --vanilla merge.R $min $max "$workdir"
 
 #5. add fasta sequences to bed and create fasta file
 #bedtools getfasta -fi $fasta -bed "$workdir"/merged.bed -bedOut > "$workdir"/"$output"
 #bedtools getfasta -name -fi $fasta -bed "$workdir"/"$output" -fo "$workdir"/"$output".fasta
 
 #6 clean up
 #rm "$workdir"/pass1Fa.bed "$workdir"/pass1Tr.bed "$workdir"/pass2Tr.bed "$workdir"/merged.bed

From f2344b6c74329a6a26b22135e6d0996ec9bfff44 Mon Sep 17 00:00:00 2001
From: Schultheis
Date: Tue, 4 Dec 2018 10:13:46 +0100
Subject: [PATCH 007/158] add bed sequence reduction script

---
 bin/reduce_bed.R | 237 +++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 237 insertions(+)
 create mode 100644 bin/reduce_bed.R

diff --git a/bin/reduce_bed.R b/bin/reduce_bed.R
new file mode 100644
index 0000000..cfa9766
--- /dev/null
+++ b/bin/reduce_bed.R
@@ -0,0 +1,237 @@
+#! /bin/Rscript
+library("optparse")
+
+option_list <- list(
+  make_option(opt_str = c("-i", "--input"), default = NULL, help = "Input bed-file. Last column must be sequences.", metavar = "character"),
+  make_option(opt_str = c("-k", "--kmer"), default = 10, help = "Kmer length. Default = %default", metavar = "integer"),
+  make_option(opt_str = c("-m", "--motif"), default = 10, help = "Estimated motif length. Default = %default", metavar = "integer"),
+  make_option(opt_str = c("-o", "--output"), default = "reduced.bed", help = "Output file. Default = %default", metavar = "character"),
+  make_option(opt_str = c("-t", "--threads"), default = 1, help = "Number of threads to use. Use -1 for all available cores. Default = %default", metavar = "integer"),
+  make_option(opt_str = c("-c", "--clean"), default = TRUE, help = "Delete all temporary files. Default = %default", metavar = "logical"),
+  make_option(opt_str = c("-s", "--min_seq_length"), default = NULL, help = "Remove sequences below this length. Defaults to motif length.", metavar = "integer", type = "integer")
+  # TODO more args
+)
+
+opt_parser <- OptionParser(option_list = option_list,
+                           description = "Reduce sequences to frequent regions.")
+
+opt <- parse_args(opt_parser)
+
+#' Reduce bed file to conserved regions
+#'
+#' @param input bed file
+#' @param kmer Length of kmer.
+#' @param motif Estimated motif length.
+#' @param output Output file
+#' @param threads Number of threads.
+#' @param clean Delete all temporary files.
+#' @param minoverlap_kmer Minimum required overlap between kmers. # TODO
+#' @param minoverlap_motif Minimum required overlap between motif and kmer. # TODO
+#' @param min_seq_length Must be smaller than or equal to kmer and motif. Default = motif.
+#'
+#' @return reduced bed
+#' TODO check whether jellyfish is installed
+reduce_bed <- function(input, kmer = 10, motif = 10, output = "reduced.bed", threads = NULL, clean = TRUE, minoverlap_kmer, minoverlap_motif, min_seq_length = motif) {
+  # get number of available cores
+  if (threads == -1) {
+    threads <- parallel::detectCores()
+  }
+
+  message("Loading bed...")
+  # load bed
+  # columns: chr, start, end, name, ..., sequence
+  bed_table <- data.table::fread(input = input, header = FALSE)
+  names(bed_table)[1:4] <- c("chr", "start", "end", "name")
+  names(bed_table)[ncol(bed_table)] <- "sequence"
+  # index
+  data.table::setkey(bed_table, name, physical = FALSE)
+
+  # check for duplicated names
+  if (anyDuplicated(bed_table[, "name"])) {
+    warning("Found duplicated names. Making names unique.")
+    bed_table[, name := make.unique(name)]
+  }
+
+  # remove sequences below minimum length
+  total_rows <- nrow(bed_table)
+  bed_table <- bed_table[nchar(sequence) < min_seq_length]
+  if (total_rows > nrow(bed_table)) {
+    message("Removed ", total_rows - nrow(bed_table), " sequence(s) below minimum length of ", min_seq_length)
+  }
+
+  # TODO forward fasta file as parameter so no bed -> fasta conversion is needed.
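+  # The sequences are written to a fasta file so jellyfish can count k-mers:
+  # 'jellyfish count' fills a hash of size 4^k with all k-mer counts and
+  # 'jellyfish dump' converts the binary result into a tab-separated
+  # <kmer> <count> table that is read back in below.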
+ message("Writing fasta...") + # save as fasta + fasta_file <- paste0(basename(input), ".fasta") + seqinr::write.fasta(sequences = as.list(bed_table[[ncol(bed_table)]]), names = bed_table[[4]], as.string = TRUE, file.out = fasta_file) + + message("Counting kmer...") + # count k-mer + hashsize <- 4 ^ kmer + count_output_binary <- "mer_count_binary.jf" + input <- fasta_file + jellyfish_call <- paste("jellyfish count ", "-m", kmer, "-s", hashsize, "-o", count_output_binary, input) + + system(command = jellyfish_call, wait = TRUE) + + mer_count_table <- "mer_count_table.jf" + jellyfish_dump_call <- paste("jellyfish dump --column --tab --output", mer_count_table, count_output_binary) + + system(command = jellyfish_dump_call, wait = TRUE) + + message("Reduce kmer.") + # load mer table + # columns: kmer, count + kmer_counts <- data.table::fread(input = mer_count_table, header = FALSE) + # order kmer descending + data.table::setorder(kmer_counts, -V2) + + # compute number of hits to keep + keep_hits <- significant_kmer(bed_table, kmer = kmer, motif = motif) + + # reduce kmer + reduced_kmer <- reduce_kmer(kmer = kmer_counts, keep_hits) + + message("Find kmer in sequences.") + # find k-mer in sequences + # TODO minoverlap as parameter + # columns: name, start, end, width + ranges_table <- find_kmer_regions(bed = bed_table, kmer_counts = reduced_kmer, minoverlap = kmer - 1, threads = threads) + names(ranges_table)[1:2] <- c("relative_start", "relative_end") + + # merge ranged_table with bed_table + keep column order + merged <- merge(x = bed_table, y = ranges_table, by = "name", sort = FALSE)[, union(names(bed_table), names(ranges_table)), with = FALSE] + + # delete sequences without hit + merged <- na.omit(merged, cols = c("relative_start", "relative_end")) + message("Removed ", nrow(bed_table) - nrow(merged), " sequence(s) without hit.") + + message("Reduce sequences.") + # create subsequences + merged[, sequence := stringr::str_sub(sequence, relative_start, relative_end)] + + # bed files count from 0 + merged[, `:=`(relative_start = relative_start - 1, relative_end = relative_end - 1)] + # change start end location + merged[, `:=`(start = start + relative_start, end = start + relative_end)] + + # clean table + merged[, `:=`(relative_start = NULL, relative_end = NULL, width = NULL)] + + if (clean) { + file.remove(fasta_file, count_output_binary, mer_count_table) + } + + data.table::fwrite(merged, file = output, sep = "\t", col.names = FALSE) +} + +#' returns sum of top x kmer frequencies to keep +#' +#' @param bed Bed table with sequences in last column +#' @param kmer Length of kmer +#' @param motif Length of motif +#' @param minoverlap Minimum number of bases overlapping between kmer and motif. Must be <= motif & <= kmer. Defaults to ceiling(motif / 2). 
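+#' For example, with kmer = 10, motif = 10 and minoverlap = 5, the minimum sequence length that yields all interesting overlaps is motif + 2 * (kmer - minoverlap) = 20.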
+significant_kmer <- function(bed, kmer, motif, minoverlap = ceiling(motif / 2)) {
+  if (minoverlap > kmer || minoverlap > motif) {
+    stop("Kmer & motif must be greater than or equal to minoverlap!")
+  }
+
+  # minimum sequence length to get all interesting overlaps
+  min_seq_length <- motif + 2 * (kmer - minoverlap)
+
+  seq_lengths <- nchar(bed[[ncol(bed)]])
+
+  # reduce to max interesting length
+  seq_lengths <- ifelse(seq_lengths > min_seq_length, min_seq_length, seq_lengths)
+
+  # calculate max possible kmer
+  topx <- sum(seq_lengths - kmer + 1)
+
+  return(topx)
+}
+
+#' @param kmer Kmer table
+#' @param significant Number of top kmer hits to keep.
+reduce_kmer <- function(kmer, significant) {
+  kmer[, cumsum := cumsum(V2)]
+
+  return(kmer[cumsum <= significant])
+}
+
+#' create list of significant ranges (one for each bed entry)
+#'
+#' @param bed Data.table of bed with sequence in last column
+#' @param kmer_counts Data.table of counted kmer. Column1 = kmer, column2 = count.
+#' @param minoverlap Minimum overlapping nucleotides between kmers to be merged. Positive integer. Must be smaller than kmer length.
+#' @param threads Number of threads.
+#'
+#' @return Data.table with relative positions and width (start, end, width).
+find_kmer_regions <- function(bed, kmer_counts, minoverlap = 1, threads = NULL) {
+  if (nchar(kmer_counts[1, 1]) <= minoverlap) {
+    stop("Minoverlap must be smaller than kmer length!")
+  }
+
+  names(kmer_counts)[1:2] <- c("kmer", "count")
+  data.table::setorder(kmer_counts, -count)
+
+  seq_ranges <- pbapply::pblapply(seq_len(nrow(bed)), cl = threads, function(x) {
+    seq <- bed[x][[ncol(bed)]]
+
+    #### locate ranges
+    ranges <- data.table::data.table(do.call(rbind, stringi::stri_locate_all_fixed(seq, pattern = kmer_counts[[1]])))
+
+    ranges <- na.omit(ranges, cols = c("start", "end"))
+
+    if (nrow(ranges) < 1) {
+      return(data.table::data.table(start = NA, end = NA, width = NA))
+    }
+
+    # add kmer sequences
+    ranges[, sub_seq := stringr::str_sub(seq, start, end)]
+    # add kmer count
+    ranges[, count := kmer_counts[ranges[["sub_seq"]], "count", on = "kmer"]]
+
+    #### reduce ranges
+    reduced_ranges <- IRanges::IRanges(start = ranges[["start"]], end = ranges[["end"]], names = ranges[["sub_seq"]])
+
+    # list of overlapping ranges
+    edge_list <- as.matrix(IRanges::findOverlaps(reduced_ranges, minoverlap = minoverlap, drop.self = FALSE, drop.redundant = TRUE))
+
+    # get components (groups of connected ranges)
+    graph <- igraph::graph_from_edgelist(edge_list, directed = FALSE)
+    # vector of node membership (indices correspond to ranges above)
+    member <- as.factor(igraph::components(graph)$membership)
+
+    # list of membership vectors
+    node_membership <- lapply(levels(member), function(x) {
+      which(member == x)
+    })
+
+    # calculate component score (= sum of kmer count)
+    score <- vapply(node_membership, FUN.VALUE = numeric(1), function(x) {
+      sum(kmer_counts[x, "count"])
+    })
+
+    selected_ranges <- node_membership[[which(score == max(score))[1]]]
+
+    # reduce selected ranges
+    reduced_ranges <- IRanges::reduce(reduced_ranges[selected_ranges])
+
+    reduced_ranges <- data.table::as.data.table(reduced_ranges)
+
+    return(reduced_ranges)
+  })
+
+  # create ranges table
+  conserved_regions_table <- data.table::rbindlist(seq_ranges)
+  conserved_regions_table[, name := bed[[4]]]
+
+  return(conserved_regions_table)
+}
+
+# call function with given parameter if not in interactive context (e.g. run from shell)
+if (!interactive()) {
+  # remove last parameter (help param)
+  params <- opt[-length(opt)]
+  do.call(reduce_bed, args = params)
+}

From ab887422a29ce13da8c95039d851c68cbd5edf48 Mon Sep 17 00:00:00 2001
From: Schultheis
Date: Tue, 4 Dec 2018 10:14:22 +0100
Subject: [PATCH 008/158] add cdhit wrapper script

---
 bin/cdhit_wrapper.R | 75 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 75 insertions(+)
 create mode 100644 bin/cdhit_wrapper.R

diff --git a/bin/cdhit_wrapper.R b/bin/cdhit_wrapper.R
new file mode 100644
index 0000000..fd2ce49
--- /dev/null
+++ b/bin/cdhit_wrapper.R
@@ -0,0 +1,75 @@
+#! /bin/Rscript
+library("optparse")
+
+option_list <- list(
+  make_option(opt_str = c("-i", "--input"), default = NULL, help = "Input bed-file. Last column must be sequences.", metavar = "character"),
+  make_option(opt_str = c("-s", "--similarity"), default = 0.8, help = "Similarity threshold. Default = %default", metavar = "double >= 0.8"),
+  make_option(opt_str = c("-A", "--coverage"), default = 8, help = "Minimal alignment length for both sequences in nucleotides. Default = %default", metavar = "integer"),
+  make_option(opt_str = c("-o", "--output"), default = "cluster.bed", help = "Output file. Default = %default", metavar = "character"),
+  make_option(opt_str = c("-c", "--clean"), default = TRUE, help = "Delete all temporary files. Default = %default", metavar = "logical")
+  # TODO more args
+)
+
+opt_parser <- OptionParser(option_list = option_list,
+                           description = "Reduce sequences to frequent regions.")
+
+opt <- parse_args(opt_parser)
+
+#' cd-hit wrapper
+#'
+#' @param input
+#' @param similarity cdhit = -c
+#' @param coverage In Nucleotides. cdhit = -A
+#' @param output Clustered bedfile. Adds cluster number in last column (lower number = bigger cluster).
+#' @param clean Clean up after run.
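+#' Example shell call (hypothetical input file): Rscript cdhit_wrapper.R -i sequences.bed -o cluster.bed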
+#'
+#' @return bed_table with cluster in last column
+#' TODO add all cdhit parameters
+#' TODO check whether cdhit is installed
+cdhitest <- function(input, similarity = 0.8, coverage = 8, output = "cluster.bed", clean = TRUE) {
+  # load bed if necessary
+  if (!data.table::is.data.table(input)) {
+    bed_table <- data.table::fread(input = input, header = FALSE)
+  } else {
+    bed_table <- input
+  }
+
+  ### convert bed to fasta
+  # 4th column = name
+  # last column = sequence
+  fasta_file <- "converted_bed.fasta"
+  seqinr::write.fasta(sequences = as.list(bed_table[[ncol(bed_table)]]), names = bed_table[[4]], as.string = TRUE, file.out = fasta_file)
+
+  ### cd-hit-est
+  cdhit_output <- "cdhit_output"
+  cdhit_call <- paste("cd-hit-est -i", fasta_file, "-o", cdhit_output, "-c", similarity, "-A", coverage, "-G 0 -n 3 -g 1 -r 0 -l 5 -sc 1")
+
+  system(command = cdhit_call, wait = TRUE)
+
+  # reformat cluster file
+  # columns: id, clstr, clstr_size, length, clstr_rep, clstr_iden, clstr_cov
+  cluster_file <- "reformated.clstr"
+  cluster_call <- paste("clstr2txt.pl", paste0(cdhit_output, ".clstr"), ">", cluster_file)
+
+  system(command = cluster_call, wait = TRUE)
+
+  # load reformatted file
+  cluster <- data.table::fread(cluster_file)
+
+  ### add cluster to bed_table
+  result <- merge(x = bed_table, y = cluster[, c("id", "clstr")], by.x = "V4", by.y = "id", sort = FALSE)[, union(names(bed_table), names(cluster)[2]), with = FALSE]
+
+  # delete files
+  if (clean) {
+    file.remove(fasta_file, paste0(cdhit_output, ".clstr"), cdhit_output, cluster_file)
+  }
+
+  data.table::fwrite(x = result, file = output, sep = "\t", col.names = FALSE)
+}
+
+# call function with given parameter if not in interactive context (e.g. run from shell)
+if (!interactive()) {
+  # remove last parameter (help param)
+  params <- opt[-length(opt)]
+  do.call(cdhitest, args = params)
+}

From 966761fd9af7d0568796874e67ffc6c86ced5860 Mon Sep 17 00:00:00 2001
From: Schultheis
Date: Tue, 4 Dec 2018 11:35:46 +0100
Subject: [PATCH 009/158] fixed cdhit wrapper description

---
 bin/cdhit_wrapper.R | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/bin/cdhit_wrapper.R b/bin/cdhit_wrapper.R
index fd2ce49..070cb62 100644
--- a/bin/cdhit_wrapper.R
+++ b/bin/cdhit_wrapper.R
@@ -11,7 +11,7 @@ option_list <- list(
 )
 
 opt_parser <- OptionParser(option_list = option_list,
-                           description = "Reduce sequences to frequent regions.")
+                           description = "CD-HIT-EST Wrapper function.")
 
 opt <- parse_args(opt_parser)
 

From 96efc463da90b5f26c8ea9a27f7b4182617c076a Mon Sep 17 00:00:00 2001
From: Schultheis
Date: Tue, 4 Dec 2018 12:54:27 +0100
Subject: [PATCH 010/158] remove sequences below threshold (not above)

---
 bin/reduce_bed.R | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/bin/reduce_bed.R b/bin/reduce_bed.R
index cfa9766..d73fb94 100644
--- a/bin/reduce_bed.R
+++ b/bin/reduce_bed.R
@@ -54,7 +54,7 @@ reduce_bed <- function(input, kmer = 10, motif = 10, output = "reduced.bed", thr
 
   # remove sequences below minimum length
   total_rows <- nrow(bed_table)
-  bed_table <- bed_table[nchar(sequence) < min_seq_length]
+  bed_table <- bed_table[nchar(sequence) > min_seq_length]
   if (total_rows > nrow(bed_table)) {
    message("Removed ", total_rows - nrow(bed_table), " sequence(s) below minimum length of ", min_seq_length)
  }

From 9bd64bebcc3b6327346f8f06a1f27e9142d44444 Mon Sep 17 00:00:00 2001
From: Schultheis
Date: Tue, 4 Dec 2018 13:18:44 +0100
Subject: [PATCH 011/158] show progressbar even in non-interactive context

---
 bin/reduce_bed.R | 2 ++
 1
file changed, 2 insertions(+) diff --git a/bin/reduce_bed.R b/bin/reduce_bed.R index d73fb94..4590e44 100644 --- a/bin/reduce_bed.R +++ b/bin/reduce_bed.R @@ -231,6 +231,8 @@ find_kmer_regions <- function(bed, kmer_counts, minoverlap = 1 , threads = NULL) # call function with given parameter if not in interactive context (e.g. run from shell) if (!interactive()) { + # show apply progressbar + pbo <- pbapply::pboptions(type = "timer") # remove last parameter (help param) params <- opt[-length(opt)] do.call(reduce_bed, args = params) From c29c46def83bca43a5d7d9a566f9588d56306e78 Mon Sep 17 00:00:00 2001 From: renewiegandt Date: Tue, 4 Dec 2018 13:29:25 +0100 Subject: [PATCH 012/158] Delete .gitignore --- .gitignore | 171 ----------------------------------------------------- 1 file changed, 171 deletions(-) delete mode 100644 .gitignore diff --git a/.gitignore b/.gitignore deleted file mode 100644 index 9f1b02e..0000000 --- a/.gitignore +++ /dev/null @@ -1,171 +0,0 @@ -# Created by .ignore support plugin (hsz.mobi) -### Python template -# Byte-compiled / optimized / DLL files -__pycache__/ -*.py[cod] -*$py.class - -# C extensions -*.so - -# Distribution / packaging -.Python -build/ -develop-eggs/ -dist/ -downloads/ -eggs/ -.eggs/ -lib/ -lib64/ -parts/ -sdist/ -var/ -wheels/ -*.egg-info/ -.installed.cfg -*.egg -MANIFEST - -# PyInstaller -# Usually these files are written by a python script from a template -# before PyInstaller builds the exe, so as to inject date/other infos into it. -*.manifest -*.spec - -# Installer logs -pip-log.txt -pip-delete-this-directory.txt - -# Unit test / coverage reports -htmlcov/ -.tox/ -.coverage -.coverage.* -.cache -nosetests.xml -coverage.xml -*.cover -.hypothesis/ -.pytest_cache/ - -# Translations -*.mo -*.pot - -# Django stuff: -*.log -local_settings.py -db.sqlite3 - -# Flask stuff: -instance/ -.webassets-cache - -# Scrapy stuff: -.scrapy - -# Sphinx documentation -docs/_build/ - -# PyBuilder -target/ - -# Jupyter Notebook -.ipynb_checkpoints - -# pyenv -.python-version - -# celery beat schedule file -celerybeat-schedule - -# SageMath parsed files -*.sage.py - -# Environments -.env -.venv -env/ -venv/ -ENV/ -env.bak/ -venv.bak/ - -# Spyder project settings -.spyderproject -.spyproject - -# Rope project settings -.ropeproject - -# mkdocs documentation -/site - -# mypy -.mypy_cache/ -### JetBrains template -# Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and WebStorm -# Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839 - -# User-specific stuff -.idea/**/workspace.xml -.idea/**/tasks.xml -.idea/**/usage.statistics.xml -.idea/**/dictionaries -.idea/**/shelf - -# Sensitive or high-churn files -.idea/**/dataSources/ -.idea/**/dataSources.ids -.idea/**/dataSources.local.xml -.idea/**/sqlDataSources.xml -.idea/**/dynamic.xml -.idea/**/uiDesigner.xml -.idea/**/dbnavigator.xml - -# Gradle -.idea/**/gradle.xml -.idea/**/libraries - -# Gradle and Maven with auto-import -# When using Gradle or Maven with auto-import, you should exclude module files, -# since they will be recreated, and may cause churn. Uncomment if using -# auto-import. 
-# .idea/modules.xml -# .idea/*.iml -# .idea/modules - -# CMake -cmake-build-*/ - -# Mongo Explorer plugin -.idea/**/mongoSettings.xml - -# File-based project format -*.iws - -# IntelliJ -out/ - -# mpeltonen/sbt-idea plugin -.idea_modules/ - -# JIRA plugin -atlassian-ide-plugin.xml - -# Cursive Clojure plugin -.idea/replstate.xml - -# Crashlytics plugin (for Android Studio and IntelliJ) -com_crashlytics_export_strings.xml -crashlytics.properties -crashlytics-build.properties -fabric.properties - -# Editor-based Rest Client -/.idea/ -/EnsemblData/release-94/ -/UCSCData/hg38.bed -/UCSCData/mm10.bed -/results/homo_sapiens_filtered.gtf From 0b1c5c75924efa2e652f2822fe1f877c62d20e0c Mon Sep 17 00:00:00 2001 From: basti Date: Tue, 4 Dec 2018 13:34:39 +0100 Subject: [PATCH 013/158] Changed config locations --- bin/Modules/Ensembl/ActivityCategorizer.py | 2 +- bin/Modules/ucsc/ucsc.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/bin/Modules/Ensembl/ActivityCategorizer.py b/bin/Modules/Ensembl/ActivityCategorizer.py index 8fee867..fa204ae 100644 --- a/bin/Modules/Ensembl/ActivityCategorizer.py +++ b/bin/Modules/Ensembl/ActivityCategorizer.py @@ -32,7 +32,7 @@ def get_categorization(self): def read_config(self, organism): c_dict = {} - path_to_config = os.path.join("./config/celltypes_"+organism+".json") + path_to_config = os.path.join("../config/celltypes_"+organism+".json") with open(path_to_config) as input: data = json.loads(input.read()) for x in data: diff --git a/bin/Modules/ucsc/ucsc.py b/bin/Modules/ucsc/ucsc.py index 4fbad78..9d10108 100644 --- a/bin/Modules/ucsc/ucsc.py +++ b/bin/Modules/ucsc/ucsc.py @@ -66,7 +66,7 @@ def get_organism_id(org): @staticmethod def get_activity_categories(organism): - path_to_config = os.path.join("./config/celltypes_" + organism + ".json") + path_to_config = os.path.join("../config/celltypes_" + organism + ".json") categories = {} with open(path_to_config) as input_file: data = json.loads(input_file.read()) From 1c22f750a47b5e8498dd21fce96237024055bbaf Mon Sep 17 00:00:00 2001 From: basti Date: Tue, 4 Dec 2018 13:38:01 +0100 Subject: [PATCH 014/158] Added Dev to new local branch --- .gitignore | 168 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 168 insertions(+) create mode 100644 .gitignore diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..bac574a --- /dev/null +++ b/.gitignore @@ -0,0 +1,168 @@ +# Created by .ignore support plugin (hsz.mobi) +### Python template +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. 
+*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +.hypothesis/ +.pytest_cache/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# pyenv +.python-version + +# celery beat schedule file +celerybeat-schedule + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +### JetBrains template +# Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and WebStorm +# Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839 + +# User-specific stuff +.idea/**/workspace.xml +.idea/**/tasks.xml +.idea/**/usage.statistics.xml +.idea/**/dictionaries +.idea/**/shelf + +# Sensitive or high-churn files +.idea/**/dataSources/ +.idea/**/dataSources.ids +.idea/**/dataSources.local.xml +.idea/**/sqlDataSources.xml +.idea/**/dynamic.xml +.idea/**/uiDesigner.xml +.idea/**/dbnavigator.xml + +# Gradle +.idea/**/gradle.xml +.idea/**/libraries + +# Gradle and Maven with auto-import +# When using Gradle or Maven with auto-import, you should exclude module files, +# since they will be recreated, and may cause churn. Uncomment if using +# auto-import. +# .idea/modules.xml +# .idea/*.iml +# .idea/modules + +# CMake +cmake-build-*/ + +# Mongo Explorer plugin +.idea/**/mongoSettings.xml + +# File-based project format +*.iws + +# IntelliJ +out/ + +# mpeltonen/sbt-idea plugin +.idea_modules/ + +# JIRA plugin +atlassian-ide-plugin.xml + +# Cursive Clojure plugin +.idea/replstate.xml + +# Crashlytics plugin (for Android Studio and IntelliJ) +com_crashlytics_export_strings.xml +crashlytics.properties +crashlytics-build.properties +fabric.properties + +# Editor-based Rest Client +.idea/ + From 0529318c1a57f32b3c31dc932a6a882fa031988f Mon Sep 17 00:00:00 2001 From: basti Date: Tue, 4 Dec 2018 13:44:23 +0100 Subject: [PATCH 015/158] changed resultpath --- bin/Modules/SaveResults.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bin/Modules/SaveResults.py b/bin/Modules/SaveResults.py index 2dddda7..e13aa6a 100644 --- a/bin/Modules/SaveResults.py +++ b/bin/Modules/SaveResults.py @@ -9,9 +9,9 @@ def __init__(self, results, organism, tissue): print("Save results to File !") self.path = "" if tissue: - self.path = os.path.join("./results/"+organism+"_filtered.gtf") + self.path = os.path.join("./"+organism+"_filtered.gtf") else: - self.path = os.path.join("./results/"+organism+".gtf") + self.path = os.path.join("./"+organism+".gtf") with open(self.path, "w") as file: write_it = csv.writer(file, delimiter='\t') From 8b2824e7a7977a33cf2a983bb9562adafe570fc4 Mon Sep 17 00:00:00 2001 From: basti Date: Tue, 4 Dec 2018 13:38:01 +0100 Subject: [PATCH 016/158] Added Dev to new local branch --- .gitignore | 168 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 168 insertions(+) create mode 100644 .gitignore diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..bac574a --- /dev/null +++ b/.gitignore @@ -0,0 +1,168 @@ +# Created by .ignore support plugin 
(hsz.mobi) +### Python template +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +.hypothesis/ +.pytest_cache/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# pyenv +.python-version + +# celery beat schedule file +celerybeat-schedule + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +### JetBrains template +# Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and WebStorm +# Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839 + +# User-specific stuff +.idea/**/workspace.xml +.idea/**/tasks.xml +.idea/**/usage.statistics.xml +.idea/**/dictionaries +.idea/**/shelf + +# Sensitive or high-churn files +.idea/**/dataSources/ +.idea/**/dataSources.ids +.idea/**/dataSources.local.xml +.idea/**/sqlDataSources.xml +.idea/**/dynamic.xml +.idea/**/uiDesigner.xml +.idea/**/dbnavigator.xml + +# Gradle +.idea/**/gradle.xml +.idea/**/libraries + +# Gradle and Maven with auto-import +# When using Gradle or Maven with auto-import, you should exclude module files, +# since they will be recreated, and may cause churn. Uncomment if using +# auto-import. +# .idea/modules.xml +# .idea/*.iml +# .idea/modules + +# CMake +cmake-build-*/ + +# Mongo Explorer plugin +.idea/**/mongoSettings.xml + +# File-based project format +*.iws + +# IntelliJ +out/ + +# mpeltonen/sbt-idea plugin +.idea_modules/ + +# JIRA plugin +atlassian-ide-plugin.xml + +# Cursive Clojure plugin +.idea/replstate.xml + +# Crashlytics plugin (for Android Studio and IntelliJ) +com_crashlytics_export_strings.xml +crashlytics.properties +crashlytics-build.properties +fabric.properties + +# Editor-based Rest Client +.idea/ + From 6cd598556ef0e0037ed2c1e0bc379703ada5dfdc Mon Sep 17 00:00:00 2001 From: Schultheis Date: Tue, 4 Dec 2018 13:46:10 +0100 Subject: [PATCH 017/158] added documentation --- bin/cdhit_wrapper.R | 6 +++--- bin/reduce_bed.R | 2 ++ 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/bin/cdhit_wrapper.R b/bin/cdhit_wrapper.R index 070cb62..ed5ea7f 100644 --- a/bin/cdhit_wrapper.R +++ b/bin/cdhit_wrapper.R @@ -18,8 +18,8 @@ opt <- parse_args(opt_parser) #' cd-hit wrapper #' #' @param input -#' @param similarity cdhit = -c -#' @param coverage In Nucleotides. cdhit = -A +#' @param similarity Similarity threshold. +#' @param coverage Minimal alignment length for both sequences in nucelotides. #' @param output Clustered bedfile. Adds cluster number in last column (lower number = bigger cluster). #' @param clean Clean up after run. 
#' @@ -42,7 +42,7 @@ cdhitest <- function(input, similarity = 0.8, coverage = 8, output = "cluster.be ### cd-hit-est cdhit_output <- "cdhit_output" - cdhit_call <- paste("cd-hit-est -i", fasta_file, "-o", cdhit_output, "-c", similarity, "-A", coverage, "-G 0 -n 3 -g 1 -r 0 -l 5 -sc 1") + cdhit_call <- paste("cd-hit-est -i", fasta_file, "-o", cdhit_output, "-c", similarity, "-A", coverage, "-G 0 -n 3 -g 1 -r 0 -l 5 -sc 1 -d 0") system(command = cdhit_call, wait = TRUE) diff --git a/bin/reduce_bed.R b/bin/reduce_bed.R index 4590e44..223c946 100644 --- a/bin/reduce_bed.R +++ b/bin/reduce_bed.R @@ -131,6 +131,8 @@ reduce_bed <- function(input, kmer = 10, motif = 10, output = "reduced.bed", thr #' @param kmer Length of kmer #' @param motif Length of motif #' @param minoverlap Minimum number of bases overlapping between kmer and motif. Must be <= motif & <= kmer. Defaults to ceiling(motif / 2). +#' +#' @return Number of interesting kmer. significant_kmer <- function(bed, kmer, motif, minoverlap = ceiling(motif / 2)) { if (minoverlap > kmer || minoverlap > motif) { stop("Kmer & motif must be greater or equal than minoverlap!") From 279c1963636d6910e8292d6484ee698271e41010 Mon Sep 17 00:00:00 2001 From: Schultheis Date: Tue, 4 Dec 2018 13:47:16 +0100 Subject: [PATCH 018/158] added cluster dependencies --- masterenv.yml | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/masterenv.yml b/masterenv.yml index 175f53a..72d2b64 100644 --- a/masterenv.yml +++ b/masterenv.yml @@ -12,3 +12,15 @@ dependencies: - r-seqinr - numpy - pybigWig +# cluster + - cd-hit + - jellyfish + - r-base>=3.5.1 + - r-data.table + # - r-seqinr + - r-pbapply + - r-igraph + - r-stringi + - r-stringr + - r-optparse + - bioconductor-iranges From 0a04f3019634d13a47d5d3e186e1fb5065d621cf Mon Sep 17 00:00:00 2001 From: basti Date: Tue, 4 Dec 2018 14:42:18 +0100 Subject: [PATCH 019/158] Changed Paths --- .gitignore | 168 ------------------ bin/Modules/Ensembl/ActivityCategorizer.py | 14 +- bin/Modules/Ensembl/ActivityTable.py | 4 +- bin/Modules/Ensembl/Ensembl.py | 18 +- .../Ensembl/FTPHandling/URLRetrieve.py | 2 +- .../Ensembl/FTPHandling/VersionChecker.py | 26 +-- bin/Modules/Ensembl/GTFGen.py | 8 +- bin/Modules/ucsc/ucsc.py | 26 +-- bin/RegGTFExtractor.py | 9 +- 9 files changed, 49 insertions(+), 226 deletions(-) delete mode 100644 .gitignore diff --git a/.gitignore b/.gitignore deleted file mode 100644 index bac574a..0000000 --- a/.gitignore +++ /dev/null @@ -1,168 +0,0 @@ -# Created by .ignore support plugin (hsz.mobi) -### Python template -# Byte-compiled / optimized / DLL files -__pycache__/ -*.py[cod] -*$py.class - -# C extensions -*.so - -# Distribution / packaging -.Python -build/ -develop-eggs/ -dist/ -downloads/ -eggs/ -.eggs/ -lib/ -lib64/ -parts/ -sdist/ -var/ -wheels/ -*.egg-info/ -.installed.cfg -*.egg -MANIFEST - -# PyInstaller -# Usually these files are written by a python script from a template -# before PyInstaller builds the exe, so as to inject date/other infos into it. 
-*.manifest -*.spec - -# Installer logs -pip-log.txt -pip-delete-this-directory.txt - -# Unit test / coverage reports -htmlcov/ -.tox/ -.coverage -.coverage.* -.cache -nosetests.xml -coverage.xml -*.cover -.hypothesis/ -.pytest_cache/ - -# Translations -*.mo -*.pot - -# Django stuff: -*.log -local_settings.py -db.sqlite3 - -# Flask stuff: -instance/ -.webassets-cache - -# Scrapy stuff: -.scrapy - -# Sphinx documentation -docs/_build/ - -# PyBuilder -target/ - -# Jupyter Notebook -.ipynb_checkpoints - -# pyenv -.python-version - -# celery beat schedule file -celerybeat-schedule - -# SageMath parsed files -*.sage.py - -# Environments -.env -.venv -env/ -venv/ -ENV/ -env.bak/ -venv.bak/ - -# Spyder project settings -.spyderproject -.spyproject - -# Rope project settings -.ropeproject - -# mkdocs documentation -/site - -# mypy -.mypy_cache/ -### JetBrains template -# Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and WebStorm -# Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839 - -# User-specific stuff -.idea/**/workspace.xml -.idea/**/tasks.xml -.idea/**/usage.statistics.xml -.idea/**/dictionaries -.idea/**/shelf - -# Sensitive or high-churn files -.idea/**/dataSources/ -.idea/**/dataSources.ids -.idea/**/dataSources.local.xml -.idea/**/sqlDataSources.xml -.idea/**/dynamic.xml -.idea/**/uiDesigner.xml -.idea/**/dbnavigator.xml - -# Gradle -.idea/**/gradle.xml -.idea/**/libraries - -# Gradle and Maven with auto-import -# When using Gradle or Maven with auto-import, you should exclude module files, -# since they will be recreated, and may cause churn. Uncomment if using -# auto-import. -# .idea/modules.xml -# .idea/*.iml -# .idea/modules - -# CMake -cmake-build-*/ - -# Mongo Explorer plugin -.idea/**/mongoSettings.xml - -# File-based project format -*.iws - -# IntelliJ -out/ - -# mpeltonen/sbt-idea plugin -.idea_modules/ - -# JIRA plugin -atlassian-ide-plugin.xml - -# Cursive Clojure plugin -.idea/replstate.xml - -# Crashlytics plugin (for Android Studio and IntelliJ) -com_crashlytics_export_strings.xml -crashlytics.properties -crashlytics-build.properties -fabric.properties - -# Editor-based Rest Client -.idea/ - diff --git a/bin/Modules/Ensembl/ActivityCategorizer.py b/bin/Modules/Ensembl/ActivityCategorizer.py index fa204ae..e29ac4b 100644 --- a/bin/Modules/Ensembl/ActivityCategorizer.py +++ b/bin/Modules/Ensembl/ActivityCategorizer.py @@ -4,7 +4,7 @@ class ActivityCategorizer: - def __init__(self, release, organism): + def __init__(self, release, organism, wd): # List of all Folders with Activity Tables @@ -12,13 +12,13 @@ def __init__(self, release, organism): # Dictionary from celltypes_organism.json mit key = Kategorie und Value = [Ordner] - self.c_dict = self.read_config(organism) + self.c_dict = self.read_config(organism, wd) # Activity table from all files as dict self.activity = {} - self.get_activity_data(release, organism) + self.get_activity_data(release, organism, wd) # Categorized Activity from Json-config print("Categorization: This may take a while") @@ -29,10 +29,10 @@ def __init__(self, release, organism): def get_categorization(self): return self.categorization - def read_config(self, organism): + def read_config(self, organism, wd): c_dict = {} - path_to_config = os.path.join("../config/celltypes_"+organism+".json") + path_to_config = os.path.join(wd, "../config/celltypes_"+organism+".json") with open(path_to_config) as input: data = json.loads(input.read()) for x in data: @@ -41,11 +41,11 @@ def 
read_config(self, organism): return c_dict - def get_activity_data(self, release, organism): + def get_activity_data(self, release, organism, wd): for folder in self.folderlist: # Generate path to binary File - file = os.path.join("./EnsemblData", release, organism, "activity", folder, "table.bin") + file = os.path.join(wd, "/EnsemblData", release, organism, "activity", folder, "table.bin") with open(file, "rb") as tables: self.activity[folder] = bytearray(tables.read()) diff --git a/bin/Modules/Ensembl/ActivityTable.py b/bin/Modules/Ensembl/ActivityTable.py index 042421a..105c351 100644 --- a/bin/Modules/Ensembl/ActivityTable.py +++ b/bin/Modules/Ensembl/ActivityTable.py @@ -15,8 +15,8 @@ class ActivityTable: 4, "activity=NA" """ - def __init__(self, organism, current_release): - self.link = os.path.join("./EnsemblData/", current_release, organism, "activity") + def __init__(self, organism, current_release, wd): + self.link = os.path.join(wd, "/EnsemblData/", current_release, organism, "activity") self.folders = next(os.walk(self.link))[1] self.generator = ATGenerator(["activity=ACTIVE", "activity=POISED", diff --git a/bin/Modules/Ensembl/Ensembl.py b/bin/Modules/Ensembl/Ensembl.py index df9cbc0..d80ce11 100644 --- a/bin/Modules/Ensembl/Ensembl.py +++ b/bin/Modules/Ensembl/Ensembl.py @@ -1,20 +1,20 @@ -from Modules.Ensembl.ActivityTable import ActivityTable -from Modules.Ensembl.FTPHandling.VersionChecker import EnsemblRegulationFTPRetriever as FTPRetriever -from Modules.Ensembl.ActivityCategorizer import ActivityCategorizer -from Modules.Ensembl.GTFGen import GTFGen +from bin.Modules.Ensembl.ActivityTable import ActivityTable +from bin.Modules.Ensembl.FTPHandling.VersionChecker import EnsemblRegulationFTPRetriever as FTPRetriever +from bin.Modules.Ensembl.ActivityCategorizer import ActivityCategorizer +from bin.Modules.Ensembl.GTFGen import GTFGen class Ensembl: - def __init__(self, organism): + def __init__(self, organism, wd): print("Starting Ensembl") - self.updater = FTPRetriever(organism) + self.updater = FTPRetriever(organism, wd) self.release = self.updater.get_release() - self.acttable = ActivityTable(organism, self.release) + self.acttable = ActivityTable(organism, self.release, wd) self.acttable.check_and_generate_activity_table() - self.categorizer = ActivityCategorizer(self.release, organism) + self.categorizer = ActivityCategorizer(self.release, organism, wd) print("Generating GTF") - self.gtf_generator = GTFGen(organism, self.release) + self.gtf_generator = GTFGen(organism, self.release, wd) print("Ensembl Finished !") diff --git a/bin/Modules/Ensembl/FTPHandling/URLRetrieve.py b/bin/Modules/Ensembl/FTPHandling/URLRetrieve.py index 598a6f5..0c164d7 100644 --- a/bin/Modules/Ensembl/FTPHandling/URLRetrieve.py +++ b/bin/Modules/Ensembl/FTPHandling/URLRetrieve.py @@ -1,5 +1,5 @@ import ftplib -from Modules.Ensembl.FTPHandling.FTPEntry import FTPEntry +from bin.Modules.Ensembl.FTPHandling.FTPEntry import FTPEntry class FTPHandler: diff --git a/bin/Modules/Ensembl/FTPHandling/VersionChecker.py b/bin/Modules/Ensembl/FTPHandling/VersionChecker.py index 874e776..6992dde 100644 --- a/bin/Modules/Ensembl/FTPHandling/VersionChecker.py +++ b/bin/Modules/Ensembl/FTPHandling/VersionChecker.py @@ -1,4 +1,4 @@ -from Modules.Ensembl.FTPHandling.URLRetrieve import FTPHandler +from bin.Modules.Ensembl.FTPHandling.URLRetrieve import FTPHandler import os.path @@ -9,12 +9,12 @@ class EnsemblRegulationFTPRetriever: And downloading newest version if necessary """ - def __init__(self, organism): + 
def __init__(self, organism, wd): self.site_ftp = FTPHandler("ftp.ensembl.org", "pub") self.remoteversion = self.get_current_ftp_version() - self.localversion = self.get_current_local_version() - if self.check_version_difference(organism): - self.download_currentversion_version(self.remoteversion, organism) + self.localversion = self.get_current_local_version(wd) + if self.check_version_difference(organism, wd): + self.download_currentversion_version(self.remoteversion, organism, wd) else: print("Newest Version installed, no update needed.") @@ -31,15 +31,15 @@ def get_current_ftp_version(self): print("Current release is "+c_release) return c_release - def check_organism(self, organism, release): - if organism in next(os.walk("./EnsemblData/"+release+"/"))[1]: + def check_organism(self, organism, release, wd): + if organism in next(os.walk(os.path.join(wd, "/EnsemblData/"+release+"/")))[1]: return False else: print("No Local Version for "+organism+" installed. Installing...") return True - def get_current_local_version(self): - directories = next(os.walk("./EnsemblData/"))[1] + def get_current_local_version(self, wd): + directories = next(os.walk(os.path.join(wd, "/EnsemblData/")))[1] for dir in directories: if "release" in dir: localversion = sorted(directories, reverse=True)[0] @@ -51,7 +51,7 @@ def get_current_local_version(self): print("No Version installed !") return None - def check_version_difference(self, organism): + def check_version_difference(self, organism, wd): local_version = self.localversion remote_version = self.remoteversion @@ -64,16 +64,16 @@ def check_version_difference(self, organism): print("Outdated Version detected ! local: " + local_version + " remote: " + remote_version) return True else: - if self.check_organism(organism, local_version): + if self.check_organism(organism, local_version, wd): return True else: return False - def download_currentversion_version(self, version, organism): + def download_currentversion_version(self, version, organism, wd): # Download Base File - targetfolder = os.path.join("./EnsemblData/", version, organism) + targetfolder = os.path.join(wd, "/EnsemblData/", version, organism) os.makedirs(targetfolder) folder_url = "/pub/"+version+"/regulation/"+organism+"/" self.site_ftp.change_dir(folder_url) diff --git a/bin/Modules/Ensembl/GTFGen.py b/bin/Modules/Ensembl/GTFGen.py index a4de58a..381bef3 100644 --- a/bin/Modules/Ensembl/GTFGen.py +++ b/bin/Modules/Ensembl/GTFGen.py @@ -5,14 +5,14 @@ class GTFGen: - def __init__(self, organism, release): + def __init__(self, organism, release, wd): - self.gff_lines = self.get_organism_as_gff(organism, release) + self.gff_lines = self.get_organism_as_gff(organism, release, wd) self.value_map = {0: "ACTIVE", 1: "POISED", 2: "REPRESSED", 3: "INACTIVE", 4: "NA"} - def get_organism_as_gff(self, organism, release): + def get_organism_as_gff(self, organism, release, wd): - directory = os.path.join("./EnsemblData/", release, organism) + directory = os.path.join(wd, "/EnsemblData/", release, organism) inputfile = "" for file in os.listdir(directory): if file.endswith("gff.gz"): diff --git a/bin/Modules/ucsc/ucsc.py b/bin/Modules/ucsc/ucsc.py index 9d10108..99788b7 100644 --- a/bin/Modules/ucsc/ucsc.py +++ b/bin/Modules/ucsc/ucsc.py @@ -7,18 +7,19 @@ class UcscGtf: - def __init__(self, org): + def __init__(self, org, wd): self.organism_id = self.get_organism_id(org) self.link = "http://hgdownload.soe.ucsc.edu/gbdb/"+self.organism_id+"/ncbiRefSeq/refSeqFuncElems.bb" - self.output = 
"./UCSCData/"+self.organism_id+".bed" + self.output = os.path.join(wd,"/UCSCData/"+self.organism_id+".bed") + self.path_to_bin = os.path.join(wd, "/Modules/ucsc/bigBedToBed") print("Getting UCSC Data") self.generate_gff_file() - self.ucsc_categories = self.get_activity_categories(org) + self.ucsc_categories = self.get_activity_categories(org, wd) self.gtf_lines = self.read_gff_to_gtf() print("UCSC finished !") def generate_gff_file(self): - callstring = ["./Modules/ucsc/bigBedToBed", self.link, self.output] + callstring = [self.path_to_bin, self.link, self.output] subprocess.call(callstring) def read_gff_to_gtf(self): @@ -65,8 +66,8 @@ def get_organism_id(org): return "mm10" @staticmethod - def get_activity_categories(organism): - path_to_config = os.path.join("../config/celltypes_" + organism + ".json") + def get_activity_categories(organism, wd): + path_to_config = os.path.join(wd, "/config/celltypes_" + organism + ".json") categories = {} with open(path_to_config) as input_file: data = json.loads(input_file.read()) @@ -76,15 +77,4 @@ def get_activity_categories(organism): return categories def get_gtf(self): - return self.gtf_lines - - def test_save_to_file(self): - - with open("./results/test.gtf", "w") as file: - write_it = csv.writer(file, delimiter='\t') - for line in self.gtf_lines: - write_it.writerow(line) - - -#u = UcscGtf("homo_sapiens") -#u.test_save_to_file() \ No newline at end of file + return self.gtf_lines \ No newline at end of file diff --git a/bin/RegGTFExtractor.py b/bin/RegGTFExtractor.py index 294e539..c3f669b 100644 --- a/bin/RegGTFExtractor.py +++ b/bin/RegGTFExtractor.py @@ -18,11 +18,11 @@ def check_for_local_folder(): os.mkdir( "./UCSCData" ) -def main_script(org, tissuetype=None): +def main_script(org, wd, tissuetype=None): check_for_local_folder() - ucsc = UcscGtf(org) - ense = Ensembl(org) + ucsc = UcscGtf(org, wd) + ense = Ensembl(org, wd) print("Getting Unique Results") unique_filter = UniqueFilter(ense.get_gtf(), ucsc.get_gtf(), tissuetype) ResultSaver(unique_filter.get_results(), org, tissuetype) @@ -32,10 +32,11 @@ def main_script(org, tissuetype=None): parser = argparse.ArgumentParser(description='GTF-Generator from UCSC Table Browser and Ensembl Regulatory Build' ) parser.add_argument('organism', help='Source organism [ homo_sapiens or mus_musculus ]', action='store', nargs='?', type=str) parser.add_argument('--tissue', help='Tissue- or Celltype(s)', action='store', nargs='*', type=str) + parser.add_argument('--wd', help='Working directory. 
default: "."', default=".", action='store', type=str) args = vars(parser.parse_args()) if args["organism"]: #print(args["tissue"]) - main_script(args["organism"], args["tissue"]) + main_script(args["organism"], args["wd"], args["tissue"]) else: print("No Arguments found -> See ./RegGTFExtractor.py -h for help.") From 49c356b55bdfe6a21de6097bf9126c2b1acb6c5f Mon Sep 17 00:00:00 2001 From: basti Date: Tue, 4 Dec 2018 14:43:54 +0100 Subject: [PATCH 020/158] Changed Paths --- bin/Modules/Ensembl/ActivityCategorizer.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bin/Modules/Ensembl/ActivityCategorizer.py b/bin/Modules/Ensembl/ActivityCategorizer.py index e29ac4b..d8dfa42 100644 --- a/bin/Modules/Ensembl/ActivityCategorizer.py +++ b/bin/Modules/Ensembl/ActivityCategorizer.py @@ -33,8 +33,8 @@ def read_config(self, organism, wd): c_dict = {} path_to_config = os.path.join(wd, "../config/celltypes_"+organism+".json") - with open(path_to_config) as input: - data = json.loads(input.read()) + with open(path_to_config) as input_file: + data = json.loads(input_file.read()) for x in data: c_dict[x["type"]] = x["alias_ensembl"] self.folderlist.extend(x["alias_ensembl"]) From b25f97c68b4e5d22cd9c07695fee007fea6ce30a Mon Sep 17 00:00:00 2001 From: basti Date: Tue, 4 Dec 2018 14:47:39 +0100 Subject: [PATCH 021/158] Changed Paths --- bin/Modules/Ensembl/ActivityCategorizer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/Modules/Ensembl/ActivityCategorizer.py b/bin/Modules/Ensembl/ActivityCategorizer.py index d8dfa42..f1973c5 100644 --- a/bin/Modules/Ensembl/ActivityCategorizer.py +++ b/bin/Modules/Ensembl/ActivityCategorizer.py @@ -32,7 +32,7 @@ def get_categorization(self): def read_config(self, organism, wd): c_dict = {} - path_to_config = os.path.join(wd, "../config/celltypes_"+organism+".json") + path_to_config = os.path.join(wd, "/config/celltypes_"+organism+".json") with open(path_to_config) as input_file: data = json.loads(input_file.read()) for x in data: From afe9eb2bdb341b5c67219c3427ff879ba0b1f402 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ren=C3=A9=20Wiegandt?= Date: Tue, 4 Dec 2018 08:47:52 -0500 Subject: [PATCH 022/158] minor changes --- nextflow.config | 4 ---- pipeline.nf | 2 ++ 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/nextflow.config b/nextflow.config index 6f44ba3..2e84338 100644 --- a/nextflow.config +++ b/nextflow.config @@ -1,9 +1,5 @@ createTimeout = 60 -params { - -} - env { path_bin = $workflow.projectDir/bin path_env = $workflow.projectDir/masterenv.yml diff --git a/pipeline.nf b/pipeline.nf index a8e15dc..d8e2669 100644 --- a/pipeline.nf +++ b/pipeline.nf @@ -12,6 +12,8 @@ process footprint_extraction { conda "${path_env}" tag{name} + publishDir '${out}', mode: 'copy', pattern: '*.bed' + publishDir '/mnt/agnerds/Rene.Wiegandt/log', mode: 'copy', pattern: '*.log' input: set name, file (bigWig), file (bed) from footprint_in From a53aa722fea52ae5f1404cab62f987a0fe77a5f5 Mon Sep 17 00:00:00 2001 From: basti Date: Tue, 4 Dec 2018 14:51:17 +0100 Subject: [PATCH 023/158] Changed Paths --- bin/Modules/ucsc/ucsc.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/Modules/ucsc/ucsc.py b/bin/Modules/ucsc/ucsc.py index 99788b7..d48ff1a 100644 --- a/bin/Modules/ucsc/ucsc.py +++ b/bin/Modules/ucsc/ucsc.py @@ -10,7 +10,7 @@ class UcscGtf: def __init__(self, org, wd): self.organism_id = self.get_organism_id(org) self.link = 
"http://hgdownload.soe.ucsc.edu/gbdb/"+self.organism_id+"/ncbiRefSeq/refSeqFuncElems.bb" - self.output = os.path.join(wd,"/UCSCData/"+self.organism_id+".bed") + self.output = os.path.join(wd, "/UCSCData/"+self.organism_id+".bed") self.path_to_bin = os.path.join(wd, "/Modules/ucsc/bigBedToBed") print("Getting UCSC Data") self.generate_gff_file() From 1285f1a7c819338caab6465aedc2106015cf11aa Mon Sep 17 00:00:00 2001 From: basti Date: Tue, 4 Dec 2018 14:53:15 +0100 Subject: [PATCH 024/158] Changed Paths --- bin/RegGTFExtractor.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/bin/RegGTFExtractor.py b/bin/RegGTFExtractor.py index c3f669b..373f435 100644 --- a/bin/RegGTFExtractor.py +++ b/bin/RegGTFExtractor.py @@ -8,14 +8,14 @@ import os -def check_for_local_folder(): +def check_for_local_folder(wd): - if not os.path.isdir("./EnsemblData"): + if not os.path.isdir(os.path.join(wd, "/EnsemblData")): - os.mkdir("./EnsemblData") + os.mkdir(os.path.join(wd, "/EnsemblData")) - if not os.path.isdir( "./UCSCData" ): - os.mkdir( "./UCSCData" ) + if not os.path.isdir(os.path.join(wd, "/UCSCData" )): + os.mkdir(os.path.join(wd, "/UCSCData" )) def main_script(org, wd, tissuetype=None): From 512d262368692ccdd0acf23a075462a551612256 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ren=C3=A9=20Wiegandt?= Date: Tue, 4 Dec 2018 08:55:52 -0500 Subject: [PATCH 025/158] moved config for gtf creation --- {config => bin/config}/celltypes_homo_sapiens.json | 0 {config => bin/config}/celltypes_mus_musculus.json | 0 2 files changed, 0 insertions(+), 0 deletions(-) rename {config => bin/config}/celltypes_homo_sapiens.json (100%) rename {config => bin/config}/celltypes_mus_musculus.json (100%) diff --git a/config/celltypes_homo_sapiens.json b/bin/config/celltypes_homo_sapiens.json similarity index 100% rename from config/celltypes_homo_sapiens.json rename to bin/config/celltypes_homo_sapiens.json diff --git a/config/celltypes_mus_musculus.json b/bin/config/celltypes_mus_musculus.json similarity index 100% rename from config/celltypes_mus_musculus.json rename to bin/config/celltypes_mus_musculus.json From 4c3f0139e90993883f94b787c1a36721fc5eb3b8 Mon Sep 17 00:00:00 2001 From: basti Date: Tue, 4 Dec 2018 15:00:56 +0100 Subject: [PATCH 026/158] Fixed: "No Module named bin Error" --- bin/Modules/Ensembl/Ensembl.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/bin/Modules/Ensembl/Ensembl.py b/bin/Modules/Ensembl/Ensembl.py index d80ce11..f0a922c 100644 --- a/bin/Modules/Ensembl/Ensembl.py +++ b/bin/Modules/Ensembl/Ensembl.py @@ -1,7 +1,7 @@ -from bin.Modules.Ensembl.ActivityTable import ActivityTable -from bin.Modules.Ensembl.FTPHandling.VersionChecker import EnsemblRegulationFTPRetriever as FTPRetriever -from bin.Modules.Ensembl.ActivityCategorizer import ActivityCategorizer -from bin.Modules.Ensembl.GTFGen import GTFGen +from Modules.Ensembl.ActivityTable import ActivityTable +from Modules.Ensembl.FTPHandling.VersionChecker import EnsemblRegulationFTPRetriever as FTPRetriever +from Modules.Ensembl.ActivityCategorizer import ActivityCategorizer +from Modules.Ensembl.GTFGen import GTFGen class Ensembl: From a0165a9e6cb645df35d4838acfcf8b94ac2d56bd Mon Sep 17 00:00:00 2001 From: basti Date: Tue, 4 Dec 2018 15:03:40 +0100 Subject: [PATCH 027/158] Fixed: "No Module named bin Error" V2 --- bin/Modules/Ensembl/FTPHandling/URLRetrieve.py | 2 +- bin/Modules/Ensembl/FTPHandling/VersionChecker.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git 
a/bin/Modules/Ensembl/FTPHandling/URLRetrieve.py b/bin/Modules/Ensembl/FTPHandling/URLRetrieve.py index 0c164d7..598a6f5 100644 --- a/bin/Modules/Ensembl/FTPHandling/URLRetrieve.py +++ b/bin/Modules/Ensembl/FTPHandling/URLRetrieve.py @@ -1,5 +1,5 @@ import ftplib -from bin.Modules.Ensembl.FTPHandling.FTPEntry import FTPEntry +from Modules.Ensembl.FTPHandling.FTPEntry import FTPEntry class FTPHandler: diff --git a/bin/Modules/Ensembl/FTPHandling/VersionChecker.py b/bin/Modules/Ensembl/FTPHandling/VersionChecker.py index 6992dde..26c284d 100644 --- a/bin/Modules/Ensembl/FTPHandling/VersionChecker.py +++ b/bin/Modules/Ensembl/FTPHandling/VersionChecker.py @@ -1,4 +1,4 @@ -from bin.Modules.Ensembl.FTPHandling.URLRetrieve import FTPHandler +from Modules.Ensembl.FTPHandling.URLRetrieve import FTPHandler import os.path From 396e18354fb83d9dc7f6bee901f413ecde7e722d Mon Sep 17 00:00:00 2001 From: basti Date: Tue, 4 Dec 2018 15:05:19 +0100 Subject: [PATCH 028/158] Fixed: "No Module named bin Error" V2 --- bin/RegGTFExtractor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/RegGTFExtractor.py b/bin/RegGTFExtractor.py index 373f435..a99c708 100644 --- a/bin/RegGTFExtractor.py +++ b/bin/RegGTFExtractor.py @@ -20,7 +20,7 @@ def check_for_local_folder(wd): def main_script(org, wd, tissuetype=None): - check_for_local_folder() + check_for_local_folder(wd) ucsc = UcscGtf(org, wd) ense = Ensembl(org, wd) print("Getting Unique Results") From 2dee4f634d768d6aff344e0c1bf1b4bf1d84a146 Mon Sep 17 00:00:00 2001 From: basti Date: Tue, 4 Dec 2018 15:15:12 +0100 Subject: [PATCH 029/158] Fixed: "No Module named bin Error" V2 --- bin/Modules/ucsc/ucsc.py | 1 + 1 file changed, 1 insertion(+) diff --git a/bin/Modules/ucsc/ucsc.py b/bin/Modules/ucsc/ucsc.py index d48ff1a..1c29337 100644 --- a/bin/Modules/ucsc/ucsc.py +++ b/bin/Modules/ucsc/ucsc.py @@ -13,6 +13,7 @@ def __init__(self, org, wd): self.output = os.path.join(wd, "/UCSCData/"+self.organism_id+".bed") self.path_to_bin = os.path.join(wd, "/Modules/ucsc/bigBedToBed") print("Getting UCSC Data") + print("Path to Bin: " + self.path_to_bin) self.generate_gff_file() self.ucsc_categories = self.get_activity_categories(org, wd) self.gtf_lines = self.read_gff_to_gtf() From 38c9bae54f0463df5c5f59d333a5726c291119da Mon Sep 17 00:00:00 2001 From: basti Date: Tue, 4 Dec 2018 15:20:19 +0100 Subject: [PATCH 030/158] Fixed: "No Module named bin Error" V2 --- bin/Modules/SaveResults.py | 6 +++--- bin/RegGTFExtractor.py | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/bin/Modules/SaveResults.py b/bin/Modules/SaveResults.py index e13aa6a..09c4955 100644 --- a/bin/Modules/SaveResults.py +++ b/bin/Modules/SaveResults.py @@ -4,14 +4,14 @@ class ResultSaver: - def __init__(self, results, organism, tissue): + def __init__(self, results, organism, tissue, wd): print("Save results to File !") self.path = "" if tissue: - self.path = os.path.join("./"+organism+"_filtered.gtf") + self.path = os.path.join(wd, "/"+organism+"_filtered.gtf") else: - self.path = os.path.join("./"+organism+".gtf") + self.path = os.path.join(wd, "./"+organism+".gtf") with open(self.path, "w") as file: write_it = csv.writer(file, delimiter='\t') diff --git a/bin/RegGTFExtractor.py b/bin/RegGTFExtractor.py index a99c708..20f2828 100644 --- a/bin/RegGTFExtractor.py +++ b/bin/RegGTFExtractor.py @@ -25,17 +25,17 @@ def main_script(org, wd, tissuetype=None): ense = Ensembl(org, wd) print("Getting Unique Results") unique_filter = UniqueFilter(ense.get_gtf(), ucsc.get_gtf(), 
tissuetype) - ResultSaver(unique_filter.get_results(), org, tissuetype) + ResultSaver(unique_filter.get_results(), org, tissuetype, wd) if __name__ == '__main__': parser = argparse.ArgumentParser(description='GTF-Generator from UCSC Table Browser and Ensembl Regulatory Build' ) parser.add_argument('organism', help='Source organism [ homo_sapiens or mus_musculus ]', action='store', nargs='?', type=str) parser.add_argument('--tissue', help='Tissue- or Celltype(s)', action='store', nargs='*', type=str) - parser.add_argument('--wd', help='Working directory. default: "."', default=".", action='store', type=str) + parser.add_argument('--wd', help='Working directory. default: "."', action='store', nargs='*', type=str) args = vars(parser.parse_args()) if args["organism"]: - #print(args["tissue"]) + print("Working Dir: " + args["wd"]) main_script(args["organism"], args["wd"], args["tissue"]) else: print("No Arguments found -> See ./RegGTFExtractor.py -h for help.") From 8c843fb670623b941ad89aa72d60a68d40ef34f8 Mon Sep 17 00:00:00 2001 From: basti Date: Tue, 4 Dec 2018 15:24:03 +0100 Subject: [PATCH 031/158] Fixed: "No Module named bin Error" V2 --- bin/RegGTFExtractor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/RegGTFExtractor.py b/bin/RegGTFExtractor.py index 20f2828..27a6255 100644 --- a/bin/RegGTFExtractor.py +++ b/bin/RegGTFExtractor.py @@ -32,7 +32,7 @@ def main_script(org, wd, tissuetype=None): parser = argparse.ArgumentParser(description='GTF-Generator from UCSC Table Browser and Ensembl Regulatory Build' ) parser.add_argument('organism', help='Source organism [ homo_sapiens or mus_musculus ]', action='store', nargs='?', type=str) parser.add_argument('--tissue', help='Tissue- or Celltype(s)', action='store', nargs='*', type=str) - parser.add_argument('--wd', help='Working directory. default: "."', action='store', nargs='*', type=str) + parser.add_argument('--wd', help='Working directory. 
default: "."', action='store', default='.', type=str) args = vars(parser.parse_args()) if args["organism"]: print("Working Dir: " + args["wd"]) From 0f2822d840c6be29c088c1c58ecfa544a8f74819 Mon Sep 17 00:00:00 2001 From: renewiegandt Date: Tue, 4 Dec 2018 15:27:58 +0100 Subject: [PATCH 032/158] Update masterenv.yml --- masterenv.yml | 5 ----- 1 file changed, 5 deletions(-) diff --git a/masterenv.yml b/masterenv.yml index 72d2b64..2c1879f 100644 --- a/masterenv.yml +++ b/masterenv.yml @@ -3,21 +3,16 @@ name: masterenv channels: - bioconda - conda-forge - - defaults - - r dependencies: - bedtools - python - - r-essentials - r-seqinr - numpy - pybigWig -# cluster - cd-hit - jellyfish - r-base>=3.5.1 - r-data.table - # - r-seqinr - r-pbapply - r-igraph - r-stringi From bbf21175f0e0a4b48b7ff138327b3d149c40dc3c Mon Sep 17 00:00:00 2001 From: renewiegandt Date: Tue, 4 Dec 2018 15:29:56 +0100 Subject: [PATCH 033/158] Update masterenv.yml --- masterenv.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/masterenv.yml b/masterenv.yml index 2c1879f..846cb71 100644 --- a/masterenv.yml +++ b/masterenv.yml @@ -9,6 +9,7 @@ dependencies: - r-seqinr - numpy - pybigWig + #clustering - cd-hit - jellyfish - r-base>=3.5.1 @@ -19,3 +20,4 @@ dependencies: - r-stringr - r-optparse - bioconductor-iranges +# From d22a0e130f321a0523a1f90023185177394c729b Mon Sep 17 00:00:00 2001 From: basti Date: Tue, 4 Dec 2018 15:34:04 +0100 Subject: [PATCH 034/158] Fixed: "Paths" V2 --- bin/Modules/Ensembl/ActivityCategorizer.py | 4 ++-- bin/Modules/Ensembl/ActivityTable.py | 2 +- bin/Modules/Ensembl/FTPHandling/VersionChecker.py | 6 +++--- bin/Modules/Ensembl/GTFGen.py | 2 +- bin/Modules/SaveResults.py | 4 ++-- bin/Modules/ucsc/ucsc.py | 8 ++++---- bin/RegGTFExtractor.py | 8 ++++---- 7 files changed, 17 insertions(+), 17 deletions(-) diff --git a/bin/Modules/Ensembl/ActivityCategorizer.py b/bin/Modules/Ensembl/ActivityCategorizer.py index f1973c5..3ef0258 100644 --- a/bin/Modules/Ensembl/ActivityCategorizer.py +++ b/bin/Modules/Ensembl/ActivityCategorizer.py @@ -32,7 +32,7 @@ def get_categorization(self): def read_config(self, organism, wd): c_dict = {} - path_to_config = os.path.join(wd, "/config/celltypes_"+organism+".json") + path_to_config = os.path.join(wd +"/config/celltypes_"+organism+".json") with open(path_to_config) as input_file: data = json.loads(input_file.read()) for x in data: @@ -45,7 +45,7 @@ def get_activity_data(self, release, organism, wd): for folder in self.folderlist: # Generate path to binary File - file = os.path.join(wd, "/EnsemblData", release, organism, "activity", folder, "table.bin") + file = os.path.join(wd + "/EnsemblData", release, organism, "activity", folder, "table.bin") with open(file, "rb") as tables: self.activity[folder] = bytearray(tables.read()) diff --git a/bin/Modules/Ensembl/ActivityTable.py b/bin/Modules/Ensembl/ActivityTable.py index 105c351..34f4719 100644 --- a/bin/Modules/Ensembl/ActivityTable.py +++ b/bin/Modules/Ensembl/ActivityTable.py @@ -16,7 +16,7 @@ class ActivityTable: """ def __init__(self, organism, current_release, wd): - self.link = os.path.join(wd, "/EnsemblData/", current_release, organism, "activity") + self.link = os.path.join(wd + "/EnsemblData/", current_release, organism, "activity") self.folders = next(os.walk(self.link))[1] self.generator = ATGenerator(["activity=ACTIVE", "activity=POISED", diff --git a/bin/Modules/Ensembl/FTPHandling/VersionChecker.py b/bin/Modules/Ensembl/FTPHandling/VersionChecker.py index 26c284d..9c3500d 100644 --- 
a/bin/Modules/Ensembl/FTPHandling/VersionChecker.py +++ b/bin/Modules/Ensembl/FTPHandling/VersionChecker.py @@ -32,14 +32,14 @@ def get_current_ftp_version(self): return c_release def check_organism(self, organism, release, wd): - if organism in next(os.walk(os.path.join(wd, "/EnsemblData/"+release+"/")))[1]: + if organism in next(os.walk(os.path.join(wd+"/EnsemblData/"+release+"/")))[1]: return False else: print("No Local Version for "+organism+" installed. Installing...") return True def get_current_local_version(self, wd): - directories = next(os.walk(os.path.join(wd, "/EnsemblData/")))[1] + directories = next(os.walk(os.path.join(wd+"/EnsemblData/")))[1] for dir in directories: if "release" in dir: localversion = sorted(directories, reverse=True)[0] @@ -73,7 +73,7 @@ def download_currentversion_version(self, version, organism, wd): # Download Base File - targetfolder = os.path.join(wd, "/EnsemblData/", version, organism) + targetfolder = os.path.join(wd+"/EnsemblData/", version, organism) os.makedirs(targetfolder) folder_url = "/pub/"+version+"/regulation/"+organism+"/" self.site_ftp.change_dir(folder_url) diff --git a/bin/Modules/Ensembl/GTFGen.py b/bin/Modules/Ensembl/GTFGen.py index 381bef3..cd70f3b 100644 --- a/bin/Modules/Ensembl/GTFGen.py +++ b/bin/Modules/Ensembl/GTFGen.py @@ -12,7 +12,7 @@ def __init__(self, organism, release, wd): def get_organism_as_gff(self, organism, release, wd): - directory = os.path.join(wd, "/EnsemblData/", release, organism) + directory = os.path.join(wd + "/EnsemblData/", release, organism) inputfile = "" for file in os.listdir(directory): if file.endswith("gff.gz"): diff --git a/bin/Modules/SaveResults.py b/bin/Modules/SaveResults.py index 09c4955..a15ffc3 100644 --- a/bin/Modules/SaveResults.py +++ b/bin/Modules/SaveResults.py @@ -9,9 +9,9 @@ def __init__(self, results, organism, tissue, wd): print("Save results to File !") self.path = "" if tissue: - self.path = os.path.join(wd, "/"+organism+"_filtered.gtf") + self.path = os.path.join(wd+"/"+organism+"_filtered.gtf") else: - self.path = os.path.join(wd, "./"+organism+".gtf") + self.path = os.path.join(wd+"/"+organism+".gtf") with open(self.path, "w") as file: write_it = csv.writer(file, delimiter='\t') diff --git a/bin/Modules/ucsc/ucsc.py b/bin/Modules/ucsc/ucsc.py index 1c29337..49e44ad 100644 --- a/bin/Modules/ucsc/ucsc.py +++ b/bin/Modules/ucsc/ucsc.py @@ -10,8 +10,8 @@ class UcscGtf: def __init__(self, org, wd): self.organism_id = self.get_organism_id(org) self.link = "http://hgdownload.soe.ucsc.edu/gbdb/"+self.organism_id+"/ncbiRefSeq/refSeqFuncElems.bb" - self.output = os.path.join(wd, "/UCSCData/"+self.organism_id+".bed") - self.path_to_bin = os.path.join(wd, "/Modules/ucsc/bigBedToBed") + self.output = os.path.join(wd + "/UCSCData/"+self.organism_id+".bed") + self.path_to_bin = os.path.join(wd + "/Modules/ucsc/bigBedToBed") print("Getting UCSC Data") print("Path to Bin: " + self.path_to_bin) self.generate_gff_file() @@ -68,7 +68,7 @@ def get_organism_id(org): @staticmethod def get_activity_categories(organism, wd): - path_to_config = os.path.join(wd, "/config/celltypes_" + organism + ".json") + path_to_config = os.path.join(wd+"/config/celltypes_" + organism + ".json") categories = {} with open(path_to_config) as input_file: data = json.loads(input_file.read()) @@ -78,4 +78,4 @@ def get_activity_categories(organism, wd): return categories def get_gtf(self): - return self.gtf_lines \ No newline at end of file + return self.gtf_lines diff --git a/bin/RegGTFExtractor.py 
b/bin/RegGTFExtractor.py index 27a6255..0b09b51 100644 --- a/bin/RegGTFExtractor.py +++ b/bin/RegGTFExtractor.py @@ -10,12 +10,12 @@ def check_for_local_folder(wd): - if not os.path.isdir(os.path.join(wd, "/EnsemblData")): + if not os.path.isdir(os.path.join(wd+"/EnsemblData")): - os.mkdir(os.path.join(wd, "/EnsemblData")) + os.mkdir(os.path.join(wd+"/EnsemblData")) - if not os.path.isdir(os.path.join(wd, "/UCSCData" )): - os.mkdir(os.path.join(wd, "/UCSCData" )) + if not os.path.isdir(os.path.join(wd+"/UCSCData" )): + os.mkdir(os.path.join(wd+"/UCSCData" )) def main_script(org, wd, tissuetype=None): From 6bc50bec5a5f6f68cb62b68a46acb58ac35a1235 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ren=C3=A9=20Wiegandt?= Date: Tue, 4 Dec 2018 09:45:09 -0500 Subject: [PATCH 035/158] added clustering + reducing --- pipeline.nf | 27 ++++++++++++++++++++++----- 1 file changed, 22 insertions(+), 5 deletions(-) diff --git a/pipeline.nf b/pipeline.nf index d8e2669..5ed2eac 100644 --- a/pipeline.nf +++ b/pipeline.nf @@ -51,15 +51,31 @@ process overlap_with_known_TFBS { conda "${path_env}" input: - set file (bed_footprints), val (bed_motifs), file (fasta) from for_overlap + set name, file (bed_footprints), val (bed_motifs), file (fasta) from for_overlap output: - file ('*.bed') into bed_for_clustering + set name, file ('*.bed') into bed_for_reducing script: motif_list = bed_motifs.toString().replaceAll(/\s|\[|\]/,"") """ - ${path_bin}/compareBed.sh --data ${bed_footprints} --motifs ${motif_list} --fasta ${fasta} -o ${name_placeholder} -min ${params.min_size_fp} -max ${params.max_size_fp} + ${path_bin}/compareBed.sh --data ${bed_footprints} --motifs ${motif_list} --fasta ${fasta} -o ${name}.bed -min ${params.min_size_fp} -max ${params.max_size_fp} + """ +} + + +process reduce_bed { + conda "${path_env}" + + input: + set name, file (bed) from bed_for_reducing + + output: + set name, file ('*.bed') into bed_for_clustering + + script: + """ + Rscript ${path_bin}/reduce_bed.R -i ${bed} -k ${params.kmer} -m ${params.aprox_motif_len} -o ${name}_reduced.bed -t ${params.threads} """ } @@ -68,13 +84,14 @@ process clustering { conda "${path_env}" input: - file (bed) from bed_for_clustering + set name, file (bed) from bed_for_clustering output: set name, file ('*.bed') into bed_for_motif_esitmation script: """ + Rscript ${path_bin}/cdhit_wrapper.R -i ${bed} -A ${params.sequence_coverage} -o ${name}_clusterd.bed """ } @@ -232,7 +249,7 @@ process create_GTF { script: """ - python ${path_bin}/RegGTFExtractor.py ${params.organism} --tissue ${params.tissues} + python ${path_bin}/RegGTFExtractor.py ${params.organism} --tissue ${params.tissues} --wd ${path_bin} """ } From 460752294fbedc350c4a4bfd6decdb378a5d9633 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ren=C3=A9=20Wiegandt?= Date: Tue, 4 Dec 2018 09:45:28 -0500 Subject: [PATCH 036/158] added parameter --- config/cluster.config | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/config/cluster.config b/config/cluster.config index e69de29..3dce063 100644 --- a/config/cluster.config +++ b/config/cluster.config @@ -0,0 +1,7 @@ +params{ + sequence_coverage=8 + + kmer=10 + aprox_motif_len=10 + threads=1 +} From 24a5a063c0f5d5305f2857fb29229f5c73c78b35 Mon Sep 17 00:00:00 2001 From: Schultheis Date: Wed, 5 Dec 2018 09:49:48 +0100 Subject: [PATCH 037/158] fixed minimum sequence length and added check --- bin/reduce_bed.R | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/bin/reduce_bed.R b/bin/reduce_bed.R index 223c946..ea2c017 100644 --- 
a/bin/reduce_bed.R +++ b/bin/reduce_bed.R @@ -8,7 +8,7 @@ option_list <- list( make_option(opt_str = c("-o", "--output"), default = "reduced.bed", help = "Output file. Default = %default", metavar = "character"), make_option(opt_str = c("-t", "--threads"), default = 1, help = "Number of threads to use. Use -1 for all available cores. Default = %default", metavar = "integer"), make_option(opt_str = c("-c", "--clean"), default = TRUE, help = "Delete all temporary files. Default = %default", metavar = "logical"), - make_option(opt_str = c("-s", "--min_seq_length"), default = NULL, help = "Remove sequences below this length. Defaults to motif length.", metavar = "integer", type = "integer") + make_option(opt_str = c("-s", "--min_seq_length"), default = NULL, help = "Remove sequences below this length. Defaults to the maximum value of motif and kmer and can not be lower.", metavar = "integer", type = "integer") # TODO more args ) @@ -25,13 +25,13 @@ opt <- parse_args(opt_parser) #' @param output Output file #' @param threads Number of threads. #' @param clean Delete all temporary files. -#' @param minoverlap_kmer Minimum required overlap between kmers. # TODO -#' @param minoverlap_motif Minimum required overlap between motif and kmer. # TODO -#' @param min_seq_length Must be smaller or equal to kmer and motif. Default = motif. +#' @param minoverlap_kmer Minimum required overlap between kmer to merge kmer. Used to create reduced sequence ranges.# TODO +#' @param minoverlap_motif Minimum required overlap between motif and kmer to consider kmer significant. Used for kmer cutoff calculation.# TODO +#' @param min_seq_length Must be greater or equal to kmer and motif. Default = max(c(motif, kmer)). #' #' @return reduced bed #' TODO check whether jellyfish is installed -reduce_bed <- function(input, kmer = 10, motif = 10, output = "reduced.bed", threads = NULL, clean = TRUE, minoverlap_kmer, minoverlap_motif, min_seq_length = motif) { +reduce_bed <- function(input, kmer = 10, motif = 10, output = "reduced.bed", threads = NULL, clean = TRUE, minoverlap_kmer, minoverlap_motif, min_seq_length = max(c(motif, kmer))) { # get number of available cores if (threads == -1) { threads <- parallel::detectCores() @@ -53,6 +53,10 @@ reduce_bed <- function(input, kmer = 10, motif = 10, output = "reduced.bed", thr } # remove sequences below minimum length + if (min_seq_length < max(c(kmer, motif))) { + stop("Minimum sequence length must be greater or equal to ", max(c(motif, kmer)), " (maximum value of kmer and motif).") + } + total_rows <- nrow(bed_table) bed_table <- bed_table[nchar(sequence) > min_seq_length] if (total_rows > nrow(bed_table)) { From 689331e5f4cf7156f118db1f2aa5ebb3041eb817 Mon Sep 17 00:00:00 2001 From: Schultheis Date: Wed, 5 Dec 2018 10:12:45 +0100 Subject: [PATCH 038/158] implemented minoverlap_kmer & minoverlap_motif --- bin/reduce_bed.R | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/bin/reduce_bed.R b/bin/reduce_bed.R index ea2c017..c0b1ae7 100644 --- a/bin/reduce_bed.R +++ b/bin/reduce_bed.R @@ -8,7 +8,9 @@ option_list <- list( make_option(opt_str = c("-o", "--output"), default = "reduced.bed", help = "Output file. Default = %default", metavar = "character"), make_option(opt_str = c("-t", "--threads"), default = 1, help = "Number of threads to use. Use -1 for all available cores. Default = %default", metavar = "integer"), make_option(opt_str = c("-c", "--clean"), default = TRUE, help = "Delete all temporary files. 
Default = %default", metavar = "logical"), - make_option(opt_str = c("-s", "--min_seq_length"), default = NULL, help = "Remove sequences below this length. Defaults to the maximum value of motif and kmer and can not be lower.", metavar = "integer", type = "integer") + make_option(opt_str = c("-s", "--min_seq_length"), default = NULL, help = "Remove sequences below this length. Defaults to the maximum value of motif and kmer and can not be lower.", metavar = "integer", type = "integer"), + make_option(opt_str = c("-ko", "--minoverlap_kmer"), default = NULL, help = "Minimum required overlap between kmer to merge kmer. Used to create reduced sequence ranges. Can not be greater than kmer length. Default = kmer - 1", metavar = "integer", type = "integer"), + make_option(opt_str = c("-mo", "--minoverlap_motif"), default = NULL, help = "Minimum required overlap between motif and kmer to consider kmer significant. Used for kmer cutoff calculation. Can not be greater than motif and kmer length. Default = ceiling(motif / 2)", metavar = "integer", type = "integer") # TODO more args ) @@ -25,13 +27,13 @@ opt <- parse_args(opt_parser) #' @param output Output file #' @param threads Number of threads. #' @param clean Delete all temporary files. -#' @param minoverlap_kmer Minimum required overlap between kmer to merge kmer. Used to create reduced sequence ranges.# TODO -#' @param minoverlap_motif Minimum required overlap between motif and kmer to consider kmer significant. Used for kmer cutoff calculation.# TODO +#' @param minoverlap_kmer Minimum required overlap between kmer to merge kmer. Used to create reduced sequence ranges. Can not be greater than kmer length. Default = kmer - 1 +#' @param minoverlap_motif Minimum required overlap between motif and kmer to consider kmer significant. Used for kmer cutoff calculation. Can not be greater than motif and kmer length. Default = ceiling(motif / 2) #' @param min_seq_length Must be greater or equal to kmer and motif. Default = max(c(motif, kmer)). 
#' #' @return reduced bed #' TODO check whether jellyfish is installed -reduce_bed <- function(input, kmer = 10, motif = 10, output = "reduced.bed", threads = NULL, clean = TRUE, minoverlap_kmer, minoverlap_motif, min_seq_length = max(c(motif, kmer))) { +reduce_bed <- function(input, kmer = 10, motif = 10, output = "reduced.bed", threads = NULL, clean = TRUE, minoverlap_kmer = kmer - 1, minoverlap_motif = ceiling(motif / 2), min_seq_length = max(c(motif, kmer))) { # get number of available cores if (threads == -1) { threads <- parallel::detectCores() @@ -91,16 +93,16 @@ reduce_bed <- function(input, kmer = 10, motif = 10, output = "reduced.bed", thr data.table::setorder(kmer_counts, -V2) # compute number of hits to keep - keep_hits <- significant_kmer(bed_table, kmer = kmer, motif = motif) + keep_hits <- significant_kmer(bed_table, kmer = kmer, motif = motif, minoverlap = minoverlap_motif) # reduce kmer - reduced_kmer <- reduce_kmer(kmer = kmer_counts, keep_hits) + reduced_kmer <- reduce_kmer(kmer = kmer_counts, significant = keep_hits) message("Find kmer in sequences.") # find k-mer in sequences # TODO minoverlap as parameter # columns: name, start, end, width - ranges_table <- find_kmer_regions(bed = bed_table, kmer_counts = reduced_kmer, minoverlap = kmer - 1, threads = threads) + ranges_table <- find_kmer_regions(bed = bed_table, kmer_counts = reduced_kmer, minoverlap = minoverlap_kmer, threads = threads) names(ranges_table)[1:2] <- c("relative_start", "relative_end") # merge ranged_table with bed_table + keep column order @@ -139,7 +141,7 @@ reduce_bed <- function(input, kmer = 10, motif = 10, output = "reduced.bed", thr #' @return Number of interesting kmer. significant_kmer <- function(bed, kmer, motif, minoverlap = ceiling(motif / 2)) { if (minoverlap > kmer || minoverlap > motif) { - stop("Kmer & motif must be greater or equal than minoverlap!") + stop("Kmer & motif must be greater or equal to minoverlap!") } # minimum sequence length to get all interesting overlaps From 617ff9f56b7fb3074e24ae4985ef8b8556081ea9 Mon Sep 17 00:00:00 2001 From: Schultheis Date: Wed, 5 Dec 2018 11:25:04 +0100 Subject: [PATCH 039/158] better docu; reorder data in reduce_kmer() --- bin/reduce_bed.R | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/bin/reduce_bed.R b/bin/reduce_bed.R index c0b1ae7..d539399 100644 --- a/bin/reduce_bed.R +++ b/bin/reduce_bed.R @@ -100,7 +100,6 @@ reduce_bed <- function(input, kmer = 10, motif = 10, output = "reduced.bed", thr message("Find kmer in sequences.") # find k-mer in sequences - # TODO minoverlap as parameter # columns: name, start, end, width ranges_table <- find_kmer_regions(bed = bed_table, kmer_counts = reduced_kmer, minoverlap = minoverlap_kmer, threads = threads) names(ranges_table)[1:2] <- c("relative_start", "relative_end") @@ -131,7 +130,7 @@ reduce_bed <- function(input, kmer = 10, motif = 10, output = "reduced.bed", thr data.table::fwrite(merged, file = output, sep = "\t", col.names = FALSE) } -#' returns sum of top x kmer frequencies to keep +#' Predict how many interesting kmer are possible for the given data. #' #' @param bed Bed table with sequences in last column #' @param kmer Length of kmer @@ -158,9 +157,15 @@ significant_kmer <- function(bed, kmer, motif, minoverlap = ceiling(motif / 2)) return(topx) } -#' @param kmer Kmer table -#' @param significant +#' Orders kmer table after count descending and keeps all kmer with a cumulative sum below the given significance threshold. 
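# A toy illustration (not part of the patch) of the cumulative-sum cutoff
# described above, with made-up counts and a significance threshold of 80:
library(data.table)
counts <- data.table(V1 = c("ACGTACGTAC", "GGGGGCCCCC", "TTTTTAAAAA"),
                     V2 = c(50, 30, 10))
data.table::setorderv(counts, cols = names(counts)[2], order = -1)
counts[, cumsum := cumsum(V2)]   # 50, 80, 90
counts[cumsum <= 80]             # keeps the two most frequent kmer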
+#' +#' @param kmer Kmer data.table columns: kmer, count +#' @param significant Value from significant_kmer function. +#' +#' @return reduced data.table reduce_kmer <- function(kmer, significant) { + data.table::setorderv(kmer, cols = names(kmer)[2], order = -1) + kmer[, cumsum := cumsum(V2)] return(kmer[cumsum <= significant]) From 3de27010e46c9f704acdc44d7e38a6a7f4a0d3ef Mon Sep 17 00:00:00 2001 From: JannikHamp Date: Wed, 5 Dec 2018 12:06:36 +0100 Subject: [PATCH 040/158] Rscript first line changes --- compareBed.sh | 63 ++++++++++++++++++++++++++++++++------------------- maxScore.R | 7 +++--- merge.R | 14 ++++++------ 3 files changed, 51 insertions(+), 33 deletions(-) diff --git a/compareBed.sh b/compareBed.sh index 279c9e3..2153b48 100644 --- a/compareBed.sh +++ b/compareBed.sh @@ -104,7 +104,8 @@ then echo "--------------------" echo " required parameter:" echo " -d --data the path to the .bed file of the footprints" - echo " -m --motifs the path to the directory where the .bed files of the scanned motifs are stored" + echo " -m --motifs Either the path to the directory where the .bed files of the scanned motifs are stored" + echo " Or a comma seperated list of files, e.g.: path1/file1,path2/file2,path3/file3" echo " -f --fasta the path to the .fasta file of genome" echo " " echo " optional parameter:" @@ -172,6 +173,9 @@ fi echo get sequences with no overlap cat $data > "$workdir"/pass1Tr.bed help=true + +if [ -d $"motifs" ] +then for i in "$motifs"/*.bed do if [ $help == true ] @@ -184,6 +188,22 @@ do fi echo $i done +else +declare -a motiffiles=(`echo $motifs | sed 's/,/ /g'`) +for i in ${motiffiles[@]} +do + if [ $help == true ] + then + help=false + bedtools intersect -v -a "$workdir"/pass1Tr.bed -b $i > "$workdir"/pass1TrHelp.bed + else + help=true + bedtools intersect -v -a "$workdir"/pass1TrHelp.bed -b $i > "$workdir"/pass1Tr.bed + fi + echo $i +done +fi + if [ $help == false ] then cat "$workdir"/pass1TrHelp.bed > "$workdir"/pass1Tr.bed @@ -199,6 +219,9 @@ Rscript --vanilla /mnt/agnerds/Jannik.Hamp/maxScore.R "$workdir"/pass1Fa.bed #3 subtract overlapping regions for additional motifs echo "also get subsequences with no overlap of overlapping sequences" help=true + +if [ -d $"motifs" ] +then for i in "$motifs"/*.bed do if [ $help == true ] @@ -211,6 +234,21 @@ do fi echo $i done +else +for i in ${motiffiles[@]} +do + if [ $help == true ] + then + help=false + bedtools subtract -a "$workdir"/pass1Fa.bed -b $i > "$workdir"/pass1FaHelp.bed + else + help=true + bedtools subtract -a "$workdir"/pass1FaHelp.bed -b $i > "$workdir"/pass1Fa.bed + fi + echo $i +done +fi + if [ $help == false ] then @@ -227,25 +265,4 @@ bedtools getfasta -fi $fasta -bed "$workdir"/merged.bed -bedOut > "$output" bedtools getfasta -name -fi $fasta -bed "$output" -fo "$output".fasta #6 clean up -rm "$workdir"/pass1Fa.bed "$workdir"/pass1Tr.bed "$workdir"/pass2Tr.bed "$workdir"/merged.bed "$workdir"/pass1FaHelp.bed "$workdir"/pass1TrHelp.bed - - -#1. first filter. no overlap vs. overlap -#bedtools intersect -v -a $data -b $motifs > "$workdir"/pass1Tr.bed -#bedtools intersect -wa -a $data -b $motifs > "$workdir"/pass1Fa.bed - -#2. compute absolut maxscore position -#Rscript --vanilla maxScore.R "$workdir"/pass1Fa.bed - -#3. subtract overlapping regions for additional motifs -#bedtools subtract -a "$workdir"/pass1Fa.bed -b $motifs > "$workdir"/pass2Tr.bed - -#4. 
remove short/long motivs, make unique ids (relevant for some splitted tfbs from subtract) and handle maxScorePosition -#Rscript --vanilla merge.R $min $max "$workdir" - -#5. add fasta sequences to bed and create fasta file -#bedtools getfasta -fi $fasta -bed "$workdir"/merged.bed -bedOut > "$workdir"/"$output" -#bedtools getfasta -name -fi $fasta -bed "$workdir"/"$output" -fo "$workdir"/"$output".fasta - -#6 clean up -#rm "$workdir"/pass1Fa.bed "$workdir"/pass1Tr.bed "$workdir"/pass2Tr.bed "$workdir"/merged.bed +rm "$workdir"/pass1Fa.bed "$workdir"/pass1Tr.bed "$workdir"/pass2Tr.bed "$workdir"/merged.bed "$workdir"/pass1FaHelp.bed "$workdir"/pass1TrHelp.bed \ No newline at end of file diff --git a/maxScore.R b/maxScore.R index 4ea545b..1dfed4b 100644 --- a/maxScore.R +++ b/maxScore.R @@ -1,9 +1,10 @@ -#!/home/jhamp/.conda/envs/tfbs/bin/Rscript +#!/bin/Rscript +library(data.table) args = commandArgs(TRUE) file = args[1] -tab = read.table(file, header=FALSE) +tab = fread(file, header=FALSE) colnames(tab) = c("chromosome", "start", "stop", "id", "score", "maxpos", "length") tab$maxpos = tab$start + tab$maxpos -write.table(tab, file, row.names=FALSE, col.names=FALSE, quote=FALSE, sep='\t') +fwrite(tab, file, row.names=FALSE, col.names=FALSE, quote=FALSE, sep='\t') diff --git a/merge.R b/merge.R index bbe3c9a..f491cb9 100644 --- a/merge.R +++ b/merge.R @@ -1,12 +1,13 @@ -#!/home/jhamp/.conda/envs/tfbs/bin/Rscript +#!/bin/Rscript +library(data.table) args=commandArgs(TRUE) min=as.numeric(args[1]) max=as.numeric(args[2]) folder=args[3] -splitted = read.table(paste(folder, "/pass2Tr.bed", sep=''), header=FALSE) -colnames(splitted) = c("chromosome", "start", "stop", "id", "score", "maxpos", "length") -p1 = read.table(paste(folder, "/pass1Tr.bed", sep=''), header=FALSE) -colnames(p1) = c("chromosome", "start", "stop", "id", "score", "maxpos", "length") +splitted = fread(paste(folder, "/pass2Tr.bed", sep=''), header=FALSE) +colnames(splitted) = c("chromosome", "start", "stop", "id", "score", "length", "maxpos", "bonus_info") +p1 = fread(paste(folder, "/pass1Tr.bed", sep=''), header=TRUE) +colnames(p1) = c("chromosome", "start", "stop", "id", "score", "length", "maxpos", "bonus_info") p1$maxpos = p1$start + p1$maxpos splitted=rbind(splitted, p1) @@ -19,5 +20,4 @@ splitted$length=splitted$stop - splitted$start splitted=cbind(splitted, containsMaxpos=0) splitted$containsMaxpos[intersect(which(splitted$start <= splitted$maxpos), which(splitted$stop > splitted$maxpos))] = 1 splitted$maxpos = splitted$maxpos - splitted$start -write.table(splitted, paste(folder, "/merged.bed", sep=''), row.names=FALSE, col.names=FALSE, quote=FALSE, sep='\t') - +fwrite(splitted, paste(folder, "/merged.bed", sep=''), row.names=FALSE, col.names=FALSE, quote=FALSE, sep='\t') From 527c4fa2e03686d487095f77387baa332255591d Mon Sep 17 00:00:00 2001 From: JannikHamp Date: Wed, 5 Dec 2018 12:17:38 +0100 Subject: [PATCH 041/158] Delete README.md --- README.md | 15 --------------- 1 file changed, 15 deletions(-) delete mode 100644 README.md diff --git a/README.md b/README.md deleted file mode 100644 index 4dc88e2..0000000 --- a/README.md +++ /dev/null @@ -1,15 +0,0 @@ -# compareBed.sh for filtering of new, yet unannotated motifs - -The script mainly utilizes bedtools for comparison. -The sequences in the bed-file of the experiment are compared with sequences of known motifs, also in bed-format. -Only sequences with no overlap with the known-motif sequences are selected in the first step. 
-Then the overlapping parts of overlapping sequences are also selected as new sequences. -The resulting bed-file contains the sequences selected in both steps. The output file has two extra columns appended: 1st, the DNA sequence of the entry, and 2nd, a flag with possible values of '1' or '0' for 'no_overlap' or 'overlap' of the maximum score region of the entity with any known-motif sequence. - -Apart from the experiment data (bed-format) of sequences with possible motifs, another bed-file of the sequences with known motifs and a genome file in fasta format are required. The bed-file with known motif sequences can be calculated with the tool motifscan.py. - -For usage, run ./compareBed.sh - -dependencies: -- R ?version -- bedtools ?version From f742731fe7d00d840c90c83dbeb06a8fee8c9795 Mon Sep 17 00:00:00 2001 From: basti Date: Wed, 5 Dec 2018 13:11:11 +0100 Subject: [PATCH 042/158] Changed ResultPath --- bin/Modules/SaveResults.py | 4 ++-- bin/RegGTFExtractor.py | 3 ++- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/bin/Modules/SaveResults.py b/bin/Modules/SaveResults.py index a15ffc3..60d01e0 100644 --- a/bin/Modules/SaveResults.py +++ b/bin/Modules/SaveResults.py @@ -9,9 +9,9 @@ def __init__(self, results, organism, tissue, wd): print("Save results to File !") self.path = "" if tissue: - self.path = os.path.join(wd+"/"+organism+"_filtered.gtf") + self.path = os.path.join("./"+organism+"_filtered.gtf") else: - self.path = os.path.join(wd+"/"+organism+".gtf") + self.path = os.path.join("./"+organism+".gtf") with open(self.path, "w") as file: write_it = csv.writer(file, delimiter='\t') diff --git a/bin/RegGTFExtractor.py b/bin/RegGTFExtractor.py index 0b09b51..bf65106 100644 --- a/bin/RegGTFExtractor.py +++ b/bin/RegGTFExtractor.py @@ -34,9 +34,10 @@ def main_script(org, wd, tissuetype=None): parser.add_argument('--tissue', help='Tissue- or Celltype(s)', action='store', nargs='*', type=str) parser.add_argument('--wd', help='Working directory. 
default: "."', action='store', default='.', type=str) args = vars(parser.parse_args()) + print("Working Dir: " + args["wd"]) if args["organism"]: print("Working Dir: " + args["wd"]) main_script(args["organism"], args["wd"], args["tissue"]) else: - print("No Arguments found -> See ./RegGTFExtractor.py -h for help.") + print("No Arguments found -> See python3 ./RegGTFExtractor.py -h for help.") From f6942a5ac6430fc1620edd9e8246272536285923 Mon Sep 17 00:00:00 2001 From: basti Date: Wed, 5 Dec 2018 13:53:16 +0100 Subject: [PATCH 043/158] Changed ResultPath --- bin/RegGTFExtractor.py | 29 +++++++++++++++++++++++++++-- 1 file changed, 27 insertions(+), 2 deletions(-) diff --git a/bin/RegGTFExtractor.py b/bin/RegGTFExtractor.py index bf65106..49303ee 100644 --- a/bin/RegGTFExtractor.py +++ b/bin/RegGTFExtractor.py @@ -6,6 +6,7 @@ from Modules.Uniquifier import UniqueFilter from Modules.SaveResults import ResultSaver import os +import json def check_for_local_folder(wd): @@ -18,14 +19,38 @@ def check_for_local_folder(wd): os.mkdir(os.path.join(wd+"/UCSCData" )) +def check_filter(tissue_cmd, org, wd): + path_to_config = os.path.join(wd + "/config/celltypes_" + org + ".json" ) + tissues_config = [] + if not tissue_cmd: + return False + with open(path_to_config) as input_file: + data = json.loads(input_file.read()) + for x in data: + tissues_config.append(x["type"]) + + if any(tissue in tissues_config for tissue in tissue_cmd): + return True + + else: + return False + + def main_script(org, wd, tissuetype=None): check_for_local_folder(wd) + if check_filter(tissuetype, org, wd): + tissues = tissuetype + print("Filter detected !") + else: + tissues = None + print("Filter not detected !") + ucsc = UcscGtf(org, wd) ense = Ensembl(org, wd) print("Getting Unique Results") - unique_filter = UniqueFilter(ense.get_gtf(), ucsc.get_gtf(), tissuetype) - ResultSaver(unique_filter.get_results(), org, tissuetype, wd) + unique_filter = UniqueFilter(ense.get_gtf(), ucsc.get_gtf(), tissues) + ResultSaver(unique_filter.get_results(), org, tissues, wd) if __name__ == '__main__': From c4b7a0d9d0626a298c6a15d80efdde82934f5bac Mon Sep 17 00:00:00 2001 From: basti Date: Wed, 5 Dec 2018 13:58:30 +0100 Subject: [PATCH 044/158] Changed ResultPath --- bin/Modules/SaveResults.py | 2 +- bin/RegGTFExtractor.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/bin/Modules/SaveResults.py b/bin/Modules/SaveResults.py index 60d01e0..e13aa6a 100644 --- a/bin/Modules/SaveResults.py +++ b/bin/Modules/SaveResults.py @@ -4,7 +4,7 @@ class ResultSaver: - def __init__(self, results, organism, tissue, wd): + def __init__(self, results, organism, tissue): print("Save results to File !") self.path = "" diff --git a/bin/RegGTFExtractor.py b/bin/RegGTFExtractor.py index 49303ee..e4657be 100644 --- a/bin/RegGTFExtractor.py +++ b/bin/RegGTFExtractor.py @@ -50,7 +50,7 @@ def main_script(org, wd, tissuetype=None): ense = Ensembl(org, wd) print("Getting Unique Results") unique_filter = UniqueFilter(ense.get_gtf(), ucsc.get_gtf(), tissues) - ResultSaver(unique_filter.get_results(), org, tissues, wd) + ResultSaver(unique_filter.get_results(), org, tissues) if __name__ == '__main__': From a584817a1409eeda3aefb4acb93feb887c09210a Mon Sep 17 00:00:00 2001 From: Schultheis Date: Wed, 5 Dec 2018 15:10:57 +0100 Subject: [PATCH 045/158] added progress messages; thread param; documentation --- bin/cdhit_wrapper.R | 49 +++++++++++++++++++++++++++++++++++---------- 1 file changed, 38 insertions(+), 11 deletions(-) diff --git 
a/bin/cdhit_wrapper.R b/bin/cdhit_wrapper.R
index ed5ea7f..f26a71a 100644
--- a/bin/cdhit_wrapper.R
+++ b/bin/cdhit_wrapper.R
@@ -2,11 +2,34 @@ library("optparse")
 
 option_list <- list(
-  make_option(opt_str = c("-i", "--input"), default = NULL, help = "Input bed-file. Last column must be sequences.", metavar = "character"),
-  make_option(opt_str = c("-s", "--similarity"), default = 0.8, help = "Similarity threshold. Default = %default", metavar = "double >= 0.8"),
+  make_option(opt_str = c("-i", "--input"), default = NULL, help = "Input bed-file. Fourth column is expected to contain names, last column must be sequences.", metavar = "character"),
+  make_option(opt_str = c("-c", "--identity"), default = 0.8, help = "Identity threshold. Default = %default", metavar = "double >= 0.8"),
   make_option(opt_str = c("-A", "--coverage"), default = 8, help = "Minimal alignment length for both sequences in nucleotides. Default = %default", metavar = "integer"),
-  make_option(opt_str = c("-o", "--output"), default = "cluster.bed", help = "Output file. Default = %default", metavar = "character"),
-  make_option(opt_str = c("-c", "--clean"), default = TRUE, help = "Delete all temporary files. Default = %default", metavar = "logical")
+  make_option(opt_str = c("-o", "--output"), default = "cluster.bed", help = "Output file same as input but with appended column of cluster numbers. Default = %default", metavar = "character"),
+  make_option(opt_str = c("--clean"), default = TRUE, help = "Delete all temporary files. Default = %default", metavar = "logical"),
+  # make_option(opt_str = c("-G", "--global"), default = FALSE),
+  # make_option(opt_str = c("-b", "--band_width"), default = 20),
+  # make_option(opt_str = c("-M", "--memory"), default = 800),
+  make_option(opt_str = c("-T", "--threads"), default = 1)#,
+  # make_option(opt_str = c("-n", "--word_length"), default = 3),
+  # make_option(opt_str = c("-l", "--throw_away_sequences"), default = 5),
+  # make_option(opt_str = c("-d", "--description")), # can not produce bed if this is != 0
+  # make_option(opt_str = c("-s", "--length_dif_cutoff_shorter_p"), default = 0.0),
+  # make_option(opt_str = c("-S", "--length_dif_cutoff_shorter_n"), default = 999999),
+  # make_option(opt_str = c("-aL", "--alignment_coverage_longer_p"), default = 0.0),
+  # make_option(opt_str = c("-AL", "--alignment_coverage_longer_n"), default = 99999999),
+  # make_option(opt_str = c("-aS", "--alignment_coverage_shorter_p"), default = 0.0),
+  # make_option(opt_str = c("-AS", "--alignment_coverage_shorter_n"), default = 99999999),
+  # make_option(opt_str = c("-uL", "--max_unmatched_longer_p"), default = 1.0),
+  # make_option(opt_str = c("-uS", "--max_unmatched_shorter_p"), default = 99999999),
+  # make_option(opt_str = c("-U", "--max_unmatched_both_n"), default = 99999999),
+  # make_option(opt_str = c("-g", "--fast_cluster"), default = FALSE),
+  # make_option(opt_str = c("-r", "--ignore_strand"), default = TRUE),
+  # make_option(opt_str = c("--match"), default = 2),
+  # make_option(opt_str = c("--mismatch"), default = -2),
+  # make_option(opt_str = c("--gap"), default = -6),
+  # make_option(opt_str = c("--gap_ext"), default = -1),
+  # make_option(opt_str = c("-sc", "--sort_cluster_by_size"), default = TRUE)
   # TODO more args
 )
 
@@ -17,16 +40,17 @@ opt <- parse_args(opt_parser)
 
 #' cd-hit wrapper
 #'
-#' @param input
-#' @param similarity Similarity threshold.
+#' @param input Data.table or file in bed format (requires names in fourth column and sequences in last column).
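# A possible invocation of the reworked wrapper, mirroring the option_list
# above; file names are placeholders:
#   Rscript cdhit_wrapper.R -i footprints_reduced.bed -c 0.8 -A 8 -T 4 -o cluster.bed
# or equivalently from R, after sourcing this script:
#   cdhitest(input = "footprints_reduced.bed", identity = 0.8, coverage = 8,
#            output = "cluster.bed", threads = 4)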
+#' @param identity Identity threshold.
 #' @param coverage Minimal alignment length for both sequences in nucleotides.
 #' @param output Clustered bedfile. Adds cluster number in last column (lower number = bigger cluster).
 #' @param clean Clean up after run.
+#' @param threads Number of threads to use in cd-hit (0 = all cpus).
 #'
-#' @return bed_table with cluster in last column
-#' TODO add all cdhit parameter
+#' TODO add all cdhit parameter (necessary ?)
 #' TODO check whether cdhit is installed
-cdhitest <- function(input, similarity = 0.8, coverage = 8, output = "cluster.bed", clean = TRUE) {
+cdhitest <- function(input, identity = 0.8, coverage = 8, output = "cluster.bed", clean = TRUE, threads = 1) {
+  message("Loading bed.")
   # load bed if necessary
   if (!data.table::is.data.table(input)) {
     bed_table <- data.table::fread(input = input, header = FALSE)
@@ -34,18 +58,21 @@ cdhitest <- function(input, similarity = 0.8, coverage = 8, output = "cluster.be
   }
 
+  message("Convert to fasta.")
   ### convert bed to fasta
   # 4th column = name
   # last column = sequence
   fasta_file <- "converted_bed.fasta"
   seqinr::write.fasta(sequences = as.list(bed_table[[ncol(bed_table)]]), names = bed_table[[4]], as.string = TRUE, file.out = fasta_file)
 
+  message("Clustering.")
   ### cd-hit-est
   cdhit_output <- "cdhit_output"
-  cdhit_call <- paste("cd-hit-est -i", fasta_file, "-o", cdhit_output, "-c", similarity, "-A", coverage, "-G 0 -n 3 -g 1 -r 0 -l 5 -sc 1 -d 0")
+  cdhit_call <- paste("cd-hit-est -i", fasta_file, "-o", cdhit_output, "-c", identity, "-A", coverage, "-T ", threads, "-G 0 -n 3 -g 1 -r 0 -l 5 -sc 1 -d 0")
   system(command = cdhit_call, wait = TRUE)
 
+  message("Formatting/writing results.")
   # reformat cluster file
   # columns: id, clstr, clstr_size, length, clstr_rep, clstr_iden, clstr_cov
   cluster_file <- "reformated.clstr"
@@ -71,5 +98,5 @@ cdhitest <- function(input, similarity = 0.8, coverage = 8, output = "cluster.be
 if (!interactive()) {
   # remove last parameter (help param)
   params <- opt[-length(opt)]
-  do.call(cdhitest, args = params)
+  do.call(cdhitest, args = params)
 }

From f742731fe7d00d840c90c83dbeb06a8fee8c9795 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ren=C3=A9=20Wiegandt?=
Date: Wed, 5 Dec 2018 09:21:50 -0500
Subject: [PATCH 046/158] moved scripts to bin

---
 bin/compareBed.sh | 117 +++++++++++++++++---
 bin/maxScore.R    |   7 +-
 bin/merge.R       |  14 +--
 compareBed.sh     | 268 ----------------------------------------------
 maxScore.R        |  10 --
 merge.R           |  23 ----
 6 files changed, 112 insertions(+), 327 deletions(-)
 delete mode 100644 compareBed.sh
 delete mode 100644 maxScore.R
 delete mode 100644 merge.R

diff --git a/bin/compareBed.sh b/bin/compareBed.sh
index bcf52fb..2153b48 100644
--- a/bin/compareBed.sh
+++ b/bin/compareBed.sh
@@ -100,19 +100,20 @@ then
   echo "Required arguments are data and motifs, both in bed-format, and the fasta genome sequences."
   echo "If a parameter is chosen, a value must be provided or an error will occur."
   echo "--------------------"
-  echo "usage: compareBed.sh --data \ --motifs \ --fasta \ \[optional_parameter \\] ..."
+  echo "usage: compareBed.sh --data --motifs --fasta [optional_parameter ] ..."
echo "--------------------" echo " required parameter:" echo " -d --data the path to the .bed file of the footprints" - echo " -m --motifs the path to the .bed file of the scanned motifs" + echo " -m --motifs Either the path to the directory where the .bed files of the scanned motifs are stored" + echo " Or a comma seperated list of files, e.g.: path1/file1,path2/file2,path3/file3" echo " -f --fasta the path to the .fasta file of genome" echo " " echo " optional parameter:" echo " -w --workdir the path to directory where temporary files will be created" echo " default is a new directory that is created in the current directory" - echo " -min --min minimum size of footprints\; default is 10" - echo " -max --max maximum size of footprints\; default is 60" - echo " -o --output output path/file \; default dir is workdir and filename is newFootprints.bed and newFootprints.bed.fasta" + echo " -min --min minimum size of footprints; default is 10" + echo " -max --max maximum size of footprints; default is 80" + echo " -o --output output path/file ; default dir is workdir and filename is newFootprints.bed and newFootprints.bed.fasta" echo " -h --help shows this help message" exit 0 fi @@ -147,7 +148,7 @@ fi if [ $ou == false ] then - output="newFootprints.bed" + output=${workdir}/"newFootprints.bed" ou=true fi @@ -159,25 +160,109 @@ fi if [ $ma == false ] then - max=60 + max=80 ma=true fi + +if [ ! -d $workdir ] +then + mkdir $workdir +fi + #1. first filter. no overlap vs. overlap -bedtools intersect -v -a $data -b $motifs > "$workdir"/pass1Tr.bed -bedtools intersect -wa -a $data -b $motifs > "$workdir"/pass1Fa.bed +echo get sequences with no overlap +cat $data > "$workdir"/pass1Tr.bed +help=true +if [ -d $"motifs" ] +then +for i in "$motifs"/*.bed +do + if [ $help == true ] + then + help=false + bedtools intersect -v -a "$workdir"/pass1Tr.bed -b $i > "$workdir"/pass1TrHelp.bed + else + help=true + bedtools intersect -v -a "$workdir"/pass1TrHelp.bed -b $i > "$workdir"/pass1Tr.bed + fi + echo $i +done +else +declare -a motiffiles=(`echo $motifs | sed 's/,/ /g'`) +for i in ${motiffiles[@]} +do + if [ $help == true ] + then + help=false + bedtools intersect -v -a "$workdir"/pass1Tr.bed -b $i > "$workdir"/pass1TrHelp.bed + else + help=true + bedtools intersect -v -a "$workdir"/pass1TrHelp.bed -b $i > "$workdir"/pass1Tr.bed + fi + echo $i +done +fi + +if [ $help == false ] +then + cat "$workdir"/pass1TrHelp.bed > "$workdir"/pass1Tr.bed +fi +echo get overlapping sequences +bedtools intersect -v -a $data -b "$workdir"/pass1Tr.bed > "$workdir"/pass1Fa.bed +cat "$workdir"/pass1Fa.bed | wc -l + +echo calc maxScore #2. compute absolut maxscore position -Rscript --vanilla maxScore.R "$workdir"/pass1Fa.bed +Rscript --vanilla /mnt/agnerds/Jannik.Hamp/maxScore.R "$workdir"/pass1Fa.bed + +#3 subtract overlapping regions for additional motifs +echo "also get subsequences with no overlap of overlapping sequences" +help=true + +if [ -d $"motifs" ] +then +for i in "$motifs"/*.bed +do + if [ $help == true ] + then + help=false + bedtools subtract -a "$workdir"/pass1Fa.bed -b $i > "$workdir"/pass1FaHelp.bed + else + help=true + bedtools subtract -a "$workdir"/pass1FaHelp.bed -b $i > "$workdir"/pass1Fa.bed + fi + echo $i +done +else +for i in ${motiffiles[@]} +do + if [ $help == true ] + then + help=false + bedtools subtract -a "$workdir"/pass1Fa.bed -b $i > "$workdir"/pass1FaHelp.bed + else + help=true + bedtools subtract -a "$workdir"/pass1FaHelp.bed -b $i > "$workdir"/pass1Fa.bed + fi + echo $i +done +fi + -#3. 
subtract overlapping regions for additional motifs -bedtools subtract -a "$workdir"/pass1Fa.bed -b $motifs > "$workdir"/pass2Tr.bed +if [ $help == false ] +then + cat "$workdir"/pass1FaHelp.bed > "$workdir"/pass2Tr.bed +else + cat "$workdir"/pass1Fa.bed > "$workdir"/pass2Tr.bed +fi #4. remove short/long motivs, make unique ids (relevant for some splitted tfbs from subtract) and handle maxScorePosition -Rscript --vanilla merge.R $min $max "$workdir" +Rscript --vanilla /mnt/agnerds/Jannik.Hamp/merge.R $min $max "$workdir" #5. add fasta sequences to bed and create fasta file -bedtools getfasta -fi $fasta -bed "$workdir"/merged.bed -bedOut > "$workdir"/"$output" -bedtools getfasta -name -fi $fasta -bed "$workdir"/"$output" -fo "$workdir"/"$output".fasta +bedtools getfasta -fi $fasta -bed "$workdir"/merged.bed -bedOut > "$output" +bedtools getfasta -name -fi $fasta -bed "$output" -fo "$output".fasta #6 clean up -rm "$workdir"/pass1Fa.bed "$workdir"/pass1Tr.bed "$workdir"/pass2Tr.bed "$workdir"/merged.bed \ No newline at end of file +rm "$workdir"/pass1Fa.bed "$workdir"/pass1Tr.bed "$workdir"/pass2Tr.bed "$workdir"/merged.bed "$workdir"/pass1FaHelp.bed "$workdir"/pass1TrHelp.bed \ No newline at end of file diff --git a/bin/maxScore.R b/bin/maxScore.R index 4ea545b..1dfed4b 100644 --- a/bin/maxScore.R +++ b/bin/maxScore.R @@ -1,9 +1,10 @@ -#!/home/jhamp/.conda/envs/tfbs/bin/Rscript +#!/bin/Rscript +library(data.table) args = commandArgs(TRUE) file = args[1] -tab = read.table(file, header=FALSE) +tab = fread(file, header=FALSE) colnames(tab) = c("chromosome", "start", "stop", "id", "score", "maxpos", "length") tab$maxpos = tab$start + tab$maxpos -write.table(tab, file, row.names=FALSE, col.names=FALSE, quote=FALSE, sep='\t') +fwrite(tab, file, row.names=FALSE, col.names=FALSE, quote=FALSE, sep='\t') diff --git a/bin/merge.R b/bin/merge.R index bbe3c9a..f491cb9 100644 --- a/bin/merge.R +++ b/bin/merge.R @@ -1,12 +1,13 @@ -#!/home/jhamp/.conda/envs/tfbs/bin/Rscript +#!/bin/Rscript +library(data.table) args=commandArgs(TRUE) min=as.numeric(args[1]) max=as.numeric(args[2]) folder=args[3] -splitted = read.table(paste(folder, "/pass2Tr.bed", sep=''), header=FALSE) -colnames(splitted) = c("chromosome", "start", "stop", "id", "score", "maxpos", "length") -p1 = read.table(paste(folder, "/pass1Tr.bed", sep=''), header=FALSE) -colnames(p1) = c("chromosome", "start", "stop", "id", "score", "maxpos", "length") +splitted = fread(paste(folder, "/pass2Tr.bed", sep=''), header=FALSE) +colnames(splitted) = c("chromosome", "start", "stop", "id", "score", "length", "maxpos", "bonus_info") +p1 = fread(paste(folder, "/pass1Tr.bed", sep=''), header=TRUE) +colnames(p1) = c("chromosome", "start", "stop", "id", "score", "length", "maxpos", "bonus_info") p1$maxpos = p1$start + p1$maxpos splitted=rbind(splitted, p1) @@ -19,5 +20,4 @@ splitted$length=splitted$stop - splitted$start splitted=cbind(splitted, containsMaxpos=0) splitted$containsMaxpos[intersect(which(splitted$start <= splitted$maxpos), which(splitted$stop > splitted$maxpos))] = 1 splitted$maxpos = splitted$maxpos - splitted$start -write.table(splitted, paste(folder, "/merged.bed", sep=''), row.names=FALSE, col.names=FALSE, quote=FALSE, sep='\t') - +fwrite(splitted, paste(folder, "/merged.bed", sep=''), row.names=FALSE, col.names=FALSE, quote=FALSE, sep='\t') diff --git a/compareBed.sh b/compareBed.sh deleted file mode 100644 index 2153b48..0000000 --- a/compareBed.sh +++ /dev/null @@ -1,268 +0,0 @@ -#!/bin/bash -#data -#motifs -#workdir -#fasta -#min -#max 
-#output -wrong=() -da=false -mo=false -wo=false -fa=false -mi=false -ma=false -ou=false -he=false - -if [ $# -eq 0 ] -then - he=true -fi - -while [[ $# -gt 0 ]] -do -key="$1" - -case $key in - -d|--data) - data="$2" - da=true - shift - shift - ;; - -m|--motifs) - motifs="$2" - mo=true - shift - shift - ;; - -w|--workdir) - workdir="$2" - wo=true - shift - shift - ;; - -f|--fasta) - fasta="$2" - fa=true - shift - shift - ;; - -min|--min) - min=$2 - mi=true - shift - shift - ;; - -max|--max) - max=$2 - ma=true - shift - shift - ;; - -o|--output) - output="$2" - ou=true - shift - shift - ;; - -h|--help) - help=true - he=true - shift - ;; - *) - wrong+=("$1") - shift - ;; -esac -done - -count=${#wrong[@]} -if [ $count -gt 0 ] -then - for i in ${wrong[@]} - do - echo wrong parameter $i - echo call script without parameters for help or call --help - echo exit - done -exit 1 -fi - -if [ $he == true ] -then - echo "This script utilies bedtools to select new footprints from data." - echo "Therefore the data is compared with known footprint motifs." - echo "The output is a new .bed file with added sequence information." - echo "Required arguments are data and motifs, both in bed-format, and the fasta genome sequences." - echo "If a parameter is chosen, a value must be provided or an error will occur." - echo "--------------------" - echo "usage: compareBed.sh --data --motifs --fasta [optional_parameter ] ..." - echo "--------------------" - echo " required parameter:" - echo " -d --data the path to the .bed file of the footprints" - echo " -m --motifs Either the path to the directory where the .bed files of the scanned motifs are stored" - echo " Or a comma seperated list of files, e.g.: path1/file1,path2/file2,path3/file3" - echo " -f --fasta the path to the .fasta file of genome" - echo " " - echo " optional parameter:" - echo " -w --workdir the path to directory where temporary files will be created" - echo " default is a new directory that is created in the current directory" - echo " -min --min minimum size of footprints; default is 10" - echo " -max --max maximum size of footprints; default is 80" - echo " -o --output output path/file ; default dir is workdir and filename is newFootprints.bed and newFootprints.bed.fasta" - echo " -h --help shows this help message" -exit 0 -fi - -echo selected parameters -echo ------------------- -echo data: $da \(required\) -echo motifs: $mo \(required\) -echo workdir: $wo -echo fasta: $fa \(required\) -echo min: $mi -echo max: $ma -echo output: $ou -echo help: $he - -if [ $da == false ] || [ $mo == false ] || [ $fa == false ] -then - echo required parameters not given. - echo required are: --data \ --motifs \ --fasta \ - exit 1 -fi - -if [ $wo == false ] -then - if [ ! -d workdir1337 ] - then - mkdir workdir1337 - fi - wo=true - workdir="workdir1337" -fi - -if [ $ou == false ] -then - output=${workdir}/"newFootprints.bed" - ou=true -fi - -if [ $mi == false ] -then - min=10 - mi=true -fi - -if [ $ma == false ] -then - max=80 - ma=true -fi - -if [ ! -d $workdir ] -then - mkdir $workdir -fi - -#1. first filter. no overlap vs. 
overlap -echo get sequences with no overlap -cat $data > "$workdir"/pass1Tr.bed -help=true - -if [ -d $"motifs" ] -then -for i in "$motifs"/*.bed -do - if [ $help == true ] - then - help=false - bedtools intersect -v -a "$workdir"/pass1Tr.bed -b $i > "$workdir"/pass1TrHelp.bed - else - help=true - bedtools intersect -v -a "$workdir"/pass1TrHelp.bed -b $i > "$workdir"/pass1Tr.bed - fi - echo $i -done -else -declare -a motiffiles=(`echo $motifs | sed 's/,/ /g'`) -for i in ${motiffiles[@]} -do - if [ $help == true ] - then - help=false - bedtools intersect -v -a "$workdir"/pass1Tr.bed -b $i > "$workdir"/pass1TrHelp.bed - else - help=true - bedtools intersect -v -a "$workdir"/pass1TrHelp.bed -b $i > "$workdir"/pass1Tr.bed - fi - echo $i -done -fi - -if [ $help == false ] -then - cat "$workdir"/pass1TrHelp.bed > "$workdir"/pass1Tr.bed -fi -echo get overlapping sequences -bedtools intersect -v -a $data -b "$workdir"/pass1Tr.bed > "$workdir"/pass1Fa.bed -cat "$workdir"/pass1Fa.bed | wc -l - -echo calc maxScore -#2. compute absolut maxscore position -Rscript --vanilla /mnt/agnerds/Jannik.Hamp/maxScore.R "$workdir"/pass1Fa.bed - -#3 subtract overlapping regions for additional motifs -echo "also get subsequences with no overlap of overlapping sequences" -help=true - -if [ -d $"motifs" ] -then -for i in "$motifs"/*.bed -do - if [ $help == true ] - then - help=false - bedtools subtract -a "$workdir"/pass1Fa.bed -b $i > "$workdir"/pass1FaHelp.bed - else - help=true - bedtools subtract -a "$workdir"/pass1FaHelp.bed -b $i > "$workdir"/pass1Fa.bed - fi - echo $i -done -else -for i in ${motiffiles[@]} -do - if [ $help == true ] - then - help=false - bedtools subtract -a "$workdir"/pass1Fa.bed -b $i > "$workdir"/pass1FaHelp.bed - else - help=true - bedtools subtract -a "$workdir"/pass1FaHelp.bed -b $i > "$workdir"/pass1Fa.bed - fi - echo $i -done -fi - - -if [ $help == false ] -then - cat "$workdir"/pass1FaHelp.bed > "$workdir"/pass2Tr.bed -else - cat "$workdir"/pass1Fa.bed > "$workdir"/pass2Tr.bed -fi - -#4. remove short/long motivs, make unique ids (relevant for some splitted tfbs from subtract) and handle maxScorePosition -Rscript --vanilla /mnt/agnerds/Jannik.Hamp/merge.R $min $max "$workdir" - -#5. 
add fasta sequences to bed and create fasta file -bedtools getfasta -fi $fasta -bed "$workdir"/merged.bed -bedOut > "$output" -bedtools getfasta -name -fi $fasta -bed "$output" -fo "$output".fasta - -#6 clean up -rm "$workdir"/pass1Fa.bed "$workdir"/pass1Tr.bed "$workdir"/pass2Tr.bed "$workdir"/merged.bed "$workdir"/pass1FaHelp.bed "$workdir"/pass1TrHelp.bed \ No newline at end of file diff --git a/maxScore.R b/maxScore.R deleted file mode 100644 index 1dfed4b..0000000 --- a/maxScore.R +++ /dev/null @@ -1,10 +0,0 @@ -#!/bin/Rscript -library(data.table) -args = commandArgs(TRUE) -file = args[1] - -tab = fread(file, header=FALSE) -colnames(tab) = c("chromosome", "start", "stop", "id", "score", "maxpos", "length") -tab$maxpos = tab$start + tab$maxpos - -fwrite(tab, file, row.names=FALSE, col.names=FALSE, quote=FALSE, sep='\t') diff --git a/merge.R b/merge.R deleted file mode 100644 index f491cb9..0000000 --- a/merge.R +++ /dev/null @@ -1,23 +0,0 @@ -#!/bin/Rscript -library(data.table) -args=commandArgs(TRUE) -min=as.numeric(args[1]) -max=as.numeric(args[2]) -folder=args[3] -splitted = fread(paste(folder, "/pass2Tr.bed", sep=''), header=FALSE) -colnames(splitted) = c("chromosome", "start", "stop", "id", "score", "length", "maxpos", "bonus_info") -p1 = fread(paste(folder, "/pass1Tr.bed", sep=''), header=TRUE) -colnames(p1) = c("chromosome", "start", "stop", "id", "score", "length", "maxpos", "bonus_info") -p1$maxpos = p1$start + p1$maxpos - -splitted=rbind(splitted, p1) - -splitted=splitted[which(splitted$stop - splitted$start >= min),] -splitted=splitted[which(splitted$stop - splitted$start <= max),] -splitted$id=make.unique(as.character(splitted$id)) -splitted$length=splitted$stop - splitted$start - -splitted=cbind(splitted, containsMaxpos=0) -splitted$containsMaxpos[intersect(which(splitted$start <= splitted$maxpos), which(splitted$stop > splitted$maxpos))] = 1 -splitted$maxpos = splitted$maxpos - splitted$start -fwrite(splitted, paste(folder, "/merged.bed", sep=''), row.names=FALSE, col.names=FALSE, quote=FALSE, sep='\t') From 6ff051d13f9ffe82759b96786e16c321001206f3 Mon Sep 17 00:00:00 2001 From: renewiegandt Date: Wed, 5 Dec 2018 15:22:24 +0100 Subject: [PATCH 047/158] Delete .gitignore --- .gitignore | 168 ----------------------------------------------------- 1 file changed, 168 deletions(-) delete mode 100644 .gitignore diff --git a/.gitignore b/.gitignore deleted file mode 100644 index bac574a..0000000 --- a/.gitignore +++ /dev/null @@ -1,168 +0,0 @@ -# Created by .ignore support plugin (hsz.mobi) -### Python template -# Byte-compiled / optimized / DLL files -__pycache__/ -*.py[cod] -*$py.class - -# C extensions -*.so - -# Distribution / packaging -.Python -build/ -develop-eggs/ -dist/ -downloads/ -eggs/ -.eggs/ -lib/ -lib64/ -parts/ -sdist/ -var/ -wheels/ -*.egg-info/ -.installed.cfg -*.egg -MANIFEST - -# PyInstaller -# Usually these files are written by a python script from a template -# before PyInstaller builds the exe, so as to inject date/other infos into it. 
-*.manifest -*.spec - -# Installer logs -pip-log.txt -pip-delete-this-directory.txt - -# Unit test / coverage reports -htmlcov/ -.tox/ -.coverage -.coverage.* -.cache -nosetests.xml -coverage.xml -*.cover -.hypothesis/ -.pytest_cache/ - -# Translations -*.mo -*.pot - -# Django stuff: -*.log -local_settings.py -db.sqlite3 - -# Flask stuff: -instance/ -.webassets-cache - -# Scrapy stuff: -.scrapy - -# Sphinx documentation -docs/_build/ - -# PyBuilder -target/ - -# Jupyter Notebook -.ipynb_checkpoints - -# pyenv -.python-version - -# celery beat schedule file -celerybeat-schedule - -# SageMath parsed files -*.sage.py - -# Environments -.env -.venv -env/ -venv/ -ENV/ -env.bak/ -venv.bak/ - -# Spyder project settings -.spyderproject -.spyproject - -# Rope project settings -.ropeproject - -# mkdocs documentation -/site - -# mypy -.mypy_cache/ -### JetBrains template -# Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and WebStorm -# Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839 - -# User-specific stuff -.idea/**/workspace.xml -.idea/**/tasks.xml -.idea/**/usage.statistics.xml -.idea/**/dictionaries -.idea/**/shelf - -# Sensitive or high-churn files -.idea/**/dataSources/ -.idea/**/dataSources.ids -.idea/**/dataSources.local.xml -.idea/**/sqlDataSources.xml -.idea/**/dynamic.xml -.idea/**/uiDesigner.xml -.idea/**/dbnavigator.xml - -# Gradle -.idea/**/gradle.xml -.idea/**/libraries - -# Gradle and Maven with auto-import -# When using Gradle or Maven with auto-import, you should exclude module files, -# since they will be recreated, and may cause churn. Uncomment if using -# auto-import. -# .idea/modules.xml -# .idea/*.iml -# .idea/modules - -# CMake -cmake-build-*/ - -# Mongo Explorer plugin -.idea/**/mongoSettings.xml - -# File-based project format -*.iws - -# IntelliJ -out/ - -# mpeltonen/sbt-idea plugin -.idea_modules/ - -# JIRA plugin -atlassian-ide-plugin.xml - -# Cursive Clojure plugin -.idea/replstate.xml - -# Crashlytics plugin (for Android Studio and IntelliJ) -com_crashlytics_export_strings.xml -crashlytics.properties -crashlytics-build.properties -fabric.properties - -# Editor-based Rest Client -.idea/ - From 02e946ddf46bd6ed92027115117c079c5ea75b20 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ren=C3=A9=20Wiegandt?= Date: Wed, 5 Dec 2018 09:38:23 -0500 Subject: [PATCH 048/158] added parameter to yaml --- masterenv.yml | 7 +++++++ pipeline.nf | 8 ++++++++ 2 files changed, 15 insertions(+) diff --git a/masterenv.yml b/masterenv.yml index 846cb71..b25f948 100644 --- a/masterenv.yml +++ b/masterenv.yml @@ -21,3 +21,10 @@ dependencies: - r-optparse - bioconductor-iranges # + - snakemake + - meme + - moods + - biopython + - pybedtools + - matplotlib + - seaborn diff --git a/pipeline.nf b/pipeline.nf index 5ed2eac..071771a 100644 --- a/pipeline.nf +++ b/pipeline.nf @@ -8,6 +8,13 @@ Channel.fromPath(params.config).set {config} bigwig_input.combine(bed_input).set{footprint_in} +//setting default values +params.input="" +params.bed="" +params.genome_fasta="" +params.jaspar_db="" +params.config="" + process footprint_extraction { conda "${path_env}" @@ -40,6 +47,7 @@ process extract_known_TFBS { script: """ + python ${path_bin}/tfbsscan.py --use moods --core ${params.threads} -m ${db} -g ${fasta} -o ./ """ } From 1cdd1c25eb763062f62df934ca31cb3e49115295 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ren=C3=A9=20Wiegandt?= Date: Wed, 5 Dec 2018 09:44:14 -0500 Subject: [PATCH 049/158] added tfbsscan.py to bin --- bin/tfbsscan.py | 755 
++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 755 insertions(+) create mode 100644 bin/tfbsscan.py diff --git a/bin/tfbsscan.py b/bin/tfbsscan.py new file mode 100644 index 0000000..2bae662 --- /dev/null +++ b/bin/tfbsscan.py @@ -0,0 +1,755 @@ +""" +TFBSscan.py produces the data to be used to join the footprint and motif information across genome + +@author: Anastasiia Petrova +@contact: anastasiia.petrova(at)mpi-bn.mpg.de + +""" + +import argparse +import sys +import os +import re +import time +import multiprocessing +import logging +import subprocess +from Bio import SeqIO +import Bio.SeqIO.FastaIO as bio +import textwrap +import MOODS.scan +import MOODS.tools +import MOODS.parsers + +logger = logging.getLogger('mytool') +logger.setLevel(logging.INFO) + +formatter = logging.Formatter("%(asctime)s : %(message)s", "%Y-%m-%d %H:%M") + +fh = logging.FileHandler('final_log.txt') +fh.setLevel(logging.INFO) +fh.setFormatter(formatter) +logger.addHandler(fh) + +ch = logging.StreamHandler() +ch.setLevel(logging.INFO) +ch.setFormatter(formatter) +logger.addHandler(ch) + +#catch all the information about input and output files as well as information on the used tool (fimo or moods) +def parse_args(): + parser = argparse.ArgumentParser(prog = 'mytool_ver6', description = textwrap.dedent(''' + + This script takes a list of motifs loaded from jaspar.genereg.net as a combined text file in MEME or .PFM format, a genome file in FASTA format and optionally a .bed file (the one you want to be merged with the whole genome file) as input. If you want to merge a .bed file with the whole genome file, please enter --bed_file or -b before your .bed file. The tool will provide a file output_merge.fa, which you can also use for your research later on. If you already have a merged file, please give this one as genome file input. If there are several motifs in the input file, the tool will create a separate output file for each motif. Choose if you want to use fimo or moods with --use; by default this script uses fimo. Please note that the tool cannot provide the calculation of q values with fimo due to the text mode that fimo needs to use. The tool sends the merged genome file and motifs to fimo or moods, saves the sorted output for each of the given motifs as moods/fimo_output_[alternate name and id of the motif].txt in the output directory, then calculates the start and the end as real positions on the chromosome and writes this information to the output files. The columns in the output file are: chromosome, start, end, the name and score of TF. If a .bed file was given as input, the tool will also add the additional columns from it to the output. If the output file is empty, there were no matches within the given genome regions. Please note, if you want to have all intermediate output files, enter --clean nothing + + '''), epilog='That is what you need to make this script work for you.
Enjoy it') + + required_arguments = parser.add_argument_group('required arguments') + + required_arguments.add_argument('-m', '--motifs', help='file in MEME format with motifs loaded from jaspar.genereg.net') + required_arguments.add_argument('-g', '--genome', help='a whole genome file or regions of interest in FASTA format to be scanned with motifs') + + #all other arguments are optional + parser.add_argument('-o', '--output_directory', default='output', const='output', nargs='?', help='output directory, default ./output/') + parser.add_argument('-b', '--bed_file', nargs='?', help='a .bed file to be merged with the whole genome file to find regions of interest') + parser.add_argument('--use', '--use_tool', default='fimo', const='fimo', nargs='?', choices=['fimo', 'moods'], help='choose the tool to work with, default tool is fimo') + parser.add_argument('--clean', nargs='*', choices=['nothing', 'all', 'cut_motifs', 'fimo_output', 'merge_output', 'moods_output'], dest='cleans', help='choose the files you want to delete from the output directory, the default is deleting all the temporary files from the directory') + parser.add_argument('--fimo', help='enter additional options for fimo using = inside "", for example fimo="--norc" to not score the reverse complement DNA strand. By default the --text mode is used and the calculation of the q values due to the --text mode is not possible') + parser.add_argument('--cores', type=int, help='number of cores this tool is allowed to use, by default the tool uses 2 cores', default=2) + parser.add_argument('-p', '--p_value', type=float, help='enter the p value, the default p value is 0.0001. Please note that if you enter the p value using --fimo="--thresh ..." as well, the one within --fimo call will be used', default=0.0001) + parser.add_argument('--resolve_overlaps', action='store_true', help='delete overlaps with greater p value, by default no overlaps are deleted') + parser.add_argument('--hide_info', action='store_true', help='while working with data write the information only into ./log.txt') + parser.add_argument('--moods_bg', nargs='+', type=float, help='set the bg for moods, by default moods uses the bg 0.25 0.25 0.25 0.25') + + args = parser.parse_args() + return args + +def check_directory(directory): + if not os.path.exists(directory): + os.makedirs(directory) + logger.info('a new directory ' + directory + ' was created') + +#merge the whole genome with the regions mentioned in .bed file +def merge(genome, bed_file, output_directory): + logger.info('the merging of files ' + genome + ' and ' + bed_file + ' will end soon, the result file is output_merge.fa') + output_merge = os.path.join(output_directory, "output_merge.fa") + os.system("bedtools getfasta -fi " + genome + " -bed " + bed_file + " -fo " + output_merge) + return output_merge + +#split the motifs into one file each +def split_motifs(motifs, output_directory, usage): + logger.info("the file with motifs " + motifs + " will be checked for motifs and if needed split into files each containing only one motif") + + first_line = subprocess.getoutput("head -1 " + motifs) #find the first line of the input file + + if usage == "moods": + if first_line.startswith(">"): + #the motif file probably has the .pfm format, try to read and split it + splitted_motifs = read_pfm(motifs, output_directory) + else: #maybe the file with motifs is in MEME format, so try to convert it + logger.info("the input file has not the expected format, I will try to convert it to .pfm format") + splitted_motifs =
convert_meme_to_pfm(motifs, output_directory) + + elif usage == "fimo": + if first_line.startswith("MEME version"): + #the motifs file has probably the MEME format, try to read and split it + splitted_motifs = read_meme(motifs, output_directory) + + #if the there was a convertion before, delete all the .pfm files as we don't need them + for filename in os.listdir(output_directory): + if filename.endswith(".pfm"): + remove_file(os.path.join(output_directory, filename)) + + else: #maybe the file with motifs is in .pfm format, so try to convert is + logger.info("the input file has not the expected format, I will try to convert it to MEME format") + splitted_motifs = convert_pfm_to_meme(motifs, output_directory) + + return splitted_motifs + +def read_pfm(motifs, output_directory): + splitted_motifs = [] #to save the names of files after splitting + motif = [] #to save the motif itself, which will be written to the file + + with open(motifs) as read_file: + lines = 0 + for line in read_file: + #as the motif has first line with the name and 4 lines with information, if the 5th line is something else than the name of the next motif, the exit will be forced + if lines == 5 and not line.startswith(">"): + logger.info('please make sure that the file with motifs has a right format and the number of lines is right in the motif file') + sys.exit() + else: + if line.startswith(">"): + if 'written_file' in locals(): + written_file.write(''.join(motif)) + motif = [] + lines = 0 + written_file.close() + + motif_alternate_name = check_name(re.split(' ', line)[1].rstrip()) + motif_id = re.split(' ', line[1:])[0] #[1:] meands do not use the first character + motif_name = os.path.join(output_directory, motif_alternate_name + '_' + motif_id + '.pfm') + + splitted_motifs.append(motif_name) + written_file = open(motif_name, 'w') + + if lines >= 1 and lines <= 4: #one motif has 5 lines, the first consists the name, the next 4 - the information we need to proceed the data within moods + motif.append(line) + + lines = lines + 1 + written_file.write(''.join(motif)) + written_file.close() + + return splitted_motifs + +def read_meme(motifs, output_directory): + splitted_motifs = [] #to save the names of files after splitting + motif = [] #to save the motif itself, which will be written to the file + head = [] #define a list for header, as fimo needs a header in each motif file it proceedes + + with open(motifs) as read_file: + lines = 0 + for line in read_file: + #make the head part + if lines <= 8: + if lines == 0 and not line.startswith("MEME version"): + logger.info('please make sure that the file with motifs has a right format and the number of lines is right in the motif file') + sys.exit() + + head.append(line) + else: + #search for motifs and save each to another file + if line.startswith("MOTIF"): + if 'written_file' in locals(): + written_file.write(''.join(motif)) + motif = [] + written_file.close() + + #the alternate name will be checked for validity and the invalid chars will be replaced with '_' + if len(re.split(' ', line.rstrip())) == 3: #in the input motif file the motif id and the alternate name are splitted using the tab, otherwise they are splitted using _, but we do not want to change it if so + motif_alternate_name = check_name(re.split(' ', line)[2].rstrip()) + motif_id = re.split(' ', line)[1] + motif_name = os.path.join(output_directory, motif_alternate_name + '_' + motif_id + '.meme') + else: + motif_alternate_name = check_name(re.split(' ', line)[1].rstrip()) + motif_name = 
os.path.join(output_directory, motif_alternate_name + '.meme') + + #make a list with all the motif names to know which files to iterate when fimo is called + splitted_motifs.append(motif_name) + + written_file = open(motif_name, 'w') + written_file.write(''.join(head)) + + motif.append(line) + + lines = lines + 1 + + #write the last motif + written_file.write(''.join(motif)) + written_file.close() + + read_file.close() + + return splitted_motifs + +def convert_meme_to_pfm(motifs, output_directory): + #i can only convert the file to pfm if the motifs file is in MEME format + + splitted_motifs = [] #to save the names of files after splitting + rows = [[] for row in range(4)] + + with open(motifs) as read_file: + lines = 0 + for line in read_file: + if lines == 0 and not line.startswith("MEME version"): + logger.info('please make sure that the file with motifs has a right format and the number of lines is right in the motif file') + sys.exit() + else: + #search for motifs and save each to another file + if line.startswith("MOTIF"): + + if 'written_file' in locals(): + for row in rows: + written_file.write('\t'.join(row) + '\n') + + rows = [[] for row in range(4)] + + written_file.close() + + #the alternate name will be checked for validity and the invalid chars will be replaced with '_' + if len(re.split(' ', line.rstrip())) == 3: #in the input motif file the motif id and the alternate name are splitted using the tab, otherwise they are splitted using _, but we do not want to change it if so + motif_alternate_name = check_name(re.split(' ', line)[2].rstrip()) + motif_id = re.split(' ', line)[1] + motif_name = os.path.join(output_directory, motif_alternate_name + '_' + motif_id + '.pfm') + + else: + motif_alternate_name = check_name(re.split(' ', line)[1].rstrip()) + motif_name = os.path.join(output_directory, motif_alternate_name + '.pfm') + + #make a list with all the motif names to know which files to iterate when fimo is called + splitted_motifs.append(motif_name) + + written_file = open(motif_name, 'w') + + elif line.startswith("letter-probability matrix"): + columns = int(re.split(' ', re.split('w= ', line)[1])[0]) #find the number of columns from the line out of the motifs file + nsites = int(re.split(' ', re.split('nsites= ', line)[1])[0]) #find the nsites to count the frequency count for .pfm file + + elif line.startswith(' '): #each line with information about frequency starts in MEME format with ' ' + for i in range(len(rows)): + rows[i].append(str(round(float(re.findall(r'\S+', line)[i])*nsites))) #split the line, do not mention how much whitespaces are in between, multiply it with nsites and save it to the corresponding row + + lines = lines + 1 + + #write the last motif + for row in rows: + written_file.write('\t'.join(row) + '\n') + + written_file.close() + read_file.close() + + return splitted_motifs + +def convert_pfm_to_meme(motifs, output_directory): + #i can only convert the file to meme, if motifs file is in .pfm format + + #first we need to split the pfm motifs as the jaspar2meme does not work with the files containing several motifs, but with the directory consisting files each with only one motif in pfm format + pfm_motifs = read_pfm(motifs, output_directory) + + converted_name = os.path.join(output_directory, "converted_motifs.meme") + + os.system("jaspar2meme -pfm " + output_directory + " > " + converted_name) + + #need to call split motifs for meme file + splitted_motifs = split_motifs(converted_name, output_directory, "fimo") + + remove_file(converted_name) + return 
splitted_motifs + +#if there are chars that are not allowed, they will be replaced with '_', to the possibly invalid names there will be added '_' at the beginning of the name +def check_name(name_to_test): + badchars= re.compile(r'[^A-Za-z0-9_. ]+|^\.|\.$|^ | $|^$') + badnames= re.compile(r'(aux|com[1-9]|con|lpt[1-9]|prn)(\.|$)') + + #replace all the chars that are not allowed with '_' + name = badchars.sub('_', name_to_test) + + #check for the reserved by the os names + if badnames.match(name): + name = '_' + name + return name + +#use fimo to make a file +def call_fimo(fimo_data, p_value, one_motif, genome, output_directory): + + #make the filename for the fimo output + fimo_output_file = os.path.join(output_directory, "fimo_output_" + os.path.splitext(os.path.basename(one_motif))[0] + ".txt") + fimo_output_unsorted = os.path.join(output_directory, "fimo_output_unsorted_" + os.path.splitext(os.path.basename(one_motif))[0] + ".txt") + + #check if user needs special options for the fimo + if fimo_data != None: + fimo_data = fimo_data + " --thresh " + str(p_value) + " " + else: + fimo_data = "--thresh " + str(p_value) + " " #add the passed p value to the fimo options + + #call fimo for this motif and save the output to a temporary file + send_to_fimo = "fimo --text --no-qvalue " + fimo_data + one_motif + " " + genome + " > " + fimo_output_unsorted + + logger.info('fimo proceed the data using this call ' + send_to_fimo) + + fimo_stdout = subprocess.getoutput(send_to_fimo) + + #the normal output from fimo starts with Using motif ..., so print it from the logger, otherwise print what else fimo says + if fimo_stdout.startswith("Using") and re.split('\n', fimo_stdout)[1]: + logger.info('info from fimo: ' + re.split('\n', fimo_stdout)[0].rstrip()) + logger.info('info from fimo: ' + re.split('\n', fimo_stdout)[1].rstrip()) + else: #there were some problems with fimo, so we want to see what they were + logger.info('info from fimo: ' + fimo_stdout) + + + if not os.path.isfile(fimo_output_unsorted): + logger.info('the usage of fimo was crashed, there is no required output file, the exit is forced') + sys.exit() + + if os.stat(fimo_output_unsorted).st_size == 0: #if the output of fimo is empty + fimo_output_unsorted = fimo_output_unsorted.replace('unsorted_', '') + return fimo_output_unsorted + else: + #if the file was converted from pfm, the second column contains the positions, so we want to sort using this column, and not the next one + second_line = subprocess.getoutput("sed -n '2{p;q}' " + fimo_output_unsorted) + if re.split('\t', second_line)[2].startswith("chr"): #the re.split[1] is a ' ', so take the [2] + os.system("cat " + fimo_output_unsorted + " | sort -k 2 -V > " + fimo_output_file) + else: + #we are sorting after the third column, which looks like chr1:123-126, -V means it will sort the digitals and not the strings + os.system("cat " + fimo_output_unsorted + " | sort -k 3 -V > " + fimo_output_file) + + #make sure the output of fimo exists + if not os.path.isfile(fimo_output_file): + logger.info('the sorting of the output file from the fimo was crashed, the exit is forced') + sys.exit() + else: + return fimo_output_file + +def call_moods(one_motif, genome, output_directory, p_value, moods_bg): + + # setting standard parameters for moods + pseudocount = 0.0001 + + if moods_bg == None: + bg = MOODS.tools.flat_bg(4) + else: + bg = tuple(moods_bg) + + logger.info("moods will work with the p_value " + str(p_value) + " and the bg " + str(bg)) + + motif_name = os.path.basename(one_motif) + + 
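	# (Editorial annotation, not part of the original patch:) the block below is the core of
	# the MOODS scan. pfm_to_log_odds() converts the .pfm count matrix into a log-odds matrix
	# against the background bg, the reverse complement is appended so that both strands are
	# scanned, and MOODS.tools.threshold_from_p() translates the requested p-value into a
	# score threshold before MOODS.scan.Scanner is run over every record of the FASTA file.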
moods_output_unsorted_name = os.path.join(output_directory, "moods_output_unsorted_" + os.path.splitext(motif_name)[0] + ".txt") + moods_output_file_unsorted = open(moods_output_unsorted_name, 'w') + + moods_output_name = os.path.join(output_directory, "moods_output_" + os.path.splitext(motif_name)[0] + ".txt") + moods_output_file = open(moods_output_name, 'w') + + matrix_names = [os.path.basename(one_motif)] + + matrices = [] + matrices_rc = [] + + valid, matrix = pfm_to_log_odds(one_motif, bg, pseudocount) + + key_for_bed_dict = '' + + if valid: + + logger.info("please be patient, moods is working on the data") + + matrices.append(matrix) + matrices_rc.append(MOODS.tools.reverse_complement(matrix,4)) + matrices_all = matrices + matrices_rc + thresholds = [MOODS.tools.threshold_from_p(m, bg, p_value, 4) for m in matrices_all] + + scanner = MOODS.scan.Scanner(7) + scanner.set_motifs(matrices_all, bg, thresholds) + + with open(genome) as handle: + + seq_iterator = bio.SimpleFastaParser(handle) + + for header, seq in seq_iterator: + + header_splitted = re.split(r':', header) + + if len(header_splitted) == 1: #if there are no positions given + header = header + ":0-" #set the first position as 0 and split it once more + header_splitted = re.split(r':', header) + logger.info("moods works with " + header) + else: #the given genome file is a file with peaks, so use the header of the peak as a key to search in the bed dictionary for additional information later on + key_for_bed_dict = header + + chromosom = header_splitted[0] + positions = re.split(r'-', header_splitted[-1]) + + results = scanner.scan(seq) + + fr = results[:len(matrix_names)] #forward strand + rr = results[len(matrix_names):] #reverse strand + + results = [[(r.pos, r.score, '+', ()) for r in fr[i]] + + [(r.pos, r.score, '-', ()) for r in rr[i]] for i in range(len(matrix_names))] #use + and - to indicate strand + + for (matrix, matrix_name, result) in zip(matrices, matrix_names, results): + + motif_id = re.split(r'_', matrix_name)[-1] #find the id of the given morif + motif_alternate_name = matrix_name.replace(motif_id, '')[:-1] #the alternate name of the motif is the name of the file without id and with cutted last character, that is _ + + if len(matrix) == 4: + l = len(matrix[0]) + if len(matrix) == 16: + l = len(matrix[0] + 1) + for r in sorted(result, key=lambda r: r[0]): + strand = r[2] + pos = r[0] + hitseq = seq[pos:pos+l] #sequence + #score = r[1] + score = format(r[1], '.15f') #round to 15 digits after floating point, already type str + + if key_for_bed_dict != '': + start = pos + 1 + end = pos + len(hitseq) + chromosom = key_for_bed_dict #instead of only the name of chromosom write the key to search in the bed_file + else: + start = int(positions[0]) + pos + 1 + end = start + len(hitseq) - 1 + + #moods_output_file_unsorted.write('\t'.join([motif_id, motif_alternate_name, chromosom, str(start), str(end), strand, str(score)]) + '\n') + moods_output_file_unsorted.write('\t'.join([motif_id, motif_alternate_name, chromosom, str(start), str(end), strand, score]) + '\n') + + #now sort the output of moods + os.system("cat " + moods_output_unsorted_name + " | sort -k 1 -V > " + moods_output_name) + + moods_output_file_unsorted.close() + moods_output_file.close() + + return moods_output_name + + else: + logger.info("The input for moods was not validated by the MOODS.parsers.pfm. 
Please check if it has the right format (note that the MOODS accepts only the old version of .pfm files, that is one without the header containing the name and id of the motif)") + sys.exit() + +#help function for the moods call, convert pfm to log odds +def pfm_to_log_odds(filename, bg, pseudocount): + if pfm(filename): + mat = MOODS.parsers.pfm_to_log_odds(filename, bg, pseudocount) + if len(mat) != 4: #if something went wrong, the empty list will be returned + return False, mat + else: + return True, mat + else: + logger.info('please make sure the motif file has a .pfm format needed for moods') + sys.exit() + +#help function for the moods call, check if the file is in a pfm format using moods +def pfm(filename): + mat = MOODS.parsers.pfm(filename) + if len(mat) != 4: + return False + else: + return True + +# calculate the real positions of TFs, if needed, resolve the overlaps, and write to the output file +def write_output_file(input_file, bed_dictionary, resolve_overlaps): + + if os.path.basename(input_file).startswith("moods"): + name_without_moods_or_fimo = input_file.replace('moods_output_', '') + used_tool = "moods" + else: + name_without_moods_or_fimo = input_file.replace('fimo_output_', '') + used_tool = "fimo" + + output_file_name = os.path.splitext(name_without_moods_or_fimo)[0] + ".bed" + + logger.info('writing the output file ' + output_file_name) + output_file = open(output_file_name, 'w') + + #calculate the real positions of TFs and write the information in the output file + with open(input_file) as read_file: + + overlap = [] + printed_line = [] + last_line = [] + key_for_bed_dict = '' + + for line in read_file: + if not line.startswith('#'): + + line_to_write = [] + + line_array = re.split(r'\t', line.rstrip('\n')) + + chromosom_and_positions = re.split(r':', line_array[2]) + if len(chromosom_and_positions) == 1: #the whole genome was given, there is nothing to split + chromosom = line_array[2] + start = line_array[3] + end = line_array[4] + else: + positions = re.split(r'-', chromosom_and_positions[-1]) + chromosom = chromosom_and_positions[0] + start = str(int(positions[0]) + int(line_array[3])) + end = str(int(positions[0]) + int(line_array[4])) + key_for_bed_dict = line_array[2] #use only if there is a bed_dictionary in input + + #------- these are 5 needed columns to succesfully proceed the data + + name = os.path.splitext(os.path.basename(name_without_moods_or_fimo))[0] + + score = line_array[6] + + line_to_write.extend([chromosom, start, end, name, score]) + #------ here the additional information coule be added to the output file + + strand_inf = line_array[5] + line_to_write.append(strand_inf) + + if used_tool == "fimo": + p_value = line_array[7] + line_to_write.append(p_value) + + #if the dictionary is not empty check for the information corresponding to these positions + if bed_dictionary and key_for_bed_dict in bed_dictionary: + line_to_write.append('\t'.join(bed_dictionary[key_for_bed_dict])) + + line_to_write.insert(0, "write") #insert a flag to know if the line should be written or not + + last_line = line_to_write #save the line in case it is the last line and if due to the check_overlap it could not be printed + + if resolve_overlaps: + overlap, line_to_write, printed_line = check_overlap(line_to_write, overlap, printed_line, output_file) + + write_line_not_overlap(output_file, line_to_write) + + if not last_line[0].startswith('write'): #it could be that the write flag was deleted from the last_line so append it back + overlap.insert(0, "write") + + #if 
there is already any printed line, check if the saved last line was already printed. Otherwise print it + if resolve_overlaps: + if printed_line: + if last_line[1] != printed_line[1] or last_line[2] != printed_line[2]: + write_line_not_overlap(output_file, last_line) + + output_file.close() + +def write_line_not_overlap(output_file, line_to_write): + if line_to_write: #if line_to_write is not empty + if line_to_write[0].startswith('write'): #if the line does not contain an overlap, it contains the flag "write" at the first position + line_to_write.pop(0) #delete the flag + output_file.write('\t'.join(line_to_write) + '\n') + +def check_overlap(line_to_write, overlap, printed_line, output_file): + + is_overlap = None + + if not overlap: #if the overlap list is empty + is_overlap = False + + else: #if the overlap list is not empty + + if not overlap[0].startswith('write'): #it could be that the write flag was deleted from the overlap so append it back to make sure the next if clauses run right + overlap.insert(0, "write") + + if line_to_write[1] == overlap[1] and float(line_to_write[2]) < float(overlap[3]): #the current line could overlap the previous because the start of the current line is smaller than the end of the previous one and they are both on the one chromosom + #if p value in the current line is smaller than p value in the previous line (or score is bigger), save the current line as possible overlap for future + if float(line_to_write[5]) > float(overlap[5]): + is_overlap = False + else: + #if the p value in the current line is greater than p value in the previous line or are these both p values the same, the current line will not be printed, but also it will not be saved + line_to_write.pop(0) + is_overlap = None + else: #it is an other chromosom or the start at the current line is greater or the same as the end of the previous one + if printed_line != overlap: + is_overlap = True + else: + is_overlap = False + + if is_overlap == False: + overlap = line_to_write #save the current line + line_to_write.pop(0) #do not print anything due to deleting the flag ("write") + elif is_overlap == True: + if not overlap[0].startswith('write'): + overlap.insert(0, "write") + printed_line = overlap #the previous line is saved as temporary line to print it later on + overlap = line_to_write #save the current line + line_to_write = printed_line #print the temporary saved line + + return overlap, line_to_write, printed_line + +def remove_file(file): + if os.path.isfile(file): + os.remove(file) + +def clean_directory(cleans, output_directory, motif, tool_output_file): + + fimo_output_unsorted = os.path.join(tool_output_file.replace("fimo_output", "fimo_output_unsorted")) + moods_output_unsorted = os.path.join(tool_output_file.replace("moods_output", "moods_output_unsorted")) + + for clean in cleans: + if clean == 'all': + remove_file(motif) + remove_file(tool_output_file) + remove_file(fimo_output_unsorted) + remove_file(moods_output_unsorted) + elif clean == 'fimo_output': + if os.path.basename(tool_output_file).startswith("fimo"): + remove_file(tool_output_file) + remove_file(fimo_output_unsorted) + elif clean == 'cut_motifs': + remove_file(motif) + elif clean == 'moods_output': + if os.path.basename(tool_output_file).startswith("moods"): + remove_file(tool_output_file) + remove_file(fimo_output_unsorted) + +def tool_make_output(usage, motif, genome, output_directory, cleans, p_value, bed_dictionary, fimo_data, resolve_overlaps, moods_bg): + try: + if usage == "fimo": + tool_output_file = 
call_fimo(fimo_data, p_value, motif, genome, output_directory) + elif usage == "moods": + tool_output_file = call_moods(motif, genome, output_directory, p_value, moods_bg) + + output = write_output_file(tool_output_file, bed_dictionary, resolve_overlaps) + finally: + clean_directory(cleans, output_directory, motif, tool_output_file) + +def multiprocess(motifs, genome, output_directory, cleans, fimo_data, p_value, bed_dictionary, cpu_count, resolve_overlaps, usage, moods_bg): + + if cleans == None: + cleans = ['all'] + + pool = multiprocessing.Pool(cpu_count) #by default is cpu_count 2 + + length = len(motifs) #the number of the motifs to find the percentage of the job that was done + step = 100/length #the percentage that should be added after the job with each single motif is done + + tasks = [] #here the jobs done by processes are saved + + for motif in motifs: + tasks.append(pool.apply_async(tool_make_output, args = (usage, motif, genome, output_directory, cleans, p_value, bed_dictionary, fimo_data, resolve_overlaps, moods_bg, ))) + + tasks_done = sum([task.ready() for task in tasks]) #the number of the processes that ended their job + + #check the number of the processes that are ready till the number of them reaches the number of processes started in the pool + while tasks_done < len(tasks): + #if the number of ready processes has changed, save the new number and print the percentage of the job done + if sum([task.ready() for task in tasks]) != tasks_done: + tasks_done = sum([task.ready() for task in tasks]) + sys.stdout.write("%-100s| %d%% \r" % (''*tasks_done, step*tasks_done)) + sys.stdout.flush() + sys.stdout.write("\n") + #update the number of ready processes each 0.05 seconds + time.sleep(0.05) + + pool.close() + pool.join() #make sure all the processes are done and exit + + #the processes should not delete the merged genome file + #so make sure if this file is needed, otherwise delete it + for clean in cleans: + if clean == 'all' or clean == 'merge_output': + for filename in os.listdir(output_directory): + if filename == "output_merge.fa": + remove_file(genome) + + if clean != 'nothing': + logger.info('the directory ' + output_directory + ' was cleaned, only the required files were left') + +def make_bed_dictionary(bed_file): + + bed_dictionary = {} + + with open(bed_file) as read_bed_file: + for bed_line in read_bed_file: + bed_line_array = re.split(r'\t', bed_line.rstrip('\n')) + if bed_line_array[1].isdigit() and bed_line_array[2].isdigit() and int(bed_line_array[1]) <= int(bed_line_array[2]): #in the real bedfile the second column is a start position, and the third column is an end position, so we are checking if these are integers and if the start position is smaller than the end one + key = bed_line_array[0] + ":" + bed_line_array[1] + "-" + bed_line_array[2] + value = [] + for i in range(3, len(bed_line_array)): + value.append(bed_line_array[i]) + + bed_dictionary[key] = value + else: #this is not a bed file, force exit + logger.info('please make sure the input bed file has a right format, the problem occured on the line ' + bed_line) + sys.exit() + + read_bed_file.close() + + return bed_dictionary + +def is_fasta(check_fasta): + if not os.path.isfile(check_fasta): + logger.info('there is no file with genome, the exit is forced') + sys.exit() + else: + # modified code from https://stackoverflow.com/questions/44293407/how-can-i-check-whether-a-given-file-is-fasta + + with open(check_fasta, "r") as handle: + fasta = SeqIO.parse(handle, "fasta") + return any(fasta) # False 
when `fasta` is empty, i.e. wasn't a FASTA file + +def check_existing_input_files(args): + if not args.motifs or not args.genome: + logger.info('there is no satisfied input, please enter --help or -h to learn how to use this tool') + sys.exit() + elif not is_fasta(args.genome): + logger.info('please make sure the input genome file has a fasta format') + sys.exit() + #check if the file with motifs exists + elif not os.path.isfile(args.motifs): + logger.info('there is no file with motifs, the exit is forced') + sys.exit() + #if the bedfile was given as input, check if this file exists + elif args.bed_file: + if not os.path.isfile(args.bed_file): + logger.info('there is no such bed file ' + args.bed_file + ', the exit is forced') + sys.exit() + +def check_fimo_version(): + fimo_version = subprocess.getoutput("fimo --version") #works only on python 3.4 + #fimo_version = int(fimo_version.replace('.', '')) #replace the . in the version to be able to compare it as int + if fimo_version < "4.12.0": + logger.info('please make sure you are using fimo version 4.12.0 or the newer one') + sys.exit() + +def main(): + + + args = parse_args() + + if args.use == "fimo": + check_fimo_version() + + #if user do not want to see the information about the status of jobs, remove the handler, that writes to the terminal + if args.hide_info: + #logger.disabled = True + logger.removeHandler(ch) + + check_existing_input_files(args) + + #check if there is an existing directory that user gave as input, otherwise create this directory from the path provided from the user + check_directory(args.output_directory) + + splitted_motifs = split_motifs(args.motifs, args.output_directory, args.use) + + #check if there is a .bed file to merge the genome file with. If so, merge them + if args.bed_file: + bed_dictionary = make_bed_dictionary(args.bed_file) + args.genome = merge(args.genome, args.bed_file, args.output_directory) + else: + bed_dictionary = {} + + #if the usage is moods, call moods, otherwise call fimo + multiprocess(splitted_motifs, args.genome, args.output_directory, args.cleans, args.fimo, args.p_value, bed_dictionary, args.cores, args.resolve_overlaps, args.use, args.moods_bg) + + for handler in logger.handlers: + handler.close() + logger.removeFilter(handler) + +if __name__ == "__main__": + main() \ No newline at end of file From b3fabce937f2206dbebbdd12f2a72226bac31e4b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ren=C3=A9=20Wiegandt?= Date: Wed, 5 Dec 2018 10:25:49 -0500 Subject: [PATCH 050/158] added and moved parameter in config --- config/cluster.config | 1 - config/motif_estimation.config | 7 ++++--- nextflow.config | 13 +++++++------ 3 files changed, 11 insertions(+), 10 deletions(-) diff --git a/config/cluster.config b/config/cluster.config index 3dce063..0a21a05 100644 --- a/config/cluster.config +++ b/config/cluster.config @@ -3,5 +3,4 @@ params{ kmer=10 aprox_motif_len=10 - threads=1 } diff --git a/config/motif_estimation.config b/config/motif_estimation.config index a00fc86..d280b3e 100644 --- a/config/motif_estimation.config +++ b/config/motif_estimation.config @@ -1,11 +1,12 @@ params { //bed_to_clustered_fasta min_seq = 10 - + //glam2 - motif_min_len = 10 + motif_min_len = 8 motif_max_len = 20 + interation = 10000 //tomtom tomtom_treshold = 0.01 -} \ No newline at end of file +} diff --git a/nextflow.config b/nextflow.config index 2e84338..c0a683b 100644 --- a/nextflow.config +++ b/nextflow.config @@ -1,12 +1,13 @@ createTimeout = 60 - -env { - path_bin = $workflow.projectDir/bin - path_env = 
$workflow.projectDir/masterenv.yml -} +params.threads=1 //Parameter for scripts! Not for nextflow processes. includeConfig '$workflow.projectDir/config/peak_calling.config' -includeConfig '$workflow.projectDir/config/1.2.config' +includeConfig '$workflow.projectDir/config/filter_unknown_motifs.config' includeConfig '$workflow.projectDir/config/cluster.config' includeConfig '$workflow.projectDir/config/motif_estimation.config' includeConfig '$workflow.projectDir/config/create_gtf.config' + +env { + path_bin = $workflow.projectDir/bin + path_env = $workflow.projectDir/masterenv.yml +} From 760d91d2126ca2127f2e1a7a9c077d905d37cf48 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ren=C3=A9=20Wiegandt?= Date: Wed, 5 Dec 2018 10:26:29 -0500 Subject: [PATCH 051/158] added default parameter in script --- pipeline.nf | 34 +++++++++++++++++++++++++++++++++- 1 file changed, 33 insertions(+), 1 deletion(-) diff --git a/pipeline.nf b/pipeline.nf index 071771a..194bf38 100644 --- a/pipeline.nf +++ b/pipeline.nf @@ -15,6 +15,38 @@ params.genome_fasta="" params.jaspar_db="" params.config="" +//peak_calling + params.window_length = 200 + params.step = 100 + params.percentage = 0 + +//filter_unknown_motifs + params.min_size_fp=10 + params.max_size_fp=100 + +//clustering + params.sequence_coverage=8 + + params.kmer=10 + params.aprox_motif_len=10 + +//motif_estimation + //bed_to_clustered_fasta + params.min_seq = 10 // Minimum number of sequences in the fasta-files for glam2 + + //glam2 + params.motif_min_len = 8 // Minimum length of Motifs + params.motif_max_len = 20 // Maximum length of Motifs + params.interation = 10000 // Number of iterations done by glam2. A higher iteration number gives a more accurate result but a higher runtime. + + //tomtom + params.tomtom_treshold = 0.01 // threshold for similarity score. + +//creating_gtf + params.organism="homo_sapiens" + params.tissue="" + + process footprint_extraction { From ef3343846e651c7e69b4c094ca131a1661a056e3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ren=C3=A9=20Wiegandt?= Date: Wed, 5 Dec 2018 10:29:45 -0500 Subject: [PATCH 052/158] moved uropa.config to config dir --- uropa.config => config/uropa.config | 0 nextflow.config | 1 + 2 files changed, 1 insertion(+) rename uropa.config => config/uropa.config (100%) diff --git a/uropa.config b/config/uropa.config similarity index 100% rename from uropa.config rename to config/uropa.config diff --git a/nextflow.config b/nextflow.config index c0a683b..a3c06eb 100644 --- a/nextflow.config +++ b/nextflow.config @@ -1,5 +1,6 @@ createTimeout = 60 params.threads=1 //Parameter for scripts! Not for nextflow processes.
+params.config="${workflow.projectDir}/config/uropa.config" includeConfig '$workflow.projectDir/config/peak_calling.config' includeConfig '$workflow.projectDir/config/filter_unknown_motifs.config' From 4ff3088b0c5e39b8523cde20e92d2ef5e2fc1e0c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ren=C3=A9=20Wiegandt?= Date: Wed, 5 Dec 2018 10:53:43 -0500 Subject: [PATCH 053/158] added help message --- pipeline.nf | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/pipeline.nf b/pipeline.nf index 194bf38..3610955 100644 --- a/pipeline.nf +++ b/pipeline.nf @@ -47,6 +47,14 @@ params.config="" params.tissue="" +if (params.input == "" || params.bed == "" || params.genome_fasta == "" || params.jaspar_db == "" || params.config == ""){ +log.info """ +NAME +Usage: nextflow run pipeline.nf --input [INPUT-file] --bed [INPUT-bed] --genome_fasta [path to file] --jaspar_db [path to motif database as meme-file] +""" +} + + process footprint_extraction { conda "${path_env}" From 01f34921627f921002604c4c89e30079c75180ad Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ren=C3=A9=20Wiegandt?= Date: Thu, 6 Dec 2018 03:47:49 -0500 Subject: [PATCH 054/158] added new config file 'filter_unknown_motifs' --- config/filter_unknown_motifs.config | 4 ++++ 1 file changed, 4 insertions(+) create mode 100644 config/filter_unknown_motifs.config diff --git a/config/filter_unknown_motifs.config b/config/filter_unknown_motifs.config new file mode 100644 index 0000000..79033c4 --- /dev/null +++ b/config/filter_unknown_motifs.config @@ -0,0 +1,4 @@ +params{ + min_size_fp=10 + max_size_fp=100 +} From 8476db5a21a5891f34574ef55375289f25caed66 Mon Sep 17 00:00:00 2001 From: Schultheis Date: Thu, 6 Dec 2018 09:51:41 +0100 Subject: [PATCH 055/158] implemented motif_occurence for cutoff computation --- bin/reduce_bed.R | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/bin/reduce_bed.R b/bin/reduce_bed.R index d539399..b3c1c6d 100644 --- a/bin/reduce_bed.R +++ b/bin/reduce_bed.R @@ -10,8 +10,8 @@ option_list <- list( make_option(opt_str = c("-c", "--clean"), default = TRUE, help = "Delete all temporary files. Default = %default", metavar = "logical"), make_option(opt_str = c("-s", "--min_seq_length"), default = NULL, help = "Remove sequences below this length. Defaults to the maximum value of motif and kmer and can not be lower.", metavar = "integer", type = "integer"), make_option(opt_str = c("-ko", "--minoverlap_kmer"), default = NULL, help = "Minimum required overlap between kmer to merge kmer. Used to create reduced sequence ranges. Can not be greater than kmer length. Default = kmer - 1", metavar = "integer", type = "integer"), - make_option(opt_str = c("-mo", "--minoverlap_motif"), default = NULL, help = "Minimum required overlap between motif and kmer to consider kmer significant. Used for kmer cutoff calculation. Can not be greater than motif and kmer length. Default = ceiling(motif / 2)", metavar = "integer", type = "integer") - # TODO more args + make_option(opt_str = c("-mo", "--minoverlap_motif"), default = NULL, help = "Minimum required overlap between motif and kmer to consider kmer significant. Used for kmer cutoff calculation. Can not be greater than motif and kmer length. Default = ceiling(motif / 2)", metavar = "integer", type = "integer"), + make_option(opt_str = c("-f", "--motif_occurence"), default = 1, help = "Number of motifs per sequence any value above 0. 
Default = %default.") ) opt_parser <- OptionParser(option_list = option_list, @@ -30,10 +30,11 @@ opt <- parse_args(opt_parser) #' @param minoverlap_kmer Minimum required overlap between kmer to merge kmer. Used to create reduced sequence ranges. Can not be greater than kmer length. Default = kmer - 1 #' @param minoverlap_motif Minimum required overlap between motif and kmer to consider kmer significant. Used for kmer cutoff calculation. Can not be greater than motif and kmer length. Default = ceiling(motif / 2) #' @param min_seq_length Must be greater or equal to kmer and motif. Default = max(c(motif, kmer)). +#' @param motif_occurence Define how many motifs are expected per sequence. This value is used during kmer cutoff calculation. Default = 1 meaning that there should be approximately one motif per sequence. #' #' @return reduced bed #' TODO check whether jellyfish is installed -reduce_bed <- function(input, kmer = 10, motif = 10, output = "reduced.bed", threads = NULL, clean = TRUE, minoverlap_kmer = kmer - 1, minoverlap_motif = ceiling(motif / 2), min_seq_length = max(c(motif, kmer))) { +reduce_bed <- function(input, kmer = 10, motif = 10, output = "reduced.bed", threads = NULL, clean = TRUE, minoverlap_kmer = kmer - 1, minoverlap_motif = ceiling(motif / 2), min_seq_length = max(c(motif, kmer)), motif_occurence = 1) { # get number of available cores if (threads == -1) { threads <- parallel::detectCores() @@ -93,7 +94,7 @@ reduce_bed <- function(input, kmer = 10, motif = 10, output = "reduced.bed", thr data.table::setorder(kmer_counts, -V2) # compute number of hits to keep - keep_hits <- significant_kmer(bed_table, kmer = kmer, motif = motif, minoverlap = minoverlap_motif) + keep_hits <- significant_kmer(bed_table, kmer = kmer, motif = motif, minoverlap = minoverlap_motif, motif_occurence = motif_occurence) # reduce kmer reduced_kmer <- reduce_kmer(kmer = kmer_counts, significant = keep_hits) @@ -136,12 +137,16 @@ reduce_bed <- function(input, kmer = 10, motif = 10, output = "reduced.bed", thr #' @param kmer Length of kmer #' @param motif Length of motif #' @param minoverlap Minimum number of bases overlapping between kmer and motif. Must be <= motif & <= kmer. Defaults to ceiling(motif / 2). +#' @param motif_occurence Define how many motifs are expected per sequence. Default = 1 #' #' @return Number of interesting kmer. -significant_kmer <- function(bed, kmer, motif, minoverlap = ceiling(motif / 2)) { +significant_kmer <- function(bed, kmer, motif, minoverlap = ceiling(motif / 2), motif_occurence = 1) { if (minoverlap > kmer || minoverlap > motif) { stop("Kmer & motif must be greater or equal to minoverlap!") } + if (motif_occurence <= 0) { + stop("Motif_occurence must be a numeric value above 0!") + } # minimum sequence length to get all interesting overlaps min_seq_length <- motif + 2 * (kmer - minoverlap) @@ -154,7 +159,7 @@ significant_kmer <- function(bed, kmer, motif, minoverlap = ceiling(motif / 2)) # calculate max possible kmer topx <- sum(seq_lengths - kmer + 1) - return(topx) + return(topx * motif_occurence) } #' Orders kmer table after count descending and keeps all kmer with a cumulative sum below the given significance threshold. @@ -179,6 +184,8 @@ reduce_kmer <- function(kmer, significant) { #' @param threads Number of threads. #' #' @return Data.table with relative positions and width (start, end, width). +#' +#' TODO Include number of motifs per sequence (aka motif_occurence). Attempt to keep best 2 regions for occurence = 2? Probably high impact on performance. 
find_kmer_regions <- function(bed, kmer_counts, minoverlap = 1 , threads = NULL) { if (nchar(kmer_counts[1, 1]) <= minoverlap) { stop("Minoverlap must be smaller than kmer length!") From f0e2cef3c6fe4a0a81144b6dfbbadfa651f753f7 Mon Sep 17 00:00:00 2001 From: Schultheis Date: Thu, 6 Dec 2018 09:56:06 +0100 Subject: [PATCH 056/158] threads = 0 for all cores to match cd-hit --- bin/reduce_bed.R | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/bin/reduce_bed.R b/bin/reduce_bed.R index b3c1c6d..c022b43 100644 --- a/bin/reduce_bed.R +++ b/bin/reduce_bed.R @@ -6,7 +6,7 @@ option_list <- list( make_option(opt_str = c("-k", "--kmer"), default = 10, help = "Kmer length. Default = %default", metavar = "integer"), make_option(opt_str = c("-m", "--motif"), default = 10, help = "Estimated motif length. Default = %default", metavar = "integer"), make_option(opt_str = c("-o", "--output"), default = "reduced.bed", help = "Output file. Default = %default", metavar = "character"), - make_option(opt_str = c("-t", "--threads"), default = 1, help = "Number of threads to use. Use -1 for all available cores. Default = %default", metavar = "integer"), + make_option(opt_str = c("-t", "--threads"), default = 1, help = "Number of threads to use. Use 0 for all available cores. Default = %default", metavar = "integer"), make_option(opt_str = c("-c", "--clean"), default = TRUE, help = "Delete all temporary files. Default = %default", metavar = "logical"), make_option(opt_str = c("-s", "--min_seq_length"), default = NULL, help = "Remove sequences below this length. Defaults to the maximum value of motif and kmer and can not be lower.", metavar = "integer", type = "integer"), make_option(opt_str = c("-ko", "--minoverlap_kmer"), default = NULL, help = "Minimum required overlap between kmer to merge kmer. Used to create reduced sequence ranges. Can not be greater than kmer length. Default = kmer - 1", metavar = "integer", type = "integer"), @@ -25,7 +25,7 @@ opt <- parse_args(opt_parser) #' @param kmer Length of kmer. #' @param motif Estimated motif length. #' @param output Output file -#' @param threads Number of threads. +#' @param threads Number of threads. Default = 1. 0 for all cores. #' @param clean Delete all temporary files. #' @param minoverlap_kmer Minimum required overlap between kmer to merge kmer. Used to create reduced sequence ranges. Can not be greater than kmer length. Default = kmer - 1 #' @param minoverlap_motif Minimum required overlap between motif and kmer to consider kmer significant. Used for kmer cutoff calculation. Can not be greater than motif and kmer length. 
Default = ceiling(motif / 2)
@@ -36,7 +36,7 @@ opt <- parse_args(opt_parser)
 #' TODO check whether jellyfish is installed
 reduce_bed <- function(input, kmer = 10, motif = 10, output = "reduced.bed", threads = NULL, clean = TRUE, minoverlap_kmer = kmer - 1, minoverlap_motif = ceiling(motif / 2), min_seq_length = max(c(motif, kmer)), motif_occurence = 1) {
   # get number of available cores
-  if (threads == -1) {
+  if (threads == 0) {
     threads <- parallel::detectCores()
   }

From 1d466c5417030b9a438ff91685c2482f69447c28 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ren=C3=A9=20Wiegandt?=
Date: Thu, 6 Dec 2018 03:58:32 -0500
Subject: [PATCH 057/158] help message

---
 pipeline.nf | 56 ++++++++++++++++++++++++++++++++++++++++++-----------
 1 file changed, 45 insertions(+), 11 deletions(-)

diff --git a/pipeline.nf b/pipeline.nf
index 3610955..e694ba0 100644
--- a/pipeline.nf
+++ b/pipeline.nf
@@ -6,8 +6,6 @@ Channel.fromPath(params.genome_fasta).into {fa_overlap; fa_scan; fa_overlap_2}
 Channel.fromPath(params.jaspar_db).into {db_for_motivscan; db_for_tomtom}
 Channel.fromPath(params.config).set {config}
-bigwig_input.combine(bed_input).set{footprint_in}
-
 //setting default values
 params.input=""
 params.bed=""
@@ -37,7 +35,7 @@ params.config=""
 //glam2
 params.motif_min_len = 8 // Minimum length of Motifs
 params.motif_max_len = 20 // Maximum length of Motifs
- params.interation = 10000 // Number of Iterations done by glam2. A high iteration number equals higher more accurate but higher runtime.
+ params.interation = 10000 // Number of iterations done by glam2. A high iteration number gives a more accurate result but a higher runtime.
 //tomtom
 params.tomtom_treshold = 0.01 // threshold for similarity score.
@@ -49,18 +47,54 @@ params.config=""

if (params.input == "" || params.bed == "" || params.genome_fasta == "" || params.jaspar_db == "" || params.config == ""){
log.info """
-NAME
-Usage: nextflow run pipeline.nf --input [INPUT-file] --bed [INPUT-bed] --genome_fasta [path to file] --jaspar_db [path to motif database as meme-file]
+Usage: nextflow run pipeline.nf --input [BigWig-file] --bed [BED-file] --genome_fasta [FASTA-file] --jaspar_db [MEME-file]
+
+Required arguments:
+    --input          Path to BigWig-file
+    --bed            Path to BED-file
+    --genome_fasta   Path to genome in FASTA-format
+    --jaspar_db      Path to motif-database in MEME-format
+
+
+Optional arguments:
+  Footprint extraction:
+    --window_length INT (Default: 200)
+    --step          INT (Default: 100)
+    --percentage    INT (Default: 0)
+
+  Filter unknown motifs:
+    --min_size_fp   INT (Default: 10)
+    --max_size_fp   INT (Default: 100)
+
+  Clustering:
+    --sequence_coverage INT (Default: 8)
+    --kmer              INT (Default: 10)
+    --aprox_motif_len   INT (Default: 10)
+
+  Motif estimation:
+    --motif_min_len   INT   Minimum length of Motif (Default: 8)
+    --motif_max_len   INT   Maximum length of Motif (Default: 20)
+    --interation      INT   Number of iterations done by glam2. More iterations: better results, higher runtime. (Default: 10000)
+    --tomtom_treshold float Threshold for similarity score. (Default: 0.01)
+
+  Creating GTF:
+    --organism [homo_sapiens | mus_musculus]
+    --tissues
+
+All arguments can be set in the configuration files.
"""
}

+bigwig_input.combine(bed_input).set{footprint_in}
+
+
 process footprint_extraction {

 conda "${path_env}"

 tag{name}
- publishDir '${out}', mode: 'copy', pattern: '*.bed'
- publishDir '/mnt/agnerds/Rene.Wiegandt/log', mode: 'copy', pattern: '*.log'
+ publishDir "${out}", mode: 'copy', pattern: '*.bed'
+ publishDir "${out}/log", mode: 'copy', pattern: '*.log'

 input:
 set name, file (bigWig), file (bed) from footprint_in

From 688cb007bdb097a8a5c00c18a5986c8c4d05397c Mon Sep 17 00:00:00 2001
From: renewiegandt
Date: Thu, 6 Dec 2018 10:02:21 +0100
Subject: [PATCH 058/158] Update README.md

---
 README.md | 37 +++++++++++++++++++++++++++++++++++++
 1 file changed, 37 insertions(+)

diff --git a/README.md b/README.md
index 4cf5660..d746944 100644
--- a/README.md
+++ b/README.md
@@ -23,6 +23,43 @@ source activate masterenv
 nextflow run pipeline.nf --input [INPUT-file] --bed [INPUT-bed] --genome_fasta [path to file] --jaspar_db [path to motif database as meme-file] --config uropa.config
 ```
 ## Parameter
+```
+Required arguments:
+    --input          Path to BigWig-file
+    --bed            Path to BED-file
+    --genome_fasta   Path to genome in FASTA-format
+    --jaspar_db      Path to motif-database in MEME-format
+
+
+Optional arguments:
+  Footprint extraction:
+    --window_length INT (Default: 200)
+    --step          INT (Default: 100)
+    --percentage    INT (Default: 0)
+
+  Filter unknown motifs:
+    --min_size_fp   INT (Default: 10)
+    --max_size_fp   INT (Default: 100)
+
+  Clustering:
+    --sequence_coverage INT (Default: 8)
+    --kmer              INT (Default: 10)
+    --aprox_motif_len   INT (Default: 10)
+
+  Motif estimation:
+    --motif_min_len   INT   Minimum length of Motif (Default: 8)
+    --motif_max_len   INT   Maximum length of Motif (Default: 20)
+    --interation      INT   Number of iterations done by glam2. More iterations: better results, higher runtime. (Default: 10000)
+    --tomtom_treshold float Threshold for similarity score. (Default: 0.01)
+
+  Creating GTF:
+    --organism [homo_sapiens | mus_musculus]
+    --tissues
+
+ All arguments can be set in the configuration files.
+ ```
+
+
 For further information read the [documentation](https://github.molgen.mpg.de/loosolab/masterJLU2018/wiki)

From 65e17ce0d85d483e878696b7d014bb6638aa031d Mon Sep 17 00:00:00 2001
From: renewiegandt
Date: Thu, 6 Dec 2018 10:02:54 +0100
Subject: [PATCH 059/158] Update README.md

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index d746944..f457df8 100644
--- a/README.md
+++ b/README.md
@@ -22,7 +22,7 @@ source activate masterenv
 ```console
 nextflow run pipeline.nf --input [INPUT-file] --bed [INPUT-bed] --genome_fasta [path to file] --jaspar_db [path to motif database as meme-file] --config uropa.config
 ```
-## Parameter
+## Parameters
 ```
 Required arguments:

From 0bdc1016eba520a8a371600db9cf57c82c9348b2 Mon Sep 17 00:00:00 2001
From: renewiegandt
Date: Thu, 6 Dec 2018 10:03:35 +0100
Subject: [PATCH 060/158] Update README.md

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index f457df8..6431022 100644
--- a/README.md
+++ b/README.md
@@ -20,7 +20,7 @@ source activate masterenv
 ## Quick Start
 ```console
-nextflow run pipeline.nf --input [INPUT-file] --bed [INPUT-bed] --genome_fasta [path to file] --jaspar_db [path to motif database as meme-file] --config uropa.config
+nextflow run pipeline.nf --input [BigWig-file] --bed [BED-file] --genome_fasta [FASTA-file] --jaspar_db [MEME-file]
 ```
 ## Parameters
 ```

From a62154ca9c57826cf163599ec5b27b8ac40cbd25 Mon Sep 17 00:00:00 2001
From: renewiegandt
Date: Thu, 6 Dec 2018 10:34:44 +0100
Subject: [PATCH 061/158] Update README.md

---
 README.md | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index 6431022..f36beca 100644
--- a/README.md
+++ b/README.md
@@ -10,13 +10,23 @@ De novo motif discovery and evaluation based on footprints identified by TOBIAS
 ## Installation
 Start with installing all dependencies listed above.
 Download all files from the [GitHub repository](https://github.molgen.mpg.de/loosolab/masterJLU2018).
-The Nextflow-script needs a conda enviroment to run. To create this enviroment you need the yml-file from the repository.
+The Nextflow script needs a conda environment to run. Nextflow can create the needed environment from the given yml-file.
+On some systems Nextflow exits the run with the following error:
+```
+Caused by:
+  Failed to create Conda environment
+  command: conda env create --prefix --file env.yml
+  status : 143
+  message:
+```
+If this error occurs, you have to create the environment before starting the pipeline.
+To create this environment you need the yml-file from the repository.
 Run the following commands to create the environment:
 ```console
 path=[Path to given masterenv.yml file]
 conda env create --name masterenv -f=$path
-source activate masterenv
 ```
+When the environment is created, set the variable 'path_env' in the configuration file to its path.
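For example, assuming the environment was created under a local conda installation (the path below is an assumption; adjust it to your system), the `path_env` entry in `nextflow.config` could look like this:
```
env {
    path_env = "/home/user/miniconda3/envs/masterenv"
}
```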
## Quick Start ```console From 0467d7b581680773dd23ea119cfd710897136182 Mon Sep 17 00:00:00 2001 From: Schultheis Date: Thu, 6 Dec 2018 11:31:00 +0100 Subject: [PATCH 062/158] added more cdhit parameter --- bin/cdhit_wrapper.R | 126 ++++++++++++++++++++++++++++++++------------ 1 file changed, 91 insertions(+), 35 deletions(-) diff --git a/bin/cdhit_wrapper.R b/bin/cdhit_wrapper.R index f26a71a..9c62564 100644 --- a/bin/cdhit_wrapper.R +++ b/bin/cdhit_wrapper.R @@ -3,53 +3,77 @@ library("optparse") option_list <- list( make_option(opt_str = c("-i", "--input"), default = NULL, help = "Input bed-file. Fourth column is expected to contain names, last column must be sequences.", metavar = "character"), - make_option(opt_str = c("-c", "--identity"), default = 0.8, help = "Identity threshold. Default = %default", metavar = "double >= 0.8"), - make_option(opt_str = c("-A", "--coverage"), default = 8, help = "Minimal alignment length for both sequences in nucelotides. Default = %default", metavar = "integer"), + make_option(opt_str = c("-c", "--identity"), default = 0.8, help = "Identity threshold. Default = %default (CD-HIT parameter)", metavar = "double >= 0.8"), + make_option(opt_str = c("-A", "--coverage"), default = 8, help = "Minimal alignment length for both sequences in nucelotides. Default = %default (CD-HIT parameter)", metavar = "integer"), make_option(opt_str = c("-o", "--output"), default = "cluster.bed", help = "Output file same as input but with appended column of cluster numbers. Default = %default", metavar = "character"), make_option(opt_str = c("--clean"), default = TRUE, help = "Delete all temporary files. Default = %default", metavar = "logical"), - # make_option(opt_str = c("-G", "--global"), default = FALSE), - # make_option(opt_str = c("-b", "--band_width"), default = 20), - # make_option(opt_str = c("-M", "--memory"), default = 800), - make_option(opt_str = c("-T", "--threads"), default = 1)#, - # make_option(opt_str = c("-n", "--word_length"), default = 3), - # make_option(opt_str = c("-l", "--throw_away_sequences"), default = 5), + make_option(opt_str = c("-G", "--global"), default = 0, help = "Global sequence identity = 1, local = 0. Default = %default (CD-HIT parameter)", metavar = "integer"), + make_option(opt_str = c("-b", "--band_width"), default = 20, help = "Alignment band width. Default = %default (CD-HIT parameter)", metavar = "integer"), + make_option(opt_str = c("-M", "--memory"), default = 800, help = "Memory limit in MB. 0 for unlimited. Default = %default (CD-HIT parameter)", metavar = "integer"), + make_option(opt_str = c("-T", "--threads"), default = 1, help = "Number of threads. Default = %default (CD-HIT parameter)", metavar = "integer"), + make_option(opt_str = c("-n", "--word_length"), default = 3, help = "Default = %default (CD-HIT parameter)", metavar = "integer"), + make_option(opt_str = c("-l", "--throw_away_sequences"), default = 5, help = "Maximum length of sequences thrown away. 
Default = %default (CD-HIT parameter)", metavar = "integer"), # make_option(opt_str = c("-d", "--description")), # can not produce bed if this is != 0 - # make_option(opt_str = c("-s", "--length_dif_cutoff_shorter_p"), default = 0.0), - # make_option(opt_str = c("-S", "--length_dif_cutoff_shorter_n"), default = 999999), - # make_option(opt_str = c("-aL", "--alignment_coverage_longer_p"), default = 0.0), - # make_option(opt_str = c("-AL", "--alignment_coverage_longer_n"), default = 99999999), - # make_option(opt_str = c("-aS", "--alignment_coverage_shorter_p"), default = 0.0), - # make_option(opt_str = c("-AS", "--alignment_coverage_shorter_n"), default = 99999999), - # make_option(opt_str = c("-uL", "--max_unmatched_longer_p"), default = 1.0), - # make_option(opt_str = c("-uS", "--max_unmatched_shorter_p"), default = 99999999), - # make_option(opt_str = c("-U", "--max_unmatched_both_n"), default = 99999999), - # make_option(opt_str = c("-g", "--fast_cluster"), default = FALSE), - # make_option(opt_str = c("-r", "--ignore_strand"), default = TRUE), - # make_option(opt_str = c("--match"), default = 2), - # make_option(opt_str = c("--mismatch"), default = -2), - # make_option(opt_str = c("--gap"), default = -6), - # make_option(opt_str = c("--gap_ext"), default = -1), - # make_option(opt_str = c("-sc", "--sort_cluster_by_size"), default = TRUE) - # TODO more args + make_option(opt_str = c("-s", "--length_dif_cutoff_shorter_p"), default = 0.0, help = "Shorter sequences length must be at least x percent of longer sequenecs length. Default = %default (CD-HIT parameter)", metavar = "double"), + make_option(opt_str = c("-S", "--length_dif_cutoff_shorter_n"), default = 999999, help = "Length difference between sequences can not be higher than x nucleotides. Default = %default (CD-HIT parameter)", metavar = "integer"), + make_option(opt_str = c("-aL", "--alignment_coverage_longer_p"), default = 0.0, help = "Alignment must cover x percent of longer sequence. Default = %default (CD-HIT parameter)", metavar = "double"), + make_option(opt_str = c("-AL", "--alignment_coverage_longer_n"), default = 99999999, help = "There can not be more than x unaligned nucleotides on the longer sequence. Default = %default (CD-HIT parameter)", metavar = "integer"), + make_option(opt_str = c("-aS", "--alignment_coverage_shorter_p"), default = 0.0, help = "Alignment must cover x percent of shorter sequence. Default = %default (CD-HIT parameter)", metavar = "double"), + make_option(opt_str = c("-AS", "--alignment_coverage_shorter_n"), default = 99999999, help = "There can not be more than x unaligned nucleotides on the longer sequence. Default = %default (CD-HIT parameter)", metavar = "integer"), + make_option(opt_str = c("-uL", "--max_unmatched_longer_p"), default = 1.0, help = "Maximum unmatched percentage on longer sequence (excludes leading tailing gap). Default = %default (CD-HIT parameter)", metavar = "double"), + make_option(opt_str = c("-uS", "--max_unmatched_shorter_p"), default = 1.0, help = "Maximum unmatched percentage on shorter sequence (excludes leading tailing gap). Default = %default (CD-HIT parameter)", metavar = "double"), + make_option(opt_str = c("-U", "--max_unmatched_both_n"), default = 99999999, help = "Maximum unmatched nucleotied on both sequences (excludes leading tailing gap). Default = %default (CD-HIT parameter)", metavar = "integer"), + make_option(opt_str = c("-g", "--fast_cluster"), default = 1, help = "Cluster sequence in first cluster that meets threshold = 0 (fast). 
Or cluster in best Cluster = 1 (accurate). Default = %default (CD-HIT parameter)", metavar = "integer"), + make_option(opt_str = c("-r", "--ignore_strand"), default = 0, help = "Align +/+ & +/- (= 1). Or align only +/+ (= 0). Default = %default (CD-HIT parameter)", metavar = "integer"), + make_option(opt_str = c("--match"), default = 2, help = "Matching score. Default = %default (CD-HIT parameter)", metavar = "integer"), + make_option(opt_str = c("--mismatch"), default = -2, help = "Mismatch score. Default = %default (CD-HIT parameter)", metavar = "integer"), + make_option(opt_str = c("--gap"), default = -6, help = "Gap score. Default = %default (CD-HIT parameter)", metavar = "integer"), + make_option(opt_str = c("--gap_ext"), default = -1, help = "Gap extension score. Default = %default (CD-HIT parameter)", metavar = "integer"), + make_option(opt_str = c("-sc", "--sort_cluster_by_size"), default = 1, help = "Either sort cluster by decreasing length (= 0) or by decreasing size (= 1). Default = %default (CD-HIT parameter)", metavar = "integer") ) opt_parser <- OptionParser(option_list = option_list, - description = "CD-HIT-EST Wrapper function.") + description = "CD-HIT-EST Wrapper function. See https://github.com/weizhongli/cdhit/wiki for more information regarding CD-HITs parameters.") opt <- parse_args(opt_parser) -#' cd-hit wrapper +#' cd-hit-est wrapper +#' Receives bed with sequences in last column and writes bed with added cluster numbers. #' #' @param input Data.table or file in bed format (requires names in fourth column and sequences in last column). -#' @param identity Identity threshold. -#' @param coverage Minimal alignment length for both sequences in nucelotides. -#' @param output Clustered bedfile. Adds cluster number in last column (lower number = bigger cluster). -#' @param clean Clean up after run. -#' @param threads Number of threads to use in cd-hit (0 = all cpus). +#' @param identity Identity threshold. Default = 0.8 (CD-HIT parameter) +#' @param coverage Minimal alignment length for both sequences in nucelotides. Default = 8 (CD-HIT parameter) +#' @param output Clustered bedfile. Adds cluster number in last column (numbering depend on sort_by_cluster_size parameter). Default = cluster.bed +#' @param clean Clean up after run. Default = TRUE +#' @param threads Number of threads to use (0 = all cores). Default = 1 (CD-HIT parameter) +#' @param global Global sequence identity = 1, local = 0. Default = 0 (CD-HIT parameter) +#' @param band_width "Alignment band width. Default = 20 (CD-HIT parameter)" +#' @param memory Memory limit in MB. 0 for unlimited. Default = 800 (CD-HIT parameter) +#' @param word_length Default = 3 (CD-HIT parameter) +#' @param throw_away_sequences Maximum length of sequences thrown away. Default = %default (CD-HIT parameter) +#' @param length_dif_cutoff_shorter_p Shorter sequences length must be at least x percent of longer sequenecs length. Default = 0 (CD-HIT parameter) +#' @param length_dif_cutoff_shorter_n Length difference between sequences can not be higher than x nucleotides. Default = 999999 (CD-HIT parameter) +#' @param alignment_coverage_longer_p Alignment must cover x percent of longer sequence. Default = 0 (CD-HIT parameter) +#' @param alignment_coverage_longer_n There can not be more than x unaligned nucleotides on the longer sequence. Default = 99999999 (CD-HIT parameter) +#' @param alignment_coverage_shorter_p Alignment must cover x percent of shorter sequence. 
Default = 0 (CD-HIT parameter) +#' @param alignment_coverage_shorter_n There can not be more than x unaligned nucleotides on the longer sequence. Default = 99999999 (CD-HIT parameter) +#' @param max_unmatched_longer_p Maximum unmatched percentage on longer sequence (excludes leading tailing gap). Default = 1 (CD-HIT parameter) +#' @param max_unmatched_shorter_p Maximum unmatched percentage on shorter sequence (excludes leading tailing gap). Default = 1 (CD-HIT parameter) +#' @param max_unmatched_both_n Maximum unmatched nucleotied on both sequences (excludes leading tailing gap). Default = 99999999 (CD-HIT parameter) +#' @param fast_cluster Cluster sequence in first cluster that meets threshold = 0 (fast). Or cluster in best Cluster = 1 (accurate). Default = 1 (CD-HIT parameter) +#' @param ignore_strand Align +/+ & +/- (= 1). Or align only +/+ (= 0). Default = 0 (CD-HIT parameter) +#' @param match Matching score. Default = 2 (CD-HIT parameter) +#' @param mismatch Mismatch score. Default = -2 (CD-HIT parameter) +#' @param gap Gap score. Default = -6 (CD-HIT parameter) +#' @param gat_ext Gap extension score. Default = -1 (CD-HIT parameter) +#' @param sort_cluster_by_size Either sort cluster by decreasing length (= 0) or by decreasing size (= 1). Default = 1 (CD-HIT parameter) #' -#' TODO add all cdhit parameter (necessary ?) #' TODO check whether cdhit is installed -cdhitest <- function(input, identity = 0.8, coverage = 8, output = "cluster.bed", clean = TRUE, threads = 1) { +cdhitest <- function(input, identity = 0.8, coverage = 8, output = "cluster.bed", clean = TRUE, threads = 1, global = 0, band_width = 20, memory = 800, word_length = 3, throw_away_sequences = 5, length_dif_cutoff_shorter_p = 0, length_dif_cutoff_shorter_n = 999999, alignment_coverage_longer_p = 0, alignment_coverage_longer_n = 99999999, alignment_coverage_shorter_p = 0, alignment_coverage_shorter_n = 99999999, max_unmatched_longer_p = 1, max_unmatched_shorter_p = 1, max_unmatched_both_n = 99999999, fast_cluster = 1, ignore_strand = 0, match = 2, mismatch = -2, gap = -6, gap_ext = -1, sort_cluster_by_size = 1) { + if (missing(input)) { + stop("Input parameter missing!") + } + message("Loading bed.") # load bed if neccessary if (!data.table::is.data.table(input)) { @@ -58,6 +82,12 @@ cdhitest <- function(input, identity = 0.8, coverage = 8, output = "cluster.bed" bed_table <- input } + # check for duplicated names + if (anyDuplicated(bed_table[, "V4"])) { + warning("Found duplicated names. 
Making names unique.") + bed_table[, V4 := make.unique(V4)] + } + message("Convert to fasta.") ### convert bed to fasta # 4th column = name @@ -68,7 +98,33 @@ cdhitest <- function(input, identity = 0.8, coverage = 8, output = "cluster.bed" message("Clustering.") ### cd-hit-est cdhit_output <- "cdhit_output" - cdhit_call <- paste("cd-hit-est -i", fasta_file, "-o", cdhit_output, "-c", identity, "-A", coverage, "-T ", threads, "-G 0 -n 3 -g 1 -r 0 -l 5 -sc 1 -d 0") + cdhit_call <- paste("cd-hit-est -i", fasta_file, + "-o", cdhit_output, + "-c", identity, + "-A", coverage, + "-T", threads, + "-G", global, + "-b", band_width, + "-M", memory, + "-n", word_length, + "-l", throw_away_sequences, + "-s", length_dif_cutoff_shorter_p, + "-S", length_dif_cutoff_shorter_n, + "-aL", alignment_coverage_longer_p, + "-AL", alignment_coverage_longer_n, + "-aS", alignment_coverage_shorter_p, + "-AS", alignment_coverage_shorter_n, + "-uL", max_unmatched_longer_p, + "-uS", max_unmatched_shorter_p, + "-U", max_unmatched_both_n, + "-g", fast_cluster, + "-r", ignore_strand, + "-match", match, + "-mismatch", mismatch, + "-gap", gap, + "-gap-ext", gap_ext, + "-sc", sort_cluster_by_size, + "-d 0") system(command = cdhit_call, wait = TRUE) From 4b44ca59e7b7cb0b06c44352eb15677cc5020511 Mon Sep 17 00:00:00 2001 From: Schultheis Date: Thu, 6 Dec 2018 12:15:38 +0100 Subject: [PATCH 063/158] changed type of motif_occurence to double --- bin/reduce_bed.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/reduce_bed.R b/bin/reduce_bed.R index c022b43..efe5bf5 100644 --- a/bin/reduce_bed.R +++ b/bin/reduce_bed.R @@ -11,7 +11,7 @@ option_list <- list( make_option(opt_str = c("-s", "--min_seq_length"), default = NULL, help = "Remove sequences below this length. Defaults to the maximum value of motif and kmer and can not be lower.", metavar = "integer", type = "integer"), make_option(opt_str = c("-ko", "--minoverlap_kmer"), default = NULL, help = "Minimum required overlap between kmer to merge kmer. Used to create reduced sequence ranges. Can not be greater than kmer length. Default = kmer - 1", metavar = "integer", type = "integer"), make_option(opt_str = c("-mo", "--minoverlap_motif"), default = NULL, help = "Minimum required overlap between motif and kmer to consider kmer significant. Used for kmer cutoff calculation. Can not be greater than motif and kmer length. Default = ceiling(motif / 2)", metavar = "integer", type = "integer"), - make_option(opt_str = c("-f", "--motif_occurence"), default = 1, help = "Number of motifs per sequence any value above 0. Default = %default.") + make_option(opt_str = c("-f", "--motif_occurence"), default = 1, help = "Number of motifs per sequence any value above 0. Default = %default.", metavar = "double") ) opt_parser <- OptionParser(option_list = option_list, From c6c42ca1e19e6a3f838a3960cd732e86ae96ff21 Mon Sep 17 00:00:00 2001 From: Schultheis Date: Thu, 6 Dec 2018 12:30:21 +0100 Subject: [PATCH 064/158] renamed ignore_strand to strand --- bin/cdhit_wrapper.R | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/bin/cdhit_wrapper.R b/bin/cdhit_wrapper.R index 9c62564..c9a8241 100644 --- a/bin/cdhit_wrapper.R +++ b/bin/cdhit_wrapper.R @@ -24,7 +24,7 @@ option_list <- list( make_option(opt_str = c("-uS", "--max_unmatched_shorter_p"), default = 1.0, help = "Maximum unmatched percentage on shorter sequence (excludes leading tailing gap). 
Default = %default (CD-HIT parameter)", metavar = "double"), make_option(opt_str = c("-U", "--max_unmatched_both_n"), default = 99999999, help = "Maximum unmatched nucleotied on both sequences (excludes leading tailing gap). Default = %default (CD-HIT parameter)", metavar = "integer"), make_option(opt_str = c("-g", "--fast_cluster"), default = 1, help = "Cluster sequence in first cluster that meets threshold = 0 (fast). Or cluster in best Cluster = 1 (accurate). Default = %default (CD-HIT parameter)", metavar = "integer"), - make_option(opt_str = c("-r", "--ignore_strand"), default = 0, help = "Align +/+ & +/- (= 1). Or align only +/+ (= 0). Default = %default (CD-HIT parameter)", metavar = "integer"), + make_option(opt_str = c("-r", "--strand"), default = 0, help = "Align +/+ & +/- (= 1). Or align only +/+ (= 0). Default = %default (CD-HIT parameter)", metavar = "integer"), make_option(opt_str = c("--match"), default = 2, help = "Matching score. Default = %default (CD-HIT parameter)", metavar = "integer"), make_option(opt_str = c("--mismatch"), default = -2, help = "Mismatch score. Default = %default (CD-HIT parameter)", metavar = "integer"), make_option(opt_str = c("--gap"), default = -6, help = "Gap score. Default = %default (CD-HIT parameter)", metavar = "integer"), @@ -61,7 +61,7 @@ opt <- parse_args(opt_parser) #' @param max_unmatched_shorter_p Maximum unmatched percentage on shorter sequence (excludes leading tailing gap). Default = 1 (CD-HIT parameter) #' @param max_unmatched_both_n Maximum unmatched nucleotied on both sequences (excludes leading tailing gap). Default = 99999999 (CD-HIT parameter) #' @param fast_cluster Cluster sequence in first cluster that meets threshold = 0 (fast). Or cluster in best Cluster = 1 (accurate). Default = 1 (CD-HIT parameter) -#' @param ignore_strand Align +/+ & +/- (= 1). Or align only +/+ (= 0). Default = 0 (CD-HIT parameter) +#' @param strand Align +/+ & +/- (= 1). Or align only +/+ (= 0). Default = 0 (CD-HIT parameter) #' @param match Matching score. Default = 2 (CD-HIT parameter) #' @param mismatch Mismatch score. Default = -2 (CD-HIT parameter) #' @param gap Gap score. Default = -6 (CD-HIT parameter) @@ -69,7 +69,7 @@ opt <- parse_args(opt_parser) #' @param sort_cluster_by_size Either sort cluster by decreasing length (= 0) or by decreasing size (= 1). 
Default = 1 (CD-HIT parameter) #' #' TODO check whether cdhit is installed -cdhitest <- function(input, identity = 0.8, coverage = 8, output = "cluster.bed", clean = TRUE, threads = 1, global = 0, band_width = 20, memory = 800, word_length = 3, throw_away_sequences = 5, length_dif_cutoff_shorter_p = 0, length_dif_cutoff_shorter_n = 999999, alignment_coverage_longer_p = 0, alignment_coverage_longer_n = 99999999, alignment_coverage_shorter_p = 0, alignment_coverage_shorter_n = 99999999, max_unmatched_longer_p = 1, max_unmatched_shorter_p = 1, max_unmatched_both_n = 99999999, fast_cluster = 1, ignore_strand = 0, match = 2, mismatch = -2, gap = -6, gap_ext = -1, sort_cluster_by_size = 1) { +cdhitest <- function(input, identity = 0.8, coverage = 8, output = "cluster.bed", clean = TRUE, threads = 1, global = 0, band_width = 20, memory = 800, word_length = 3, throw_away_sequences = 5, length_dif_cutoff_shorter_p = 0, length_dif_cutoff_shorter_n = 999999, alignment_coverage_longer_p = 0, alignment_coverage_longer_n = 99999999, alignment_coverage_shorter_p = 0, alignment_coverage_shorter_n = 99999999, max_unmatched_longer_p = 1, max_unmatched_shorter_p = 1, max_unmatched_both_n = 99999999, fast_cluster = 1, strand = 0, match = 2, mismatch = -2, gap = -6, gap_ext = -1, sort_cluster_by_size = 1) { if (missing(input)) { stop("Input parameter missing!") } @@ -118,7 +118,7 @@ cdhitest <- function(input, identity = 0.8, coverage = 8, output = "cluster.bed" "-uS", max_unmatched_shorter_p, "-U", max_unmatched_both_n, "-g", fast_cluster, - "-r", ignore_strand, + "-r", strand, "-match", match, "-mismatch", mismatch, "-gap", gap, From b769d49f6506b47915bf32821a00aed1603f9f82 Mon Sep 17 00:00:00 2001 From: Schultheis Date: Thu, 6 Dec 2018 12:43:58 +0100 Subject: [PATCH 065/158] new clustering parameter added --- config/cluster.config | 14 ++++++++++++-- pipeline.nf | 4 ++-- 2 files changed, 14 insertions(+), 4 deletions(-) diff --git a/config/cluster.config b/config/cluster.config index 3dce063..2a1828c 100644 --- a/config/cluster.config +++ b/config/cluster.config @@ -1,7 +1,17 @@ params{ - sequence_coverage=8 + threads=1 + //reduce_bed kmer=10 aprox_motif_len=10 - threads=1 + motif_occurence=1 + min_seq_length=10 + + //cdhit_wrapper + global=0 + identity=0.8 + sequence_coverage=8 + memory=800 + throw_away_seq=9 + strand=0 } diff --git a/pipeline.nf b/pipeline.nf index 5ed2eac..1fabdc3 100644 --- a/pipeline.nf +++ b/pipeline.nf @@ -75,7 +75,7 @@ process reduce_bed { script: """ - Rscript ${path_bin}/reduce_bed.R -i ${bed} -k ${params.kmer} -m ${params.aprox_motif_len} -o ${name}_reduced.bed -t ${params.threads} + Rscript ${path_bin}/reduce_bed.R -i ${bed} -k ${params.kmer} -m ${params.aprox_motif_len} -o ${name}_reduced.bed -t ${params.threads} -f ${params.motif_occurence} -s ${params.min_seq_length} """ } @@ -91,7 +91,7 @@ process clustering { script: """ - Rscript ${path_bin}/cdhit_wrapper.R -i ${bed} -A ${params.sequence_coverage} -o ${name}_clusterd.bed + Rscript ${path_bin}/cdhit_wrapper.R -i ${bed} -A ${params.sequence_coverage} -o ${name}_clusterd.bed -c ${params.identity} -G ${params.global} -M ${params.memory} -l ${params.throw_away_seq} -r ${params.strand} -T ${params.threads} """ } From 2700ce7904b7594d7b52553cb291e3fb4af6a75e Mon Sep 17 00:00:00 2001 From: renewiegandt Date: Thu, 6 Dec 2018 13:14:46 +0100 Subject: [PATCH 066/158] Update README.md --- README.md | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index f36beca..994029a 100644 --- 
a/README.md
+++ b/README.md
@@ -8,7 +8,14 @@ De novo motif discovery and evaluation based on footprints identified by TOBIAS
 * [MEME-Suite](http://meme-suite.org/doc/install.html?man_type=web)

 ## Installation
-Start with installing all dependencies listed above.
+Start with installing all dependencies listed above. It is required to set the [environment paths for meme-suite](http://meme-suite.org/doc/install.html?man_type=web#installingtar).
+This can be done with the following commands:
+```
+export PATH=[meme-suite installation path]/libexec/meme-[meme-suite version]:$PATH
+export PATH=[meme-suite installation path]/bin:$PATH
+```
+
+
 Download all files from the [GitHub repository](https://github.molgen.mpg.de/loosolab/masterJLU2018).
 The Nextflow script needs a conda environment to run. Nextflow can create the needed environment from the given yml-file.
 On some systems Nextflow exits the run with the following error:

From 1a457039b1aff1cff51256fd4117b4a9cbd74866 Mon Sep 17 00:00:00 2001
From: Schultheis
Date: Thu, 6 Dec 2018 13:55:35 +0100
Subject: [PATCH 067/158] fixed ambiguous flags

---
 bin/cdhit_wrapper.R | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/bin/cdhit_wrapper.R b/bin/cdhit_wrapper.R
index c9a8241..e5ed3e4 100644
--- a/bin/cdhit_wrapper.R
+++ b/bin/cdhit_wrapper.R
@@ -16,12 +16,12 @@ option_list <- list(
   # make_option(opt_str = c("-d", "--description")), # can not produce bed if this is != 0
   make_option(opt_str = c("-s", "--length_dif_cutoff_shorter_p"), default = 0.0, help = "Shorter sequences length must be at least x percent of longer sequences length. Default = %default (CD-HIT parameter)", metavar = "double"),
   make_option(opt_str = c("-S", "--length_dif_cutoff_shorter_n"), default = 999999, help = "Length difference between sequences can not be higher than x nucleotides. Default = %default (CD-HIT parameter)", metavar = "integer"),
-  make_option(opt_str = c("-aL", "--alignment_coverage_longer_p"), default = 0.0, help = "Alignment must cover x percent of longer sequence. Default = %default (CD-HIT parameter)", metavar = "double"),
-  make_option(opt_str = c("-AL", "--alignment_coverage_longer_n"), default = 99999999, help = "There can not be more than x unaligned nucleotides on the longer sequence. Default = %default (CD-HIT parameter)", metavar = "integer"),
-  make_option(opt_str = c("-aS", "--alignment_coverage_shorter_p"), default = 0.0, help = "Alignment must cover x percent of shorter sequence. Default = %default (CD-HIT parameter)", metavar = "double"),
-  make_option(opt_str = c("-AS", "--alignment_coverage_shorter_n"), default = 99999999, help = "There can not be more than x unaligned nucleotides on the longer sequence. Default = %default (CD-HIT parameter)", metavar = "integer"),
-  make_option(opt_str = c("-uL", "--max_unmatched_longer_p"), default = 1.0, help = "Maximum unmatched percentage on longer sequence (excludes leading tailing gap). Default = %default (CD-HIT parameter)", metavar = "double"),
-  make_option(opt_str = c("-uS", "--max_unmatched_shorter_p"), default = 1.0, help = "Maximum unmatched percentage on shorter sequence (excludes leading tailing gap). Default = %default (CD-HIT parameter)", metavar = "double"),
+  make_option(opt_str = c("-e", "--alignment_coverage_longer_p"), default = 0.0, help = "Alignment must cover x percent of longer sequence. 
Default = %default (CD-HIT parameter: -aL)", metavar = "double"), + make_option(opt_str = c("-E", "--alignment_coverage_longer_n"), default = 99999999, help = "There can not be more than x unaligned nucleotides on the longer sequence. Default = %default (CD-HIT parameter: -AL)", metavar = "integer"), + make_option(opt_str = c("-f", "--alignment_coverage_shorter_p"), default = 0.0, help = "Alignment must cover x percent of shorter sequence. Default = %default (CD-HIT parameter: -aS)", metavar = "double"), + make_option(opt_str = c("-F", "--alignment_coverage_shorter_n"), default = 99999999, help = "There can not be more than x unaligned nucleotides on the longer sequence. Default = %default (CD-HIT parameter: -AS)", metavar = "integer"), + make_option(opt_str = c("-w", "--max_unmatched_longer_p"), default = 1.0, help = "Maximum unmatched percentage on longer sequence (excludes leading tailing gap). Default = %default (CD-HIT parameter: -uL)", metavar = "double"), + make_option(opt_str = c("-W", "--max_unmatched_shorter_p"), default = 1.0, help = "Maximum unmatched percentage on shorter sequence (excludes leading tailing gap). Default = %default (CD-HIT parameter: -uS)", metavar = "double"), make_option(opt_str = c("-U", "--max_unmatched_both_n"), default = 99999999, help = "Maximum unmatched nucleotied on both sequences (excludes leading tailing gap). Default = %default (CD-HIT parameter)", metavar = "integer"), make_option(opt_str = c("-g", "--fast_cluster"), default = 1, help = "Cluster sequence in first cluster that meets threshold = 0 (fast). Or cluster in best Cluster = 1 (accurate). Default = %default (CD-HIT parameter)", metavar = "integer"), make_option(opt_str = c("-r", "--strand"), default = 0, help = "Align +/+ & +/- (= 1). Or align only +/+ (= 0). Default = %default (CD-HIT parameter)", metavar = "integer"), @@ -29,7 +29,7 @@ option_list <- list( make_option(opt_str = c("--mismatch"), default = -2, help = "Mismatch score. Default = %default (CD-HIT parameter)", metavar = "integer"), make_option(opt_str = c("--gap"), default = -6, help = "Gap score. Default = %default (CD-HIT parameter)", metavar = "integer"), make_option(opt_str = c("--gap_ext"), default = -1, help = "Gap extension score. Default = %default (CD-HIT parameter)", metavar = "integer"), - make_option(opt_str = c("-sc", "--sort_cluster_by_size"), default = 1, help = "Either sort cluster by decreasing length (= 0) or by decreasing size (= 1). Default = %default (CD-HIT parameter)", metavar = "integer") + make_option(opt_str = c("-x", "--sort_cluster_by_size"), default = 1, help = "Either sort cluster by decreasing length (= 0) or by decreasing size (= 1). Default = %default (CD-HIT parameter: -sc)", metavar = "integer") ) opt_parser <- OptionParser(option_list = option_list, From cd53d3c4dc58f4eaadfdbe556dbb8532c2ea46af Mon Sep 17 00:00:00 2001 From: Schultheis Date: Thu, 6 Dec 2018 13:56:20 +0100 Subject: [PATCH 068/158] fixed ambiguous flags + nameing bug in find_kmer_regions --- bin/reduce_bed.R | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/bin/reduce_bed.R b/bin/reduce_bed.R index efe5bf5..2f8e8a2 100644 --- a/bin/reduce_bed.R +++ b/bin/reduce_bed.R @@ -9,8 +9,8 @@ option_list <- list( make_option(opt_str = c("-t", "--threads"), default = 1, help = "Number of threads to use. Use 0 for all available cores. Default = %default", metavar = "integer"), make_option(opt_str = c("-c", "--clean"), default = TRUE, help = "Delete all temporary files. 
Default = %default", metavar = "logical"), make_option(opt_str = c("-s", "--min_seq_length"), default = NULL, help = "Remove sequences below this length. Defaults to the maximum value of motif and kmer and can not be lower.", metavar = "integer", type = "integer"), - make_option(opt_str = c("-ko", "--minoverlap_kmer"), default = NULL, help = "Minimum required overlap between kmer to merge kmer. Used to create reduced sequence ranges. Can not be greater than kmer length. Default = kmer - 1", metavar = "integer", type = "integer"), - make_option(opt_str = c("-mo", "--minoverlap_motif"), default = NULL, help = "Minimum required overlap between motif and kmer to consider kmer significant. Used for kmer cutoff calculation. Can not be greater than motif and kmer length. Default = ceiling(motif / 2)", metavar = "integer", type = "integer"), + make_option(opt_str = c("-n", "--minoverlap_kmer"), default = NULL, help = "Minimum required overlap between kmer to merge kmer. Used to create reduced sequence ranges. Can not be greater than kmer length. Default = kmer - 1", metavar = "integer", type = "integer"), + make_option(opt_str = c("-v", "--minoverlap_motif"), default = NULL, help = "Minimum required overlap between motif and kmer to consider kmer significant. Used for kmer cutoff calculation. Can not be greater than motif and kmer length. Default = ceiling(motif / 2)", metavar = "integer", type = "integer"), make_option(opt_str = c("-f", "--motif_occurence"), default = 1, help = "Number of motifs per sequence any value above 0. Default = %default.", metavar = "double") ) @@ -196,6 +196,7 @@ find_kmer_regions <- function(bed, kmer_counts, minoverlap = 1 , threads = NULL) seq_ranges <- pbapply::pblapply(seq_len(nrow(bed)), cl = threads, function(x) { seq <- bed[x][[ncol(bed)]] + name <- bed[x][[4]] #### locate ranges ranges <- data.table::data.table(do.call(rbind, stringi::stri_locate_all_fixed(seq, pattern = kmer_counts[[1]]))) @@ -203,7 +204,7 @@ find_kmer_regions <- function(bed, kmer_counts, minoverlap = 1 , threads = NULL) ranges <- na.omit(ranges, cols = c("start", "end")) if (nrow(ranges) < 1) { - return(data.table::data.table(start = NA, end = NA, width = NA)) + return(data.table::data.table(start = NA, end = NA, width = NA, name = name)) } # add kmer sequences @@ -237,14 +238,13 @@ find_kmer_regions <- function(bed, kmer_counts, minoverlap = 1 , threads = NULL) # reduce selected ranges reduced_ranges <- IRanges::reduce(reduced_ranges[selected_ranges]) - reduced_ranges <- data.table::as.data.table(reduced_ranges) + reduced_ranges <- data.table::as.data.table(reduced_ranges)[, name := name] return(reduced_ranges) }) # create ranges table conserved_regions_table <- data.table::rbindlist(seq_ranges) - conserved_regions_table[, name := bed[[4]]] return(conserved_regions_table) } From e68a89dcabd401a8108c86be987433b55acaebce Mon Sep 17 00:00:00 2001 From: Schultheis Date: Thu, 6 Dec 2018 13:57:27 +0100 Subject: [PATCH 069/158] removed param threads --- config/cluster.config | 2 -- 1 file changed, 2 deletions(-) diff --git a/config/cluster.config b/config/cluster.config index 2a1828c..1b444e2 100644 --- a/config/cluster.config +++ b/config/cluster.config @@ -1,6 +1,4 @@ params{ - threads=1 - //reduce_bed kmer=10 aprox_motif_len=10 From 92baa4d08228b23ff96972a0cbc9a54c3c92a59d Mon Sep 17 00:00:00 2001 From: Schultheis Date: Thu, 6 Dec 2018 14:32:46 +0100 Subject: [PATCH 070/158] added parameter to list and description --- pipeline.nf | 33 ++++++++++++++++++++++++++------- 1 file changed, 26 
insertions(+), 7 deletions(-) diff --git a/pipeline.nf b/pipeline.nf index 0ebc301..f12edad 100644 --- a/pipeline.nf +++ b/pipeline.nf @@ -23,10 +23,19 @@ params.config="" params.max_size_fp=100 //clustering - params.sequence_coverage=8 - - params.kmer=10 - params.aprox_motif_len=10 + //reduce_bed + params.kmer=10 + params.aprox_motif_len=10 + params.motif_occurence=1 + params.min_seq_length=10 + + //cdhit_wrapper + params.global=0 + params.identity=0.8 + params.sequence_coverage=8 + params.memory=800 + params.throw_away_seq=9 + params.strand=0 //motif_estimation //bed_to_clustered_fasta @@ -67,9 +76,19 @@ Optional arguments: --max_size_fp INT (Default: 100) Clustering: - --sequence_coverage INT (Default: 8) - --kmer INT (Default: 10) - --aprox_motif_len INT (Default: 10) + Sequence preparation/ reduction: + --kmer INT Kmer length (Default: 10) + --aprox_motif_len INT Motif length (Default: 10) + --motif_occurence FLOAT Percentage of motifs over all sequences. Use 1 (Default) to assume every sequence contains a motif. + --min_seq_length INT Remove all sequences below this value. (Default: 10) + + Clustering: + --global INT Global (=1) or local (=0) alignment. (Default: 0) + --identity FLOAT Identity threshold. (Default: 0.8) + --sequence_coverage INT Minimum aligned nucleotides on both sequences. (Default: 8) + --memory INT Memory limit in MB. 0 for unlimited. (Default: 800) + --throw_away_seq INT Remove all sequences equal or below this length before clustering. (Default: 9) + --strand INT Align +/+ & +/- (= 1). Or align only +/+ (= 0). (Default: 0) Motif estimation: --motif_min_len INT Minimum length of Motif (Default: 8) From 9b09e5a6e736a1fd5c178af81afad714fa784575 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ren=C3=A9=20Wiegandt?= Date: Thu, 6 Dec 2018 08:40:02 -0500 Subject: [PATCH 071/158] minor changes --- masterenv.yml | 5 +---- nextflow.config | 19 ++++++++++--------- pipeline.nf | 32 ++++++++++++++++++++++++-------- 3 files changed, 35 insertions(+), 21 deletions(-) diff --git a/masterenv.yml b/masterenv.yml index b25f948..a977fe1 100644 --- a/masterenv.yml +++ b/masterenv.yml @@ -5,11 +5,9 @@ channels: - conda-forge dependencies: - bedtools - - python - r-seqinr - numpy - pybigWig - #clustering - cd-hit - jellyfish - r-base>=3.5.1 @@ -20,11 +18,10 @@ dependencies: - r-stringr - r-optparse - bioconductor-iranges -# - snakemake - meme - moods - biopython - pybedtools - matplotlib - - seaborn + - seaborn \ No newline at end of file diff --git a/nextflow.config b/nextflow.config index a3c06eb..47dc610 100644 --- a/nextflow.config +++ b/nextflow.config @@ -1,14 +1,15 @@ +wd = "/mnt/agnerds/Rene.Wiegandt/10_Master/masterJLU2018" createTimeout = 60 params.threads=1 //Parameter for for scripts! Not for nextflow processes. 
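// Aside (illustration, not part of the patch): the wd variable added above exists
// because of Groovy string rules, which Nextflow config files inherit. Single-quoted
// strings are never interpolated, so a line like
//   includeConfig '$workflow.projectDir/config/peak_calling.config'
// points at a literal path beginning with "$workflow.projectDir". Double-quoted
// strings interpolate as intended:
//   includeConfig "${wd}/config/peak_calling.config"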
-params.config="${workflow.projectDir}/config/uropa.config" - -includeConfig '$workflow.projectDir/config/peak_calling.config' -includeConfig '$workflow.projectDir/config/filter_unknown_motifs.config' -includeConfig '$workflow.projectDir/config/cluster.config' -includeConfig '$workflow.projectDir/config/motif_estimation.config' -includeConfig '$workflow.projectDir/config/create_gtf.config' +params.config="${wd}/config/uropa.config" env { - path_bin = $workflow.projectDir/bin - path_env = $workflow.projectDir/masterenv.yml + path_bin = "${wd}/bin" + path_env = "${wd}/masterenv.yml" } + +includeConfig "${wd}/config/peak_calling.config" +includeConfig "${wd}/config/filter_unknown_motifs.config" +includeConfig "${wd}/config/cluster.config" +includeConfig "${wd}/config/motif_estimation.config" +includeConfig "${wd}/config/create_gtf.config" diff --git a/pipeline.nf b/pipeline.nf index e694ba0..13c6718 100644 --- a/pipeline.nf +++ b/pipeline.nf @@ -86,9 +86,10 @@ All arguments can be set in the configuration files. } -bigwig_input.combine(bed_input).set{footprint_in} - +bigwig_input.combine(bed_input).into {footprint_in} +/* +*/ process footprint_extraction { conda "${path_env}" @@ -103,12 +104,16 @@ process footprint_extraction { set name, file ('*.bed') into bed_for_overlap_with_TFBS script: + print(bigWig) """ python ${path_bin}/call_peaks.py --bigwig ${bigWig} --bed ${bed} --output_file ${name}_called_peaks.bed --window_length ${params.window_length} --step ${params.step} --percentage ${params.percentage} """ } //Abfrage ob ausgeführt werden muss. +/* + +*/ process extract_known_TFBS { conda "${path_env}" @@ -129,6 +134,8 @@ process extract_known_TFBS { bed_for_overlap_with_TFBS.combine(known_TFBS_for_overlap).combine(fa_overlap_2).set {for_overlap} +/* +*/ process overlap_with_known_TFBS { conda "${path_env}" @@ -146,6 +153,8 @@ process overlap_with_known_TFBS { } +/* +*/ process reduce_bed { conda "${path_env}" @@ -162,6 +171,8 @@ process reduce_bed { } +/* +*/ process clustering { conda "${path_env}" @@ -178,7 +189,9 @@ process clustering { } -// Converting BED-File to one FASTA-File per cluster +/* +Converting BED-File to one FASTA-File per cluster +*/ process bed_to_clustered_fasta { conda "${path_env}" @@ -202,8 +215,10 @@ process bed_to_clustered_fasta { fasta_for_glam2 = fasta_for_glam2.flatten().map {it -> [it.simpleName, it]} -//Running GLAM2 on FASTA-files. -//Generating Motifs through alignment and scoring best local matches. +/* +Running GLAM2 on FASTA-files. +Generating Motifs through alignment and scoring best local matches. +*/ process glam2 { conda "${path_env}" @@ -221,9 +236,10 @@ process glam2 { """ } - -//Running Tomtom on meme-files generated by GLAM2. -//Tomtom searches motifs in databases. +/* +Running Tomtom on meme-files generated by GLAM2. +Tomtom searches motifs in databases. +*/ process tomtom { conda "${path_env}" From a8377ce8dce9c8b5b89524ff1d089bd800574f90 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ren=C3=A9=20Wiegandt?= Date: Thu, 6 Dec 2018 08:41:33 -0500 Subject: [PATCH 072/158] removed debug code --- pipeline.nf | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pipeline.nf b/pipeline.nf index ff94d9d..00c23a1 100644 --- a/pipeline.nf +++ b/pipeline.nf @@ -81,7 +81,7 @@ Optional arguments: --aprox_motif_len INT Motif length (Default: 10) --motif_occurence FLOAT Percentage of motifs over all sequences. Use 1 (Default) to assume every sequence contains a motif. --min_seq_length INT Remove all sequences below this value. 
(Default: 10)
-
+
   Clustering:
     --global INT Global (=1) or local (=0) alignment. (Default: 0)
     --identity FLOAT Identity threshold. (Default: 0.8)
     --sequence_coverage INT Minimum aligned nucleotides on both sequences. (Default: 8)
     --memory INT Memory limit in MB. 0 for unlimited. (Default: 800)
     --throw_away_seq INT Remove all sequences equal or below this length before clustering. (Default: 9)
     --strand INT Align +/+ & +/- (= 1). Or align only +/+ (= 0). (Default: 0)
@@ -123,7 +123,6 @@ process footprint_extraction {
 set name, file ('*.bed') into bed_for_overlap_with_TFBS

 script:
- print(bigWig)
 """
 python ${path_bin}/call_peaks.py --bigwig ${bigWig} --bed ${bed} --output_file ${name}_called_peaks.bed --window_length ${params.window_length} --step ${params.step} --percentage ${params.percentage}
 """

From 224b6eef2e8f00e47a3db230d6ef67606de97222 Mon Sep 17 00:00:00 2001
From: JannikHamp
Date: Thu, 6 Dec 2018 14:41:38 +0100
Subject: [PATCH 073/158] bugfixing

---
 bin/compareBed.sh | 127 ++++++++++++++++++++++++++++++++++++++--------
 bin/maxScore.R    |   7 +--
 bin/merge.R       |  23 ++++++---
 3 files changed, 127 insertions(+), 30 deletions(-)

diff --git a/bin/compareBed.sh b/bin/compareBed.sh
index bcf52fb..22c288a 100644
--- a/bin/compareBed.sh
+++ b/bin/compareBed.sh
@@ -100,19 +100,20 @@ then
 echo "Required arguments are data and motifs, both in bed-format, and the fasta genome sequences."
 echo "If a parameter is chosen, a value must be provided or an error will occur."
 echo "--------------------"
- echo "usage: compareBed.sh --data \ --motifs \ --fasta \ \[optional_parameter \\] ..."
+ echo "usage: compareBed.sh --data --motifs --fasta [optional_parameter ] ..."
 echo "--------------------"
 echo " required parameter:"
 echo " -d --data the path to the .bed file of the footprints"
- echo " -m --motifs the path to the .bed file of the scanned motifs"
+ echo " -m --motifs Either the path to the directory where the .bed files of the scanned motifs are stored"
+ echo " Or a comma separated list of files, e.g.: path1/file1,path2/file2,path3/file3"
 echo " -f --fasta the path to the .fasta file of the genome"
 echo " "
 echo " optional parameter:"
 echo " -w --workdir the path to the directory where temporary files will be created"
 echo " default is a new directory that is created in the current directory"
- echo " -min --min minimum size of footprints\; default is 10"
- echo " -max --max maximum size of footprints\; default is 60"
- echo " -o --output output path/file \; default dir is workdir and filename is newFootprints.bed and newFootprints.bed.fasta"
+ echo " -min --min minimum size of footprints; default is 10"
+ echo " -max --max maximum size of footprints; default is 80"
+ echo " -o --output output path/file ; default dir is workdir and filename is newMotifs.bed and newMotifs.bed.fasta"
 echo " -h --help shows this help message"
 exit 0
 fi
@@ -137,17 +138,17 @@ fi

 if [ $wo == false ]
 then
- if [ ! -d workdir1337 ]
- then
- mkdir workdir1337
- fi
+ #if [ ! -d workdir1337 ]
+ #then
+ # mkdir workdir1337
+ #fi
 wo=true
- workdir="workdir1337"
+ workdir="."
 fi

 if [ $ou == false ]
 then
- output="newFootprints.bed"
+ output=${workdir}/"newMotifs.bed"
 ou=true
 fi
@@ -159,25 +160,109 @@ fi

 if [ $ma == false ]
 then
- max=60
+ max=80
 ma=true
 fi
+
+if [ ! -d $workdir ]
+then
+ mkdir $workdir
+fi
+
 #1. first filter. no overlap vs.
overlap -bedtools intersect -v -a $data -b $motifs > "$workdir"/pass1Tr.bed -bedtools intersect -wa -a $data -b $motifs > "$workdir"/pass1Fa.bed +echo get sequences with no overlap +cat $data > "$workdir"/pass1Tr.bed +help=true + +if [ -d "$motifs" ] +then +for i in "$motifs"/*.bed +do + if [ $help == true ] + then + help=false + bedtools intersect -v -a "$workdir"/pass1Tr.bed -b $i > "$workdir"/pass1TrHelp.bed + else + help=true + bedtools intersect -v -a "$workdir"/pass1TrHelp.bed -b $i > "$workdir"/pass1Tr.bed + fi + echo $i +done +else +declare -a motiffiles=(`echo $motifs | sed 's/,/ /g'`) +for i in ${motiffiles[@]} +do + if [ $help == true ] + then + help=false + bedtools intersect -v -a "$workdir"/pass1Tr.bed -b $i > "$workdir"/pass1TrHelp.bed + else + help=true + bedtools intersect -v -a "$workdir"/pass1TrHelp.bed -b $i > "$workdir"/pass1Tr.bed + fi + echo $i +done +fi +if [ $help == false ] +then + cat "$workdir"/pass1TrHelp.bed > "$workdir"/pass1Tr.bed +fi +echo get overlapping sequences +bedtools intersect -v -a $data -b "$workdir"/pass1Tr.bed > "$workdir"/pass1Fa.bed +cat "$workdir"/pass1Fa.bed | wc -l + +echo calc maxScore #2. compute absolut maxscore position -Rscript --vanilla maxScore.R "$workdir"/pass1Fa.bed +Rscript --vanilla /bin/maxScore.R "$workdir"/pass1Fa.bed + +#3 subtract overlapping regions for additional motifs +echo "also get subsequences with no overlap of overlapping sequences" +help=true -#3. subtract overlapping regions for additional motifs -bedtools subtract -a "$workdir"/pass1Fa.bed -b $motifs > "$workdir"/pass2Tr.bed +if [ -d $"motifs" ] +then +for i in "$motifs"/*.bed +do + if [ $help == true ] + then + help=false + bedtools subtract -a "$workdir"/pass1Fa.bed -b $i > "$workdir"/pass1FaHelp.bed + else + help=true + bedtools subtract -a "$workdir"/pass1FaHelp.bed -b $i > "$workdir"/pass1Fa.bed + fi + echo $i +done +else +for i in ${motiffiles[@]} +do + if [ $help == true ] + then + help=false + bedtools subtract -a "$workdir"/pass1Fa.bed -b $i > "$workdir"/pass1FaHelp.bed + else + help=true + bedtools subtract -a "$workdir"/pass1FaHelp.bed -b $i > "$workdir"/pass1Fa.bed + fi + echo $i +done +fi + + +if [ $help == false ] +then + cat "$workdir"/pass1FaHelp.bed > "$workdir"/pass2Tr.bed +else + cat "$workdir"/pass1Fa.bed > "$workdir"/pass2Tr.bed +fi #4. remove short/long motivs, make unique ids (relevant for some splitted tfbs from subtract) and handle maxScorePosition -Rscript --vanilla merge.R $min $max "$workdir" +Rscript --vanilla /bin/merge.R $min $max "$workdir" $data #5. 
add fasta sequences to bed and create fasta file -bedtools getfasta -fi $fasta -bed "$workdir"/merged.bed -bedOut > "$workdir"/"$output" -bedtools getfasta -name -fi $fasta -bed "$workdir"/"$output" -fo "$workdir"/"$output".fasta +bedtools getfasta -fi $fasta -bed "$workdir"/merged.bed -bedOut > "$output" +bedtools getfasta -name -fi $fasta -bed "$output" -fo "$output".fasta #6 clean up -rm "$workdir"/pass1Fa.bed "$workdir"/pass1Tr.bed "$workdir"/pass2Tr.bed "$workdir"/merged.bed \ No newline at end of file +#rm "$workdir"/pass1Fa.bed "$workdir"/pass1Tr.bed "$workdir"/pass2Tr.bed "$workdir"/merged.bed "$workdir"/pass1FaHelp.bed "$workdir"/pass1TrHelp.bed \ No newline at end of file diff --git a/bin/maxScore.R b/bin/maxScore.R index 4ea545b..4347af2 100644 --- a/bin/maxScore.R +++ b/bin/maxScore.R @@ -1,9 +1,10 @@ -#!/home/jhamp/.conda/envs/tfbs/bin/Rscript +#!/bin/Rscript +library(data.table) args = commandArgs(TRUE) file = args[1] -tab = read.table(file, header=FALSE) -colnames(tab) = c("chromosome", "start", "stop", "id", "score", "maxpos", "length") +tab = fread(file) +colnames(tab) = c("chromosome", "start", "stop", "id", "score", "length", "maxpos", "info") tab$maxpos = tab$start + tab$maxpos write.table(tab, file, row.names=FALSE, col.names=FALSE, quote=FALSE, sep='\t') diff --git a/bin/merge.R b/bin/merge.R index bbe3c9a..728c9cf 100644 --- a/bin/merge.R +++ b/bin/merge.R @@ -1,12 +1,13 @@ -#!/home/jhamp/.conda/envs/tfbs/bin/Rscript +#!/bin/Rscript +library(data.table) args=commandArgs(TRUE) min=as.numeric(args[1]) max=as.numeric(args[2]) folder=args[3] -splitted = read.table(paste(folder, "/pass2Tr.bed", sep=''), header=FALSE) -colnames(splitted) = c("chromosome", "start", "stop", "id", "score", "maxpos", "length") -p1 = read.table(paste(folder, "/pass1Tr.bed", sep=''), header=FALSE) -colnames(p1) = c("chromosome", "start", "stop", "id", "score", "maxpos", "length") +splitted = fread(paste(folder, "/pass2Tr.bed", sep='')) +colnames(splitted) = c("chromosome", "start", "stop", "id", "score", "length", "maxpos", "info") +p1 = fread(paste(folder, "/pass1Tr.bed", sep='')) +colnames(p1) = c("chromosome", "start", "stop", "id", "score", "length", "maxpos", "info") p1$maxpos = p1$start + p1$maxpos splitted=rbind(splitted, p1) @@ -19,5 +20,15 @@ splitted$length=splitted$stop - splitted$start splitted=cbind(splitted, containsMaxpos=0) splitted$containsMaxpos[intersect(which(splitted$start <= splitted$maxpos), which(splitted$stop > splitted$maxpos))] = 1 splitted$maxpos = splitted$maxpos - splitted$start -write.table(splitted, paste(folder, "/merged.bed", sep=''), row.names=FALSE, col.names=FALSE, quote=FALSE, sep='\t') +fwrite(splitted, paste(folder, "/merged.bed", sep=''), row.names=FALSE, col.names=FALSE, quote=FALSE, sep='\t') +before = fread(args[4], header=FALSE) + +sumb=sum(before$V3-before$V2) +suma=sum(splitted$length) +difference = formatC(sumb/suma, digits=4) +loss = formatC(1 - suma/sumb, digits=2) +lengthb = formatC(mean(before$V3-before$V2), digits=4) +lengtha = formatC(mean(splitted$length), digits=4) +stats=data.frame(sum_nt_input=sumb, sum_nt_filtered=suma, factor=difference, loss=loss, mean_length_input=lengthb, mean_length_filtered=lengtha) +write.table(stats, "../FilterMotifs.stats", row.names=FALSE, quote=FALSE, sep='\t') From 0eb48b5b316304611c643adaec188450deabfa3e Mon Sep 17 00:00:00 2001 From: renewiegandt Date: Thu, 6 Dec 2018 14:47:32 +0100 Subject: [PATCH 074/158] Update README.md --- README.md | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) 
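As a usage illustration for the compareBed.sh interface patched above (all file and directory names here are hypothetical):
```console
bash bin/compareBed.sh --data footprints.bed --motifs motifs_dir/ --fasta genome.fa \
     --min 10 --max 80 --workdir tmp --output tmp/newMotifs.bed
```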
diff --git a/README.md b/README.md index 994029a..da59386 100644 --- a/README.md +++ b/README.md @@ -58,10 +58,19 @@ Optional arguments: --min_size_fp INT (Default: 10) --max_size_fp INT (Default: 100) + Cluster: + Sequence preparation/ reduction: + --kmer INT Kmer length (Default: 10) + --aprox_motif_len INT Motif length (Default: 10) + --motif_occurence FLOAT Percentage of motifs over all sequences. Use 1 (Default) to assume every sequence contains a motif. + --min_seq_length INT Remove all sequences below this value. (Default: 10) Clustering: - --sequence_coverage INT (Default: 8) - --kmer INT (Default: 10) - --aprox_motif_len INT (Default: 10) + --global INT Global (=1) or local (=0) alignment. (Default: 0) + --identity FLOAT Identity threshold. (Default: 0.8) + --sequence_coverage INT Minimum aligned nucleotides on both sequences. (Default: 8) + --memory INT Memory limit in MB. 0 for unlimited. (Default: 800) + --throw_away_seq INT Remove all sequences equal or below this length before clustering. (Default: 9) + --strand INT Align +/+ & +/- (= 1). Or align only +/+ (= 0). (Default: 0) Motif estimation: --motif_min_len INT Minimum length of Motif (Default: 8) From 119b876fff76f8bee8377a8a8c4c0ef96461b392 Mon Sep 17 00:00:00 2001 From: JannikHamp Date: Thu, 6 Dec 2018 15:12:06 +0100 Subject: [PATCH 075/158] fixing --- bin/compareBed.sh | 90 ++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 89 insertions(+), 1 deletion(-) diff --git a/bin/compareBed.sh b/bin/compareBed.sh index 22c288a..f594b6a 100644 --- a/bin/compareBed.sh +++ b/bin/compareBed.sh @@ -113,7 +113,11 @@ then echo " default is a new directory that is created in the current directory" echo " -min --min minimum size of footprints; default is 10" echo " -max --max maximum size of footprints; default is 80" +<<<<<<< HEAD echo " -o --output output path/file ; default dir is workdir and filename is newMotifs.bed and newMotifs.bed.fasta" +======= + echo " -o --output output path/file ; default dir is workdir and filename is newFootprints.bed and newFootprints.bed.fasta" +>>>>>>> 0eb48b5b316304611c643adaec188450deabfa3e echo " -h --help shows this help message" exit 0 fi @@ -148,7 +152,11 @@ fi if [ $ou == false ] then +<<<<<<< HEAD output=${workdir}/"newMotifs.bed" +======= + output=${workdir}/"newFootprints.bed" +>>>>>>> 0eb48b5b316304611c643adaec188450deabfa3e ou=true fi @@ -173,6 +181,7 @@ fi echo get sequences with no overlap cat $data > "$workdir"/pass1Tr.bed help=true +<<<<<<< HEAD if [ -d "$motifs" ] then @@ -204,6 +213,39 @@ do done fi +======= + +if [ -d $"motifs" ] +then +for i in "$motifs"/*.bed +do + if [ $help == true ] + then + help=false + bedtools intersect -v -a "$workdir"/pass1Tr.bed -b $i > "$workdir"/pass1TrHelp.bed + else + help=true + bedtools intersect -v -a "$workdir"/pass1TrHelp.bed -b $i > "$workdir"/pass1Tr.bed + fi + echo $i +done +else +declare -a motiffiles=(`echo $motifs | sed 's/,/ /g'`) +for i in ${motiffiles[@]} +do + if [ $help == true ] + then + help=false + bedtools intersect -v -a "$workdir"/pass1Tr.bed -b $i > "$workdir"/pass1TrHelp.bed + else + help=true + bedtools intersect -v -a "$workdir"/pass1TrHelp.bed -b $i > "$workdir"/pass1Tr.bed + fi + echo $i +done +fi + +>>>>>>> 0eb48b5b316304611c643adaec188450deabfa3e if [ $help == false ] then cat "$workdir"/pass1TrHelp.bed > "$workdir"/pass1Tr.bed @@ -214,11 +256,48 @@ cat "$workdir"/pass1Fa.bed | wc -l echo calc maxScore #2. 
compute absolut maxscore position +<<<<<<< HEAD Rscript --vanilla /bin/maxScore.R "$workdir"/pass1Fa.bed +======= +Rscript --vanilla /mnt/agnerds/Jannik.Hamp/maxScore.R "$workdir"/pass1Fa.bed +>>>>>>> 0eb48b5b316304611c643adaec188450deabfa3e #3 subtract overlapping regions for additional motifs echo "also get subsequences with no overlap of overlapping sequences" help=true +<<<<<<< HEAD + +if [ -d $"motifs" ] +then +for i in "$motifs"/*.bed +do + if [ $help == true ] + then + help=false + bedtools subtract -a "$workdir"/pass1Fa.bed -b $i > "$workdir"/pass1FaHelp.bed + else + help=true + bedtools subtract -a "$workdir"/pass1FaHelp.bed -b $i > "$workdir"/pass1Fa.bed + fi + echo $i +done +else +for i in ${motiffiles[@]} +do + if [ $help == true ] + then + help=false + bedtools subtract -a "$workdir"/pass1Fa.bed -b $i > "$workdir"/pass1FaHelp.bed + else + help=true + bedtools subtract -a "$workdir"/pass1FaHelp.bed -b $i > "$workdir"/pass1Fa.bed + fi + echo $i +done +fi + + +======= if [ -d $"motifs" ] then @@ -250,6 +329,7 @@ done fi +>>>>>>> 0eb48b5b316304611c643adaec188450deabfa3e if [ $help == false ] then cat "$workdir"/pass1FaHelp.bed > "$workdir"/pass2Tr.bed @@ -258,11 +338,19 @@ else fi #4. remove short/long motivs, make unique ids (relevant for some splitted tfbs from subtract) and handle maxScorePosition +<<<<<<< HEAD Rscript --vanilla /bin/merge.R $min $max "$workdir" $data +======= +Rscript --vanilla /mnt/agnerds/Jannik.Hamp/merge.R $min $max "$workdir" +>>>>>>> 0eb48b5b316304611c643adaec188450deabfa3e #5. add fasta sequences to bed and create fasta file bedtools getfasta -fi $fasta -bed "$workdir"/merged.bed -bedOut > "$output" bedtools getfasta -name -fi $fasta -bed "$output" -fo "$output".fasta #6 clean up -#rm "$workdir"/pass1Fa.bed "$workdir"/pass1Tr.bed "$workdir"/pass2Tr.bed "$workdir"/merged.bed "$workdir"/pass1FaHelp.bed "$workdir"/pass1TrHelp.bed \ No newline at end of file +<<<<<<< HEAD +#rm "$workdir"/pass1Fa.bed "$workdir"/pass1Tr.bed "$workdir"/pass2Tr.bed "$workdir"/merged.bed "$workdir"/pass1FaHelp.bed "$workdir"/pass1TrHelp.bed +======= +rm "$workdir"/pass1Fa.bed "$workdir"/pass1Tr.bed "$workdir"/pass2Tr.bed "$workdir"/merged.bed "$workdir"/pass1FaHelp.bed "$workdir"/pass1TrHelp.bed +>>>>>>> 0eb48b5b316304611c643adaec188450deabfa3e From b8f435fb5bd8fad7aa54e9d45fe0bd0dca77427e Mon Sep 17 00:00:00 2001 From: JannikHamp Date: Thu, 6 Dec 2018 15:12:24 +0100 Subject: [PATCH 076/158] fixing --- bin/maxScore.R | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/bin/maxScore.R b/bin/maxScore.R index 4347af2..c0d4cb1 100644 --- a/bin/maxScore.R +++ b/bin/maxScore.R @@ -3,8 +3,13 @@ library(data.table) args = commandArgs(TRUE) file = args[1] +<<<<<<< HEAD tab = fread(file) colnames(tab) = c("chromosome", "start", "stop", "id", "score", "length", "maxpos", "info") +======= +tab = fread(file, header=FALSE) +colnames(tab) = c("chromosome", "start", "stop", "id", "score", "maxpos", "length") +>>>>>>> 0eb48b5b316304611c643adaec188450deabfa3e tab$maxpos = tab$start + tab$maxpos -write.table(tab, file, row.names=FALSE, col.names=FALSE, quote=FALSE, sep='\t') +fwrite(tab, file, row.names=FALSE, col.names=FALSE, quote=FALSE, sep='\t') From df70d2929424ed9ffd3de7c95d9ee85b319ca658 Mon Sep 17 00:00:00 2001 From: JannikHamp Date: Thu, 6 Dec 2018 15:12:37 +0100 Subject: [PATCH 077/158] fixing --- bin/merge.R | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/bin/merge.R b/bin/merge.R index 728c9cf..b302d1f 100644 --- a/bin/merge.R +++ b/bin/merge.R @@ -4,10 
+4,17 @@ args=commandArgs(TRUE) min=as.numeric(args[1]) max=as.numeric(args[2]) folder=args[3] +<<<<<<< HEAD splitted = fread(paste(folder, "/pass2Tr.bed", sep='')) colnames(splitted) = c("chromosome", "start", "stop", "id", "score", "length", "maxpos", "info") p1 = fread(paste(folder, "/pass1Tr.bed", sep='')) colnames(p1) = c("chromosome", "start", "stop", "id", "score", "length", "maxpos", "info") +======= +splitted = fread(paste(folder, "/pass2Tr.bed", sep=''), header=FALSE) +colnames(splitted) = c("chromosome", "start", "stop", "id", "score", "length", "maxpos", "bonus_info") +p1 = fread(paste(folder, "/pass1Tr.bed", sep=''), header=TRUE) +colnames(p1) = c("chromosome", "start", "stop", "id", "score", "length", "maxpos", "bonus_info") +>>>>>>> 0eb48b5b316304611c643adaec188450deabfa3e p1$maxpos = p1$start + p1$maxpos splitted=rbind(splitted, p1) @@ -21,6 +28,7 @@ splitted=cbind(splitted, containsMaxpos=0) splitted$containsMaxpos[intersect(which(splitted$start <= splitted$maxpos), which(splitted$stop > splitted$maxpos))] = 1 splitted$maxpos = splitted$maxpos - splitted$start fwrite(splitted, paste(folder, "/merged.bed", sep=''), row.names=FALSE, col.names=FALSE, quote=FALSE, sep='\t') +<<<<<<< HEAD before = fread(args[4], header=FALSE) @@ -32,3 +40,5 @@ lengthb = formatC(mean(before$V3-before$V2), digits=4) lengtha = formatC(mean(splitted$length), digits=4) stats=data.frame(sum_nt_input=sumb, sum_nt_filtered=suma, factor=difference, loss=loss, mean_length_input=lengthb, mean_length_filtered=lengtha) write.table(stats, "../FilterMotifs.stats", row.names=FALSE, quote=FALSE, sep='\t') +======= +>>>>>>> 0eb48b5b316304611c643adaec188450deabfa3e From ff3570e8c9227d9eeac05ccde9acc440fe022057 Mon Sep 17 00:00:00 2001 From: renewiegandt Date: Thu, 6 Dec 2018 15:20:01 +0100 Subject: [PATCH 078/158] fixed failed merge --- bin/merge.R | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) diff --git a/bin/merge.R b/bin/merge.R index b302d1f..c2015aa 100644 --- a/bin/merge.R +++ b/bin/merge.R @@ -4,17 +4,12 @@ args=commandArgs(TRUE) min=as.numeric(args[1]) max=as.numeric(args[2]) folder=args[3] -<<<<<<< HEAD + splitted = fread(paste(folder, "/pass2Tr.bed", sep='')) colnames(splitted) = c("chromosome", "start", "stop", "id", "score", "length", "maxpos", "info") p1 = fread(paste(folder, "/pass1Tr.bed", sep='')) colnames(p1) = c("chromosome", "start", "stop", "id", "score", "length", "maxpos", "info") -======= -splitted = fread(paste(folder, "/pass2Tr.bed", sep=''), header=FALSE) -colnames(splitted) = c("chromosome", "start", "stop", "id", "score", "length", "maxpos", "bonus_info") -p1 = fread(paste(folder, "/pass1Tr.bed", sep=''), header=TRUE) -colnames(p1) = c("chromosome", "start", "stop", "id", "score", "length", "maxpos", "bonus_info") ->>>>>>> 0eb48b5b316304611c643adaec188450deabfa3e + p1$maxpos = p1$start + p1$maxpos splitted=rbind(splitted, p1) @@ -28,7 +23,6 @@ splitted=cbind(splitted, containsMaxpos=0) splitted$containsMaxpos[intersect(which(splitted$start <= splitted$maxpos), which(splitted$stop > splitted$maxpos))] = 1 splitted$maxpos = splitted$maxpos - splitted$start fwrite(splitted, paste(folder, "/merged.bed", sep=''), row.names=FALSE, col.names=FALSE, quote=FALSE, sep='\t') -<<<<<<< HEAD before = fread(args[4], header=FALSE) @@ -40,5 +34,4 @@ lengthb = formatC(mean(before$V3-before$V2), digits=4) lengtha = formatC(mean(splitted$length), digits=4) stats=data.frame(sum_nt_input=sumb, sum_nt_filtered=suma, factor=difference, loss=loss, mean_length_input=lengthb, 
mean_length_filtered=lengtha) write.table(stats, "../FilterMotifs.stats", row.names=FALSE, quote=FALSE, sep='\t') -======= ->>>>>>> 0eb48b5b316304611c643adaec188450deabfa3e + From 4988fabe476b84fc5de2ca98074b6d7cb48f118d Mon Sep 17 00:00:00 2001 From: renewiegandt Date: Thu, 6 Dec 2018 15:20:36 +0100 Subject: [PATCH 079/158] fixed failed merge --- bin/maxScore.R | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/bin/maxScore.R b/bin/maxScore.R index c0d4cb1..6b90984 100644 --- a/bin/maxScore.R +++ b/bin/maxScore.R @@ -3,13 +3,9 @@ library(data.table) args = commandArgs(TRUE) file = args[1] -<<<<<<< HEAD tab = fread(file) colnames(tab) = c("chromosome", "start", "stop", "id", "score", "length", "maxpos", "info") -======= -tab = fread(file, header=FALSE) -colnames(tab) = c("chromosome", "start", "stop", "id", "score", "maxpos", "length") ->>>>>>> 0eb48b5b316304611c643adaec188450deabfa3e + tab$maxpos = tab$start + tab$maxpos fwrite(tab, file, row.names=FALSE, col.names=FALSE, quote=FALSE, sep='\t') From db937a17f1af548dc0a1be96b9bb3a3086e88326 Mon Sep 17 00:00:00 2001 From: renewiegandt Date: Thu, 6 Dec 2018 15:25:35 +0100 Subject: [PATCH 080/158] fixed failed merge --- bin/compareBed.sh | 93 ++--------------------------------------------- 1 file changed, 4 insertions(+), 89 deletions(-) diff --git a/bin/compareBed.sh b/bin/compareBed.sh index f594b6a..b476a0d 100644 --- a/bin/compareBed.sh +++ b/bin/compareBed.sh @@ -113,11 +113,7 @@ then echo " default is a new directory that is created in the current directory" echo " -min --min minimum size of footprints; default is 10" echo " -max --max maximum size of footprints; default is 80" -<<<<<<< HEAD echo " -o --output output path/file ; default dir is workdir and filename is newMotifs.bed and newMotifs.bed.fasta" -======= - echo " -o --output output path/file ; default dir is workdir and filename is newFootprints.bed and newFootprints.bed.fasta" ->>>>>>> 0eb48b5b316304611c643adaec188450deabfa3e echo " -h --help shows this help message" exit 0 fi @@ -152,11 +148,7 @@ fi if [ $ou == false ] then -<<<<<<< HEAD output=${workdir}/"newMotifs.bed" -======= - output=${workdir}/"newFootprints.bed" ->>>>>>> 0eb48b5b316304611c643adaec188450deabfa3e ou=true fi @@ -181,7 +173,6 @@ fi echo get sequences with no overlap cat $data > "$workdir"/pass1Tr.bed help=true -<<<<<<< HEAD if [ -d "$motifs" ] then @@ -213,39 +204,6 @@ do done fi -======= - -if [ -d $"motifs" ] -then -for i in "$motifs"/*.bed -do - if [ $help == true ] - then - help=false - bedtools intersect -v -a "$workdir"/pass1Tr.bed -b $i > "$workdir"/pass1TrHelp.bed - else - help=true - bedtools intersect -v -a "$workdir"/pass1TrHelp.bed -b $i > "$workdir"/pass1Tr.bed - fi - echo $i -done -else -declare -a motiffiles=(`echo $motifs | sed 's/,/ /g'`) -for i in ${motiffiles[@]} -do - if [ $help == true ] - then - help=false - bedtools intersect -v -a "$workdir"/pass1Tr.bed -b $i > "$workdir"/pass1TrHelp.bed - else - help=true - bedtools intersect -v -a "$workdir"/pass1TrHelp.bed -b $i > "$workdir"/pass1Tr.bed - fi - echo $i -done -fi - ->>>>>>> 0eb48b5b316304611c643adaec188450deabfa3e if [ $help == false ] then cat "$workdir"/pass1TrHelp.bed > "$workdir"/pass1Tr.bed @@ -256,16 +214,12 @@ cat "$workdir"/pass1Fa.bed | wc -l echo calc maxScore #2. 
compute absolut maxscore position -<<<<<<< HEAD + Rscript --vanilla /bin/maxScore.R "$workdir"/pass1Fa.bed -======= -Rscript --vanilla /mnt/agnerds/Jannik.Hamp/maxScore.R "$workdir"/pass1Fa.bed ->>>>>>> 0eb48b5b316304611c643adaec188450deabfa3e #3 subtract overlapping regions for additional motifs echo "also get subsequences with no overlap of overlapping sequences" help=true -<<<<<<< HEAD if [ -d $"motifs" ] then @@ -296,40 +250,6 @@ do done fi - -======= - -if [ -d $"motifs" ] -then -for i in "$motifs"/*.bed -do - if [ $help == true ] - then - help=false - bedtools subtract -a "$workdir"/pass1Fa.bed -b $i > "$workdir"/pass1FaHelp.bed - else - help=true - bedtools subtract -a "$workdir"/pass1FaHelp.bed -b $i > "$workdir"/pass1Fa.bed - fi - echo $i -done -else -for i in ${motiffiles[@]} -do - if [ $help == true ] - then - help=false - bedtools subtract -a "$workdir"/pass1Fa.bed -b $i > "$workdir"/pass1FaHelp.bed - else - help=true - bedtools subtract -a "$workdir"/pass1FaHelp.bed -b $i > "$workdir"/pass1Fa.bed - fi - echo $i -done -fi - - ->>>>>>> 0eb48b5b316304611c643adaec188450deabfa3e if [ $help == false ] then cat "$workdir"/pass1FaHelp.bed > "$workdir"/pass2Tr.bed @@ -338,19 +258,14 @@ else fi #4. remove short/long motivs, make unique ids (relevant for some splitted tfbs from subtract) and handle maxScorePosition -<<<<<<< HEAD + Rscript --vanilla /bin/merge.R $min $max "$workdir" $data -======= -Rscript --vanilla /mnt/agnerds/Jannik.Hamp/merge.R $min $max "$workdir" ->>>>>>> 0eb48b5b316304611c643adaec188450deabfa3e + #5. add fasta sequences to bed and create fasta file bedtools getfasta -fi $fasta -bed "$workdir"/merged.bed -bedOut > "$output" bedtools getfasta -name -fi $fasta -bed "$output" -fo "$output".fasta #6 clean up -<<<<<<< HEAD #rm "$workdir"/pass1Fa.bed "$workdir"/pass1Tr.bed "$workdir"/pass2Tr.bed "$workdir"/merged.bed "$workdir"/pass1FaHelp.bed "$workdir"/pass1TrHelp.bed -======= -rm "$workdir"/pass1Fa.bed "$workdir"/pass1Tr.bed "$workdir"/pass2Tr.bed "$workdir"/merged.bed "$workdir"/pass1FaHelp.bed "$workdir"/pass1TrHelp.bed ->>>>>>> 0eb48b5b316304611c643adaec188450deabfa3e + From 9c3f325f196a702183ae5a36caf7a9f182b0160d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ren=C3=A9=20Wiegandt?= Date: Thu, 6 Dec 2018 10:08:16 -0500 Subject: [PATCH 081/158] minor fix --- pipeline.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pipeline.nf b/pipeline.nf index 00c23a1..000230f 100644 --- a/pipeline.nf +++ b/pipeline.nf @@ -105,7 +105,7 @@ All arguments can be set in the configuration files. 
} -bigwig_input.combine(bed_input).into {footprint_in} +bigwig_input.combine(bed_input).set{footprint_in} /* */ From cc88f6c11a7e0d0eae583124301ef5faec1f3e99 Mon Sep 17 00:00:00 2001 From: renewiegandt Date: Fri, 7 Dec 2018 00:23:10 +0100 Subject: [PATCH 082/158] added process merge_meme --- pipeline.nf | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/pipeline.nf b/pipeline.nf index 00c23a1..656a4ae 100644 --- a/pipeline.nf +++ b/pipeline.nf @@ -254,6 +254,26 @@ process glam2 { """ } +/* +process merge_meme { + + input: + val (meme_list) from meme_list + + output: + file ('merged_meme.meme') into merged_meme + + script: + memes = memelist.collect{it.toString().replaceAll(/\/glam2.meme/,"") } + meme_sorted = memes.sort(false) { it.toString().tokenize('_')[-1] as Integer } + meme_sorted_full = meme_sorted.collect {it.toString() + "/glam2.meme"} + meme_list = meme_sorted_full.toString().replaceAll(/\,|\[|\]/,"") + """ + meme2meme ${memelist} > merged_meme.meme + """ +} +*/ + /* Running Tomtom on meme-files generated by GLAM2. Tomtom searches motifs in databases. From deca64b45d163d43b2ec61e8fb75db08c8c2c484 Mon Sep 17 00:00:00 2001 From: renewiegandt Date: Sat, 8 Dec 2018 00:40:24 +0100 Subject: [PATCH 083/158] removed typo --- bin/bed_to_fasta.R | 56 +++++++++++++++++++++++----------------------- 1 file changed, 28 insertions(+), 28 deletions(-) diff --git a/bin/bed_to_fasta.R b/bin/bed_to_fasta.R index 09e3f4f..13bd026 100644 --- a/bin/bed_to_fasta.R +++ b/bin/bed_to_fasta.R @@ -1,28 +1,28 @@ -#!/usr/bin/env Rscript - -# Splitting BED-files depending on their cluster. -# The Sequences of each cluster are writen as an FASTA-file. -# @parameter bedInput BED-file with sequences and cluster-id as column"TEs -# @parameter prefix prefix for filenames -# @parameter min_seq min. number of sequences per cluster - -args = commandArgs(trailingOnly = TRUE) - -bedInput <- args[1] -prefix <- args[2] -min_seq <- args[3] - -bed <- data.table::fread(bedInput, header = FALSE, sep = "\t") - -clusters <- split(bed, bed$V8, sorted = TRUE, flatten = FALSE) # <---- Spalte mit Cluster -discard <- lapply(1:length(clusters), function(i){ - clust <- as.data.frame(clusters[i]) - print(nrow(clust)) - if (nrow(clust) >= as.numeric(min_seq) ) { - sequences <- as.list(clust[[7]]) # <---- Splate mit Sequenz - outfile <- paste0(prefix,"_cluster_",i,".FASTA") - seqinr::write.fasta(sequences = sequences, names = clust[[4]], file.out = outfile, as.string = TRUE) # <---- Spalte mit Name - } else { - print(paste0("Cluster: ",i," is to small")) - } -}) +#!/usr/bin/env Rscript + +# Splitting BED-files depending on their cluster. +# The Sequences of each cluster are writen as an FASTA-file. +# @parameter bedInput BED-file with sequences and cluster-id as columns: Sequence: Column 7; ID:Column 8 +# @parameter prefix prefix for filenames +# @parameter min_seq min. 
number of sequences per cluster
+
+args = commandArgs(trailingOnly = TRUE)
+
+bedInput <- args[1]
+prefix <- args[2]
+min_seq <- args[3]
+
+bed <- data.table::fread(bedInput, header = FALSE, sep = "\t")
+
+clusters <- split(bed, bed$V8, sorted = TRUE, flatten = FALSE) # <---- column containing the cluster id
+discard <- lapply(1:length(clusters), function(i){
+  clust <- as.data.frame(clusters[i])
+  print(nrow(clust))
+  if (nrow(clust) >= as.numeric(min_seq) ) {
+    sequences <- as.list(clust[[7]]) # <---- column containing the sequence
+    outfile <- paste0(prefix,"_cluster_",i,".FASTA")
+    seqinr::write.fasta(sequences = sequences, names = clust[[4]], file.out = outfile, as.string = TRUE) # <---- column containing the name
+  } else {
+    print(paste0("Cluster: ",i," is too small"))
+  }
+})

From b784574791cf8ec8d782d2013b04fb35f66980b2 Mon Sep 17 00:00:00 2001
From: renewiegandt
Date: Sat, 8 Dec 2018 00:41:38 +0100
Subject: [PATCH 084/158] setting up processes for motif clustering

---
 pipeline.nf | 32 ++++++++++++++++++++++++++++++--
 1 file changed, 30 insertions(+), 2 deletions(-)

diff --git a/pipeline.nf b/pipeline.nf
index 656a4ae..9b2e67a 100644
--- a/pipeline.nf
+++ b/pipeline.nf
@@ -220,7 +220,7 @@ process bed_to_clustered_fasta {
     set name, file (bed) from bed_for_motif_esitmation

   output:
-  file ('*.FASTA') into fasta_for_glam2
+  file ('*.FASTA') into fasta_for_glam2, fasta_for_motif_cluster

   script:
   """
@@ -231,7 +231,8 @@
 //flatten list and adding name of file to channel value
 fasta_for_glam2 = fasta_for_glam2.flatten().map {it -> [it.simpleName, it]}
-
+//combine fasta files in one list
+fasta_for_motif_cluster = fasta_for_motif_cluster.toList()

 /*
 Running GLAM2 on FASTA-files.
@@ -255,6 +256,8 @@
 }

 /*
+Combining alle MEME-files to one big MEME-file.
+The paths are sorted numerically depending on the cluster number.
 process merge_meme {

   input:
@@ -272,6 +275,31 @@
   meme2meme ${memelist} > merged_meme.meme
   """
 }
+
+
+process find_similar_motifs {
+
+  input:
+
+  output:
+
+  script:
+  """
+  """
+}
+
+
+process merge_fasta {
+
+  input:
+
+
+  output:
+
+  script:
+  """
+  """
+}
 */

 /*

From bff581741f05ddafbda400b763f87e63c40a6e5f Mon Sep 17 00:00:00 2001
From: renewiegandt
Date: Sat, 8 Dec 2018 00:42:03 +0100
Subject: [PATCH 085/158] added script merge_similar_clusters.R

---
 bin/merge_similar_clusters.R | 55 ++++++++++++++++++++++++++++++
 1 file changed, 55 insertions(+)
 create mode 100644 bin/merge_similar_clusters.R

diff --git a/bin/merge_similar_clusters.R b/bin/merge_similar_clusters.R
new file mode 100644
index 0000000..f410fa5
--- /dev/null
+++ b/bin/merge_similar_clusters.R
@@ -0,0 +1,55 @@
+#!/usr/bin/env Rscript
+
+# Merging FASTA-files whose motifs are similar.
+#
+# @parameter tsv_in       Path to TSV file generated by Tomtom.
+#                         The input for Tomtom is a meme-file merged from all clusters.
+# @parameter file_list    Numerically sorted whitespace separated list of absolute fasta-file paths
+# @parameter min_weight   Minimum weight of edge allowed in graph clusters.
+
+
+args = commandArgs(trailingOnly = TRUE)
+
+tsv_in <- args[1]
+file_list <- args[2]
+min_weight <- args[3]
+
+files <- unlist(as.list(strsplit(file_list, " ")))
+
+# split the string on the character "." in the first two columns and save the last value of each, to get the number of the cluster.
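A hedged illustration of what this comment (and the code that resumes directly below) does, using invented IDs: the script assumes that each Tomtom Query_ID/Target_ID ends in ".<cluster number>", so splitting on "." and keeping the last token recovers which cluster a motif came from.

    ids <- c("merged_motifs.3", "merged_motifs.17")                   # hypothetical Tomtom IDs
    unlist(lapply(strsplit(ids, "\\."), function(l) tail(l, n = 1)))  # "3" "17"

as.numeric() then turns these tokens into usable node labels for the similarity graph.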
+tsv <- data.table::fread(tsv_in, header = TRUE, sep = "\t",colClasses = 'character')
+query_cluster <- unlist(lapply(strsplit(tsv[["Query_ID"]],"\\."), function(l){
+  tail(l,n=1)
+}))
+target_cluster <- unlist(lapply(strsplit(tsv[["Target_ID"]],"\\."), function(l){
+  tail(l,n=1)
+}))
+
+# create data.table with only the cluster-numbers
+sim_not_unique <- data.table::data.table(query_cluster,target_cluster)
+# convert from character to numeric values
+sim_not_unique[, query_cluster := as.numeric(query_cluster)]
+sim_not_unique[, target_cluster := as.numeric(target_cluster)]
+
+# remove rows if column 1 is identical to column 2
+edgelist <- sim_not_unique[query_cluster != target_cluster]
+
+# create graph from data.frame
+g <- igraph::graph_from_edgelist(as.matrix(g))
+# converting graph to adjacency matrix
+adj_matrix <- igraph::get.adjacency(g, names = T)
+# generating weighted graph from adjacency matrix
+g_adj <- igraph::graph_from_adjacency_matrix(adj_matrix, weighted = T)
+
+# get subgraphs from graph with edges of weight > min_weight
+s1 <- igraph::subgraph.edges(g_adj, igraph::E(g_adj)[igraph::E(g_adj)$weight>min_weight], del=F)
+
+clust <- clusters(s1)
+
+# merge FASTA-files depending on the clustered graphs
+a <- lapply(seq(from = 1, to = clust$no, by = 1), function(i){
+  cl <- as.vector(which(clust$membership %in% c(i)))
+  fasta_list <- paste(files[cl], collapse = " ")
+  name <- paste0("Cluster_",i,".fasta")
+  system(paste("cat",fasta_list,">",name))
+})

From 10a7e8cb84cf101922bebc882ad9add8290dc2f9 Mon Sep 17 00:00:00 2001
From: renewiegandt
Date: Sat, 8 Dec 2018 22:47:23 +0100
Subject: [PATCH 086/158] added motif clustering

---
 bin/get_best_motif.py          |  9 ++--
 bin/merge_similar_clusters.R   | 10 +++--
 config/motif_estimation.config |  5 +++
 pipeline.nf                    | 78 +++++++++++++++++++++++++++-------
 4 files changed, 79 insertions(+), 23 deletions(-)

diff --git a/bin/get_best_motif.py b/bin/get_best_motif.py
index 770c8a8..0e7334e 100644
--- a/bin/get_best_motif.py
+++ b/bin/get_best_motif.py
@@ -3,7 +3,8 @@
 def parse_arguments():
     parser = argparse.ArgumentParser()
     parser.add_argument("meme", help="Path to meme file")
-    parser.add_argument("output", help="")
+    parser.add_argument("output", help="Output file")
+    parser.add_argument("num", help="Number of motifs parsed from file")
     args = parser.parse_args()
     return args

@@ -11,13 +12,15 @@ def parse_arguments():
 def main():
     args = parse_arguments()
     out = open(args.output, "w+")
+    number = int(args.num) + 1
+    motif = "MOTIF " + str(number)
     with open(args.meme) as f:
         for line in f:
-            if 'MOTIF 2' in line:
+            if motif in line:
                 break
             out.write(line)

 if __name__ == "__main__":
     import argparse
-    main()
\ No newline at end of file
+    main()
diff --git a/bin/merge_similar_clusters.R b/bin/merge_similar_clusters.R
index f410fa5..e72fa03 100644
--- a/bin/merge_similar_clusters.R
+++ b/bin/merge_similar_clusters.R
@@ -14,7 +14,7 @@ tsv_in <- args[1]
 file_list <- args[2]
 min_weight <- args[3]

-files <- unlist(as.list(strsplit(file_list, " ")))
+files <- unlist(as.list(strsplit(file_list, ",")))

 # split the string on the character "." in the first two columns and save the last value of each, to get the number of the cluster.
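The hunks below also repair a slip in the listing above: igraph::graph_from_edgelist() was accidentally called on the not-yet-defined g instead of edgelist, and clusters() gains its igraph:: prefix. A self-contained sketch of the intended flow, with an invented edge list of cluster pairs (not the pipeline's real data):

    library(igraph)
    edges <- matrix(c(1, 2,  2, 1,  2, 1,  3, 4), ncol = 2, byrow = TRUE)  # invented similarity pairs
    g <- graph_from_edgelist(edges)
    adj <- get.adjacency(g)                         # entry [i, j] counts parallel i -> j edges
    g_adj <- graph_from_adjacency_matrix(adj, weighted = TRUE)
    s1 <- subgraph.edges(g_adj, E(g_adj)[E(g_adj)$weight > 1], delete.vertices = FALSE)
    clusters(s1)$membership                         # 1 1 2 3: clusters 1 and 2 merge, 3 and 4 stay apart

Only pairs reported more often than the threshold (here, more than once) survive, which is exactly how min_weight prunes weak motif similarities before FASTA-files are merged.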
tsv <- data.table::fread(tsv_in, header = TRUE, sep = "\t",colClasses = 'character') @@ -35,7 +35,7 @@ sim_not_unique[, target_cluster := as.numeric(target_cluster)] edgelist <- sim_not_unique[query_cluster != target_cluster] # create graph from data.frame -g <- igraph::graph_from_edgelist(as.matrix(g)) +g <- igraph::graph_from_edgelist(as.matrix(edgelist)) # converting graph to adjacency matrix adj_matrix <- igraph::get.adjacency(g, names = T) # generating weighted graph from adjacency matrix @@ -43,8 +43,10 @@ g_adj <- igraph::graph_from_adjacency_matrix(adj_matrix, weighted = T) # get subgraphs from graph with edges of weight > min_weight s1 <- igraph::subgraph.edges(g_adj, igraph::E(g_adj)[igraph::E(g_adj)$weight>min_weight], del=F) - -clust <- clusters(s1) +png('motif_clusters.png') +plot(s1) +dev.off() +clust <- igraph::clusters(s1) # merge FASTA-files depending on the clustered graphs a <- lapply(seq(from = 1, to = clust$no, by = 1), function(i){ diff --git a/config/motif_estimation.config b/config/motif_estimation.config index d280b3e..d12e13c 100644 --- a/config/motif_estimation.config +++ b/config/motif_estimation.config @@ -9,4 +9,9 @@ params { //tomtom tomtom_treshold = 0.01 + + //motif Clustering + edge_weight = 5 + motif_similarity_thresh = 0.00001 + best_motif = 3 } diff --git a/pipeline.nf b/pipeline.nf index 9b2e67a..b1084df 100644 --- a/pipeline.nf +++ b/pipeline.nf @@ -49,6 +49,12 @@ params.config="" //tomtom params.tomtom_treshold = 0.01 // threshold for similarity score. + //cluster motifs + params.edge_weight = 5 // Minimum weight of edges in motif-cluster-graph + motif_similarity_thresh = 0.00001 // threshold for motif similarity score + + params.best_motif = 3 // Top n motifs per cluster + //creating_gtf params.organism="homo_sapiens" params.tissue="" @@ -96,6 +102,10 @@ Optional arguments: --interation INT Number of iterations done by glam2. More Interations: better results, higher runtime. (Default: 10000) --tomtom_treshold float Threshold for similarity score. (Default: 0.01) + Moitf clustering: + --edge_weight INT Minimum weight of edges in motif-cluster-graph (Default: 5) + --motif_similarity_thresh FLOAT threshold for motif similarity score (Default: 0.00001) + Creating GTF: --organism [homo_sapiens | mus_musculus] --tissues @@ -211,16 +221,18 @@ process clustering { Converting BED-File to one FASTA-File per cluster */ process bed_to_clustered_fasta { - conda "${path_env}" tag{name} - publishDir '/mnt/agnerds/Rene.Wiegandt/10_Master/tmp/', mode: 'copy' input: - set name, file (bed) from bed_for_motif_esitmation + set name, file (bed) from clustered_bed + + when: + params.fasta == false output: - file ('*.FASTA') into fasta_for_glam2, fasta_for_motif_cluster + file ('*.FASTA') into fasta_for_glam2 + file ('*.FASTA') into fasta_for_motif_cluster script: """ @@ -239,7 +251,6 @@ Running GLAM2 on FASTA-files. Generating Motifs through alignment and scoring best local matches. */ process glam2 { - conda "${path_env}" tag{name} @@ -247,7 +258,7 @@ process glam2 { set name, file (fasta) from fasta_for_glam2 output: - set name, file('*.meme') into meme_for_tomtom, meme_for_filter + file("${name}/*.meme") into meme_to_merge script: """ @@ -256,12 +267,13 @@ process glam2 { } /* -Combining alle MEME-files to one big MEME-file. +Combining all MEME-files to one big MEME-file. The paths are sorted numerically depending on the cluster number. 
+*/ process merge_meme { input: - val (meme_list) from meme_list + val (memelist) from meme_to_merge.toList() output: file ('merged_meme.meme') into merged_meme @@ -272,34 +284,71 @@ process merge_meme { meme_sorted_full = meme_sorted.collect {it.toString() + "/glam2.meme"} meme_list = meme_sorted_full.toString().replaceAll(/\,|\[|\]/,"") """ - meme2meme ${memelist} > merged_meme.meme + meme2meme ${meme_list} > merged_meme.meme """ } - +/* +Running Tomtom on merged meme-files. +Output table has the information which clusters are similar to each other. +*/ process find_similar_motifs { + input: + file (merged_meme) from merged_meme output: + file ('all_to_all.tsv') into motif_similarity script: """ + tomtom ${merged_meme} ${merged_meme} -thresh ${params.motif_similarity_thresh} -text --norc | sed '/^#/ d' | sed '/^\$/d' > all_to_all.tsv """ } +files_for_merge_fasta = motif_similarity.combine(fasta_for_motif_cluster) + + process merge_fasta { input: + set file (motiv_sim), val (fasta_list) from files_for_merge_fasta + + output: + file ('*.fasta') into motif_clustered_fasta_list + file('*.png') + + script: + fa_sorted = fasta_list.sort(false) { it.getBaseName().tokenize('_')[-1] as Integer } + fastalist = fa_sorted.toString().replaceAll(/\s|\[|\]/,"") + """ + Rscript ${path_bin}/merge_similar_clusters.R ${motiv_sim} ${fastalist} ${params.edge_weight} + """ +} + +motif_clustered_fasta_flat = motif_clustered_fasta_list.flatten() + + +process clustered_glam2 { + + input: + file (fasta) from motif_clustered_fasta_flat output: + set name, file ('*.meme') into clustered_meme_for_tomtom + set name, file ('*.meme') into clustered_meme_for_filter + file('*') script: + name = fasta.getBaseName() """ + glam2 n ${fasta} -O . -a ${params.motif_min_len} -b ${params.motif_max_len} -z 5 -n ${params.interation} """ } + */ /* @@ -307,21 +356,18 @@ Running Tomtom on meme-files generated by GLAM2. Tomtom searches motifs in databases. */ process tomtom { - conda "${path_env}" tag{name} - publishDir '/mnt/agnerds/Rene.Wiegandt/10_Master/tmp/', mode: 'copy' - input: - set name, file (meme), file (jaspar_db) from meme_for_tomtom.combine(db_for_tomtom) + set name, file (meme) from clustered_meme_for_tomtom output: set name, file ('*.tsv') into tsv_for_filter script: """ - tomtom ${meme} ${jaspar_db} -thresh ${params.tomtom_treshold} -text --norc | sed '/^#/ d' | sed '/^\$/d' > ${name}_known_motif.tsv + tomtom ${meme} ${jaspar_db} -thresh ${params.thresh} -mi 1 -text | sed '/^#/ d' | sed '/^\$/d' > ${name}_known_motif.tsv """ } @@ -366,7 +412,7 @@ process get_best_motif { script: """ - python ${path_bin}/get_best_motif.py ${meme} ${name}_best.meme + python ${path_bin}/get_best_motif.py ${meme} ${name}_best.meme ${params.best_motif} """ } From e0bc519c4e13140a7d3762c626d5285e5f458f67 Mon Sep 17 00:00:00 2001 From: renewiegandt Date: Sat, 8 Dec 2018 22:50:18 +0100 Subject: [PATCH 087/158] Update README.md --- README.md | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index da59386..28ae357 100644 --- a/README.md +++ b/README.md @@ -64,6 +64,7 @@ Optional arguments: --aprox_motif_len INT Motif length (Default: 10) --motif_occurence FLOAT Percentage of motifs over all sequences. Use 1 (Default) to assume every sequence contains a motif. --min_seq_length INT Remove all sequences below this value. (Default: 10) + Clustering: --global INT Global (=1) or local (=0) alignment. (Default: 0) --identity FLOAT Identity threshold. 
(Default: 0.8) @@ -78,11 +79,15 @@ Optional arguments: --interation INT Number of iterations done by glam2. More Interations: better results, higher runtime. (Default: 10000) --tomtom_treshold float Threshold for similarity score. (Default: 0.01) + Moitf clustering: + --edge_weight INT Minimum weight of edges in motif-cluster-graph (Default: 5) + --motif_similarity_thresh FLOAT threshold for motif similarity score (Default: 0.00001) + Creating GTF: --organism [homo_sapiens | mus_musculus] --tissues - - All arguments can be set in the configuration files. + +All arguments can be set in the configuration files. ``` From 889899f546543fa2f946e4e96618422eda16ee8c Mon Sep 17 00:00:00 2001 From: renewiegandt Date: Sat, 8 Dec 2018 22:51:33 +0100 Subject: [PATCH 088/158] Update README.md --- README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.md b/README.md index 28ae357..02f4ec4 100644 --- a/README.md +++ b/README.md @@ -2,6 +2,8 @@ De novo motif discovery and evaluation based on footprints identified by TOBIAS +For further information read the [documentation](https://github.molgen.mpg.de/loosolab/masterJLU2018/wiki) + ## Dependencies * [conda](https://conda.io/docs/user-guide/install/linux.html) * [Nextflow](https://www.nextflow.io/) From 250ee928d998f7112cf419ae80122bada284fb32 Mon Sep 17 00:00:00 2001 From: JannikHamp Date: Mon, 10 Dec 2018 09:36:31 +0100 Subject: [PATCH 089/158] Update compareBed.sh --- bin/compareBed.sh | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/bin/compareBed.sh b/bin/compareBed.sh index b476a0d..ee65fb4 100644 --- a/bin/compareBed.sh +++ b/bin/compareBed.sh @@ -96,7 +96,7 @@ if [ $he == true ] then echo "This script utilies bedtools to select new footprints from data." echo "Therefore the data is compared with known footprint motifs." - echo "The output is a new .bed file with added sequence information." + echo "The output is a new .bed file with added sequence information and a column with flags for better sequences (1)" echo "Required arguments are data and motifs, both in bed-format, and the fasta genome sequences." echo "If a parameter is chosen, a value must be provided or an error will occur." echo "--------------------" @@ -110,7 +110,7 @@ then echo " " echo " optional parameter:" echo " -w --workdir the path to directory where temporary files will be created" - echo " default is a new directory that is created in the current directory" + echo " default is the current directory" echo " -min --min minimum size of footprints; default is 10" echo " -max --max maximum size of footprints; default is 80" echo " -o --output output path/file ; default dir is workdir and filename is newMotifs.bed and newMotifs.bed.fasta" @@ -258,6 +258,7 @@ else fi #4. 
remove short/long motivs, make unique ids (relevant for some splitted tfbs from subtract) and handle maxScorePosition +# also creates a small output file with information about the comparison Rscript --vanilla /bin/merge.R $min $max "$workdir" $data From e861fdcdb584816acdb7c3be24fc63f86be09811 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ren=C3=A9=20Wiegandt?= Date: Mon, 10 Dec 2018 05:42:40 -0500 Subject: [PATCH 090/158] minor changes --- masterenv.yml | 6 +----- pipeline.nf | 2 +- 2 files changed, 2 insertions(+), 6 deletions(-) diff --git a/masterenv.yml b/masterenv.yml index a977fe1..f776235 100644 --- a/masterenv.yml +++ b/masterenv.yml @@ -1,10 +1,6 @@ name: masterenv -channels: - - bioconda - - conda-forge dependencies: - - bedtools - r-seqinr - numpy - pybigWig @@ -24,4 +20,4 @@ dependencies: - biopython - pybedtools - matplotlib - - seaborn \ No newline at end of file + - seaborn diff --git a/pipeline.nf b/pipeline.nf index 000230f..3becaf6 100644 --- a/pipeline.nf +++ b/pipeline.nf @@ -144,7 +144,7 @@ process extract_known_TFBS { script: """ - python ${path_bin}/tfbsscan.py --use moods --core ${params.threads} -m ${db} -g ${fasta} -o ./ + python ${path_bin}/tfbsscan.py --use moods --core ${params.threads} -m ${db} -g ${fasta} -o . """ } From 638133b9ccd9c61513ec36c6097f3fbb5097bc7a Mon Sep 17 00:00:00 2001 From: basti Date: Mon, 10 Dec 2018 13:27:40 +0100 Subject: [PATCH 091/158] Implemented --dir parameter for remote data directory --- bin/Modules/Ensembl/ActivityCategorizer.py | 11 +++-- bin/Modules/Ensembl/ActivityTable.py | 7 ++- bin/Modules/Ensembl/Ensembl.py | 10 ++--- .../Ensembl/FTPHandling/URLRetrieve.py | 4 +- .../Ensembl/FTPHandling/VersionChecker.py | 44 ++++++++++++------- bin/Modules/Ensembl/GTFGen.py | 12 ++--- bin/Modules/ucsc/ucsc.py | 7 ++- bin/RegGTFExtractor.py | 24 +++++----- 8 files changed, 71 insertions(+), 48 deletions(-) mode change 100644 => 100755 bin/RegGTFExtractor.py diff --git a/bin/Modules/Ensembl/ActivityCategorizer.py b/bin/Modules/Ensembl/ActivityCategorizer.py index 3ef0258..bd82a92 100644 --- a/bin/Modules/Ensembl/ActivityCategorizer.py +++ b/bin/Modules/Ensembl/ActivityCategorizer.py @@ -4,7 +4,7 @@ class ActivityCategorizer: - def __init__(self, release, organism, wd): + def __init__(self, release, organism, wd, data_dir): # List of all Folders with Activity Tables @@ -18,7 +18,7 @@ def __init__(self, release, organism, wd): self.activity = {} - self.get_activity_data(release, organism, wd) + self.get_activity_data(release, organism, wd, data_dir) # Categorized Activity from Json-config print("Categorization: This may take a while") @@ -41,11 +41,14 @@ def read_config(self, organism, wd): return c_dict - def get_activity_data(self, release, organism, wd): + def get_activity_data(self, release, organism, wd, data_dir): for folder in self.folderlist: # Generate path to binary File - file = os.path.join(wd + "/EnsemblData", release, organism, "activity", folder, "table.bin") + if data_dir: + file = os.path.join(data_dir + "/EnsemblData", release, organism, "activity", folder, "table.bin") + else: + file = os.path.join(wd + "/data/EnsemblData", release, organism, "activity", folder, "table.bin") with open(file, "rb") as tables: self.activity[folder] = bytearray(tables.read()) diff --git a/bin/Modules/Ensembl/ActivityTable.py b/bin/Modules/Ensembl/ActivityTable.py index 34f4719..040d15b 100644 --- a/bin/Modules/Ensembl/ActivityTable.py +++ b/bin/Modules/Ensembl/ActivityTable.py @@ -15,8 +15,11 @@ class ActivityTable: 4, "activity=NA" """ - def 
__init__(self, organism, current_release, wd): - self.link = os.path.join(wd + "/EnsemblData/", current_release, organism, "activity") + def __init__(self, organism, current_release, wd, data_dir): + if data_dir: + self.link = os.path.join(data_dir + "/EnsemblData/", current_release, organism, "activity") + else: + self.link = os.path.join(wd + "/data/EnsemblData/", current_release, organism, "activity") self.folders = next(os.walk(self.link))[1] self.generator = ATGenerator(["activity=ACTIVE", "activity=POISED", diff --git a/bin/Modules/Ensembl/Ensembl.py b/bin/Modules/Ensembl/Ensembl.py index f0a922c..de5839d 100644 --- a/bin/Modules/Ensembl/Ensembl.py +++ b/bin/Modules/Ensembl/Ensembl.py @@ -6,15 +6,15 @@ class Ensembl: - def __init__(self, organism, wd): + def __init__(self, organism, wd, data_dir): print("Starting Ensembl") - self.updater = FTPRetriever(organism, wd) + self.updater = FTPRetriever(organism, wd, data_dir) self.release = self.updater.get_release() - self.acttable = ActivityTable(organism, self.release, wd) + self.acttable = ActivityTable(organism, self.release, wd, data_dir) self.acttable.check_and_generate_activity_table() - self.categorizer = ActivityCategorizer(self.release, organism, wd) + self.categorizer = ActivityCategorizer(self.release, organism, wd, data_dir) print("Generating GTF") - self.gtf_generator = GTFGen(organism, self.release, wd) + self.gtf_generator = GTFGen(organism, self.release, wd, data_dir) print("Ensembl Finished !") diff --git a/bin/Modules/Ensembl/FTPHandling/URLRetrieve.py b/bin/Modules/Ensembl/FTPHandling/URLRetrieve.py index 598a6f5..43feb54 100644 --- a/bin/Modules/Ensembl/FTPHandling/URLRetrieve.py +++ b/bin/Modules/Ensembl/FTPHandling/URLRetrieve.py @@ -15,8 +15,8 @@ def change_dir(self, wd): def get_all_entries(self): return self.ftp.nlst() - def get_all_entries_from_dir(self, dir): - self.change_dir(dir) + def get_all_entries_from_dir(self, dire): + self.change_dir(dire) return self.get_all_entries() def get_all_entries_as_FTPEntry(self): diff --git a/bin/Modules/Ensembl/FTPHandling/VersionChecker.py b/bin/Modules/Ensembl/FTPHandling/VersionChecker.py index 9c3500d..d4c1ca6 100644 --- a/bin/Modules/Ensembl/FTPHandling/VersionChecker.py +++ b/bin/Modules/Ensembl/FTPHandling/VersionChecker.py @@ -9,12 +9,12 @@ class EnsemblRegulationFTPRetriever: And downloading newest version if necessary """ - def __init__(self, organism, wd): + def __init__(self, organism, wd, data_dir): self.site_ftp = FTPHandler("ftp.ensembl.org", "pub") self.remoteversion = self.get_current_ftp_version() - self.localversion = self.get_current_local_version(wd) - if self.check_version_difference(organism, wd): - self.download_currentversion_version(self.remoteversion, organism, wd) + self.localversion = self.get_current_local_version(wd, data_dir) + if self.check_version_difference(organism, wd, data_dir): + self.download_currentversion_version(self.remoteversion, organism, wd, data_dir) else: print("Newest Version installed, no update needed.") @@ -31,15 +31,25 @@ def get_current_ftp_version(self): print("Current release is "+c_release) return c_release - def check_organism(self, organism, release, wd): - if organism in next(os.walk(os.path.join(wd+"/EnsemblData/"+release+"/")))[1]: - return False + def check_organism(self, organism, release, wd, data_dir): + if data_dir: + if organism in next(os.walk(os.path.join(data_dir+"/EnsemblData/"+release+"/")))[1]: + return False + else: + print("No Local Version for "+organism+" installed. 
Installing...") + return True else: - print("No Local Version for "+organism+" installed. Installing...") - return True + if organism in next(os.walk(os.path.join(wd+"/data/EnsemblData/"+release+"/")))[1]: + return False + else: + print("No Local Version for "+organism+" installed. Installing...") + return True - def get_current_local_version(self, wd): - directories = next(os.walk(os.path.join(wd+"/EnsemblData/")))[1] + def get_current_local_version(self, wd, data_dir): + if data_dir: + directories = next(os.walk(os.path.join(data_dir + "/EnsemblData/")))[1] + else: + directories = next(os.walk(os.path.join(wd+"/data/EnsemblData/")))[1] for dir in directories: if "release" in dir: localversion = sorted(directories, reverse=True)[0] @@ -51,7 +61,7 @@ def get_current_local_version(self, wd): print("No Version installed !") return None - def check_version_difference(self, organism, wd): + def check_version_difference(self, organism, wd, data_dir): local_version = self.localversion remote_version = self.remoteversion @@ -64,16 +74,18 @@ def check_version_difference(self, organism, wd): print("Outdated Version detected ! local: " + local_version + " remote: " + remote_version) return True else: - if self.check_organism(organism, local_version, wd): + if self.check_organism(organism, local_version, wd, data_dir): return True else: return False - def download_currentversion_version(self, version, organism, wd): + def download_currentversion_version(self, version, organism, wd, data_dir): # Download Base File - - targetfolder = os.path.join(wd+"/EnsemblData/", version, organism) + if data_dir: + targetfolder = os.path.join(data_dir + "/EnsemblData/", version, organism) + else: + targetfolder = os.path.join(wd+"/data/EnsemblData/", version, organism) os.makedirs(targetfolder) folder_url = "/pub/"+version+"/regulation/"+organism+"/" self.site_ftp.change_dir(folder_url) diff --git a/bin/Modules/Ensembl/GTFGen.py b/bin/Modules/Ensembl/GTFGen.py index cd70f3b..6a1a63b 100644 --- a/bin/Modules/Ensembl/GTFGen.py +++ b/bin/Modules/Ensembl/GTFGen.py @@ -5,14 +5,16 @@ class GTFGen: - def __init__(self, organism, release, wd): + def __init__(self, organism, release, wd, data_dir): - self.gff_lines = self.get_organism_as_gff(organism, release, wd) + self.gff_lines = self.get_organism_as_gff(organism, release, wd, data_dir) self.value_map = {0: "ACTIVE", 1: "POISED", 2: "REPRESSED", 3: "INACTIVE", 4: "NA"} - def get_organism_as_gff(self, organism, release, wd): - - directory = os.path.join(wd + "/EnsemblData/", release, organism) + def get_organism_as_gff(self, organism, release, wd, data_dir): + if data_dir: + directory = os.path.join(data_dir + "/EnsemblData/", release, organism) + else: + directory = os.path.join(wd + "/data/EnsemblData/", release, organism) inputfile = "" for file in os.listdir(directory): if file.endswith("gff.gz"): diff --git a/bin/Modules/ucsc/ucsc.py b/bin/Modules/ucsc/ucsc.py index 49e44ad..c8ad466 100644 --- a/bin/Modules/ucsc/ucsc.py +++ b/bin/Modules/ucsc/ucsc.py @@ -7,10 +7,13 @@ class UcscGtf: - def __init__(self, org, wd): + def __init__(self, org, wd, data_dir): self.organism_id = self.get_organism_id(org) self.link = "http://hgdownload.soe.ucsc.edu/gbdb/"+self.organism_id+"/ncbiRefSeq/refSeqFuncElems.bb" - self.output = os.path.join(wd + "/UCSCData/"+self.organism_id+".bed") + if data_dir: + self.output = os.path.join(data_dir + "/UCSCData" + self.organism_id+".bed") + else: + self.output = os.path.join(wd + "/data/UCSCData/" + self.organism_id+".bed") self.path_to_bin = 
os.path.join(wd + "/Modules/ucsc/bigBedToBed") print("Getting UCSC Data") print("Path to Bin: " + self.path_to_bin) diff --git a/bin/RegGTFExtractor.py b/bin/RegGTFExtractor.py old mode 100644 new mode 100755 index e4657be..6f0e11c --- a/bin/RegGTFExtractor.py +++ b/bin/RegGTFExtractor.py @@ -13,14 +13,14 @@ def check_for_local_folder(wd): if not os.path.isdir(os.path.join(wd+"/EnsemblData")): - os.mkdir(os.path.join(wd+"/EnsemblData")) + os.mkdir(os.path.join(wd+"/data/EnsemblData")) - if not os.path.isdir(os.path.join(wd+"/UCSCData" )): - os.mkdir(os.path.join(wd+"/UCSCData" )) + if not os.path.isdir(os.path.join(wd+"/data/UCSCData")): + os.mkdir(os.path.join(wd+"/data/UCSCData")) def check_filter(tissue_cmd, org, wd): - path_to_config = os.path.join(wd + "/config/celltypes_" + org + ".json" ) + path_to_config = os.path.join(wd + "/config/celltypes_" + org + ".json") tissues_config = [] if not tissue_cmd: return False @@ -36,9 +36,9 @@ def check_filter(tissue_cmd, org, wd): return False -def main_script(org, wd, tissuetype=None): - - check_for_local_folder(wd) +def main_script(org, wd, data_dir, tissuetype=None): + if not data_dir: + check_for_local_folder(wd) if check_filter(tissuetype, org, wd): tissues = tissuetype print("Filter detected !") @@ -46,8 +46,8 @@ def main_script(org, wd, tissuetype=None): tissues = None print("Filter not detected !") - ucsc = UcscGtf(org, wd) - ense = Ensembl(org, wd) + ucsc = UcscGtf(org, wd, data_dir) + ense = Ensembl(org, wd, data_dir) print("Getting Unique Results") unique_filter = UniqueFilter(ense.get_gtf(), ucsc.get_gtf(), tissues) ResultSaver(unique_filter.get_results(), org, tissues) @@ -57,12 +57,12 @@ def main_script(org, wd, tissuetype=None): parser = argparse.ArgumentParser(description='GTF-Generator from UCSC Table Browser and Ensembl Regulatory Build' ) parser.add_argument('organism', help='Source organism [ homo_sapiens or mus_musculus ]', action='store', nargs='?', type=str) parser.add_argument('--tissue', help='Tissue- or Celltype(s)', action='store', nargs='*', type=str) - parser.add_argument('--wd', help='Working directory. default: "."', action='store', default='.', type=str) + parser.add_argument('--wd', help='Working directory. default: "."', action='store', default=os.getcwd(), type=str) + parser.add_argument('--dir', help='Data directory. default: "working_directory"', action='store', default="", type=str) args = vars(parser.parse_args()) - print("Working Dir: " + args["wd"]) if args["organism"]: print("Working Dir: " + args["wd"]) - main_script(args["organism"], args["wd"], args["tissue"]) + main_script(args["organism"], args["wd"], args["dir"], args["tissue"]) else: print("No Arguments found -> See python3 ./RegGTFExtractor.py -h for help.") From 0a0e9b8cf35c27ab95b7693e247a60d10a772a36 Mon Sep 17 00:00:00 2001 From: anastasiia Date: Mon, 10 Dec 2018 13:56:53 +0100 Subject: [PATCH 092/158] correcting some errors --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 02f4ec4..5c93985 100644 --- a/README.md +++ b/README.md @@ -20,7 +20,7 @@ export PATH=[meme-suite instalation path]/bin:$PATH Download all files from the [GitHub repository](https://github.molgen.mpg.de/loosolab/masterJLU2018). The Nextflow-script needs a conda enviroment to run. Nextflow can create the needed enviroment from the given yaml-file. 
-On some systems Nrxtflow exits the run with following error:
+On some systems Nextflow exits the run with the following error:
 ```
 Caused by: Failed to create Conda environment
@@ -81,7 +81,7 @@ Optional arguments:
   --interation INT Number of iterations done by glam2. More Interations: better results, higher runtime. (Default: 10000)
   --tomtom_treshold float Threshold for similarity score. (Default: 0.01)

- Moitf clustering:
+ Motif clustering:
   --edge_weight INT Minimum weight of edges in motif-cluster-graph (Default: 5)
   --motif_similarity_thresh FLOAT threshold for motif similarity score (Default: 0.00001)

From c1cb8ca922c1b5a439745cd22da17765e4ba8879 Mon Sep 17 00:00:00 2001
From: renewiegandt
Date: Mon, 10 Dec 2018 14:38:02 +0100
Subject: [PATCH 093/158] Update README.md

---
 README.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/README.md b/README.md
index 5c93985..2e16064 100644
--- a/README.md
+++ b/README.md
@@ -42,6 +42,7 @@ nextflow run pipeline.nf --input [BigWig-file] --bed [BED-file] --genome_fasta [
 ```
 ## Parameters
+For a detailed overview of all Parameters follow this [link](https://github.molgen.mpg.de/loosolab/masterJLU2018/wiki/Configuration).

From 42d32387b14945a46b53e234cac0dff80d63d825 Mon Sep 17 00:00:00 2001
From: anastasiia
Date: Mon, 10 Dec 2018 14:43:14 +0100
Subject: [PATCH 094/158] adding comment about the footprints_extraction

---
 pipeline.nf | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pipeline.nf b/pipeline.nf
index b1084df..866def8 100644
--- a/pipeline.nf
+++ b/pipeline.nf
@@ -117,7 +117,7 @@ All arguments can be set in the configuration files.
 bigwig_input.combine(bed_input).into {footprint_in}

 /*
-
+this process uses the discontinuous score from a bigWig file to estimate footprints within peaks of interest
 */
 process footprint_extraction {
 conda "${path_env}"

From 198334ab0c44ec2dc65431e2a13e351948785d38 Mon Sep 17 00:00:00 2001
From: renewiegandt
Date: Mon, 10 Dec 2018 14:45:15 +0100
Subject: [PATCH 095/158] Update README.md

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 2e16064..f53d9df 100644
--- a/README.md
+++ b/README.md
@@ -42,7 +42,7 @@ nextflow run pipeline.nf --input [BigWig-file] --bed [BED-file] --genome_fasta [
 ```
 ## Parameters
-For a detailed overview of all Parameters follow this [link](https://github.molgen.mpg.de/loosolab/masterJLU2018/wiki/Configuration).
+For a detailed overview for all parameters follow this [link](https://github.molgen.mpg.de/loosolab/masterJLU2018/wiki/Configuration).

From 5a7a52683b15b85946779d8e04abc792866fa586 Mon Sep 17 00:00:00 2001
From: anastasiia
Date: Mon, 10 Dec 2018 14:52:53 +0100
Subject: [PATCH 096/158] improvement of parameters for my part

---
 README.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index f53d9df..09b7ee3 100644
--- a/README.md
+++ b/README.md
@@ -45,8 +45,8 @@ For a detailed overview for all parameters follow this [link](https://github.molgen.mpg.de/loosolab/masterJLU2018/wiki/Configuration).
 ```
 Required arguments:
-    --input                 Path to BigWig-file
-    --bed                   Path to BED-file
+    --input                 Path to BigWig-file with scores on the peaks of interest
+    --bed                   Path to BED-file with peaks of interest corresponding to the BigWig file
     --genome_fasta          Path to genome in FASTA-format
     --jaspar_db             Path to motif-database in MEME-format

From ffed012ffbdfbbccf27464576a77ec4eb083c118 Mon Sep 17 00:00:00 2001
From: anastasiia
Date: Mon, 10 Dec 2018 14:54:19 +0100
Subject: [PATCH 097/158] improve parameters for my part once more

---
 README.md | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/README.md b/README.md
index 09b7ee3..66b2b1f 100644
--- a/README.md
+++ b/README.md
@@ -53,9 +53,9 @@ Required arguments:

 Optional arguments:

   Footprint extraction:
-    --window_length INT (Default: 200)
-    --step INT (Default: 100)
-    --percentage INT(Default: 0)
+    --window_length INT (Default: 200) a length of a window
+    --step INT (Default: 100) an interval to slide the window
+    --percentage INT (Default: 0) a percentage to be added to background while searching for footprints

From 15aa68a214d05e7581714623db1a215f2d3a21e1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ren=C3=A9=20Wiegandt?=
Date: Mon, 10 Dec 2018 09:13:39 -0500
Subject: [PATCH 098/158] help message fix

---
 pipeline.nf | 71 ++++++++++++++++++++++++++---------------------------
 1 file changed, 35 insertions(+), 36 deletions(-)

diff --git a/pipeline.nf b/pipeline.nf
index f458f83..516b7da 100644
--- a/pipeline.nf
+++ b/pipeline.nf
@@ -1,10 +1,4 @@
-//!/usr/bin/env nextflow
-
-Channel.fromPath(params.input).map {it -> [it.simpleName, it]}.set {bigwig_input}
-Channel.fromPath(params.bed).set {bed_input}
-Channel.fromPath(params.genome_fasta).into {fa_overlap; fa_scan; fa_overlap_2}
-Channel.fromPath(params.jaspar_db).into {db_for_motivscan; db_for_tomtom}
-Channel.fromPath(params.config).set {config}
+#!/usr/bin/env nextflow

 //setting default values
 params.input=""
@@ -65,36 +59,36 @@ log.info """
 Usage: nextflow run pipeline.nf --input [BigWig-file] --bed [BED-file] --genome_fasta [FASTA-file] --jaspar_db [MEME-file]

 Required arguments:
-    --input Path to BigWig-file
-    --bed Path to BED-file
-    --genome_fasta Path to genome in FASTA-format
-    --jaspar_db Path to motif-database in MEME-format
+    --input                 Path to BigWig-file
+    --bed                   Path to BED-file
+    --genome_fasta          Path to genome in FASTA-format
+    --jaspar_db             Path to motif-database in MEME-format

 Optional arguments:

   Footprint extraction:
-    --window_length INT (Default: 200)
-    --step INT (Default: 100)
-    --percentage INT(Default: 0)
+    --window_length INT (Default: 200)
+    --step INT (Default: 100)
+    --percentage INT (Default: 0)

   Filter unknown motifs:
-    --min_size_fp INT (Default: 10)
-    --max_size_fp INT (Default: 100)
+    --min_size_fp INT (Default: 10)
+    --max_size_fp INT (Default: 100)
+
+  Clustering:
+    Sequence preparation/ reduction:
+      --kmer INT Kmer length (Default: 10)
+      --aprox_motif_len INT Motif length (Default: 10)
+      --motif_occurence FLOAT Percentage of motifs over all sequences. Use 1 (Default) to assume every sequence contains a motif.
+      --min_seq_length INT Remove all sequences below this value. (Default: 10)
(Default: 10) - - Clustering: - --global INT Global (=1) or local (=0) alignment. (Default: 0) - --identity FLOAT Identity threshold. (Default: 0.8) - --sequence_coverage INT Minimum aligned nucleotides on both sequences. (Default: 8) - --memory INT Memory limit in MB. 0 for unlimited. (Default: 800) - --throw_away_seq INT Remove all sequences equal or below this length before clustering. (Default: 9) - --strand INT Align +/+ & +/- (= 1). Or align only +/+ (= 0). (Default: 0) + --global INT Global (=1) or local (=0) alignment. (Default: 0) + --identity FLOAT Identity threshold. (Default: 0.8) + --sequence_coverage INT Minimum aligned nucleotides on both sequences. (Default: 8) + --memory INT Memory limit in MB. 0 for unlimited. (Default: 800) + --throw_away_seq INT Remove all sequences equal or below this length before clustering. (Default: 9) + --strand INT Align +/+ & +/- (= 1). Or align only +/+ (= 0). (Default: 0) Motif estimation: --motif_min_len INT Minimum length of Motif (Default: 8) @@ -103,21 +97,27 @@ Optional arguments: --tomtom_treshold float Threshold for similarity score. (Default: 0.01) Moitf clustering: - --edge_weight INT Minimum weight of edges in motif-cluster-graph (Default: 5) - --motif_similarity_thresh FLOAT threshold for motif similarity score (Default: 0.00001) + --edge_weight INT Minimum weight of edges in motif-cluster-graph (Default: 5) + --motif_similarity_thresh FLOAT Threshold for motif similarity score (Default: 0.00001) Creating GTF: - --organism [homo_sapiens | mus_musculus] - --tissues + --organism [homo_sapiens | mus_musculus] + --tissues All arguments can be set in the configuration files. """ +} else { + Channel.fromPath(params.input).map {it -> [it.simpleName, it]}.set {bigwig_input} + Channel.fromPath(params.bed).set {bed_input} + Channel.fromPath(params.genome_fasta).into {fa_overlap; fa_scan; fa_overlap_2} + Channel.fromPath(params.jaspar_db).into {db_for_motivscan; db_for_tomtom} + Channel.fromPath(params.config).set {config} } bigwig_input.combine(bed_input).set{footprint_in} /* -this process uses the uncontinuous score from a bigWig file to estimate footpints within peaks of interest +This process uses the uncontinuous score from a bigWig file to estimate footpints within peaks of interest */ process footprint_extraction { conda "${path_env}" @@ -225,7 +225,7 @@ process bed_to_clustered_fasta { tag{name} input: - set name, file (bed) from clustered_bed + set name, file (bed) from bed_for_motif_esitmation when: params.fasta == false @@ -349,7 +349,6 @@ process clustered_glam2 { """ } -*/ /* Running Tomtom on meme-files generated by GLAM2. From 30c17cc6cc3fe2d0da11615df899a0df0b09cb87 Mon Sep 17 00:00:00 2001 From: basti Date: Mon, 10 Dec 2018 16:25:23 +0100 Subject: [PATCH 099/158] Updated README. Added CrossMap functionality. Added mm9 and hg19 support. 
From 30c17cc6cc3fe2d0da11615df899a0df0b09cb87 Mon Sep 17 00:00:00 2001
From: basti
Date: Mon, 10 Dec 2018 16:25:23 +0100
Subject: [PATCH 099/158] Updated README. Added CrossMap functionality. Added
 mm9 and hg19 support.

---
 README.md                    |    8 +-
 bin/Modules/CrossMap.py      | 1759 ++++++++++++++++++++++++++++++++++
 bin/Modules/CrossMapper.py   |   40 +
 bin/Modules/SaveResults.py   |   13 +-
 bin/Modules/ucsc/bigBedToBed |  Bin
 bin/RegGTFExtractor.py       |   61 +-
 masterenv.yml                |    3 +-
 7 files changed, 1868 insertions(+), 16 deletions(-)
 create mode 100644 bin/Modules/CrossMap.py
 create mode 100644 bin/Modules/CrossMapper.py
 mode change 100644 => 100755 bin/Modules/ucsc/bigBedToBed

diff --git a/README.md b/README.md
index 02f4ec4..7eb9268 100644
--- a/README.md
+++ b/README.md
@@ -48,6 +48,7 @@ Required arguments:
   --bed           Path to BED-file
   --genome_fasta  Path to genome in FASTA-format
   --jaspar_db     Path to motif-database in MEME-format
+  --organism      STRING  Source organism: [hg19 | hg38 | mm9 | mm10]
 
 Optional arguments:
 
@@ -86,9 +87,10 @@ Optional arguments:
     --motif_similarity_thresh FLOAT   threshold for motif similarity score (Default: 0.00001)
 
   Creating GTF:
-    --organism [homo_sapiens | mus_musculus]
-    --tissues
-
+    --tissues  STRING  Filter for tissue/category activity categories as in the JSON config (Default: None)
+    --wd       PATH    current working directory for the script (default: ".")
+    --dir      PATH    directory for saved Ensembl / UCSC files (default: "./data/")
+    --out      PATH    directory for the output GTF (default: ".")
 
 All arguments can be set in the configuration files.
 ```
diff --git a/bin/Modules/CrossMap.py b/bin/Modules/CrossMap.py
new file mode 100644
index 0000000..97cfe0b
--- /dev/null
+++ b/bin/Modules/CrossMap.py
@@ -0,0 +1,1759 @@
+#!/home/basti/miniconda3/envs/mpi-project/bin/python
+'''
+---------------------------------------------------------------------------------------
+CrossMap: lift over genomic coordinates between genome assemblies.
+Supports BED/BedGraph, GFF/GTF, BAM/SAM, BigWig/Wig, VCF format files.
+------------------------------------------------------------------------------------------
+'''
+
+import os,sys
+import optparse
+import collections
+import subprocess
+import string
+from textwrap import wrap
+from time import strftime
+
+import pyBigWig
+import pysam
+from bx.intervals import *
+import numpy as np
+import datetime
+from cmmodule import ireader
+from cmmodule import BED
+from cmmodule import annoGene
+from cmmodule import sam_header
+from cmmodule import myutils
+from cmmodule import bgrMerge
+
+__author__ = "Liguo Wang, Hao Zhao"
+__contributor__="Liguo Wang, Hao Zhao"
+__copyright__ = "Copyleft"
+__credits__ = []
+__license__ = "GPLv2"
+__version__="0.3.1"
+__maintainer__ = "Liguo Wang"
+__email__ = "wangliguo78@gmail.com"
+__status__ = "Production"
+
+def printlog (mesg_lst):
+	'''
+	print progress into stderr
+	'''
+	if len(mesg_lst)==1:
+		msg = "@ " + strftime("%Y-%m-%d %H:%M:%S") + ": " + mesg_lst[0]
+	else:
+		msg = "@ " + strftime("%Y-%m-%d %H:%M:%S") + ": " + ' '.join(mesg_lst)
+	print(msg, file=sys.stderr)
+
+def parse_header( line ):
+	return dict( [ field.split( '=' ) for field in line.split()[1:] ] )
+
+def revcomp_DNA(dna):
+	'''
+	reverse complement of input DNA sequence.
+	'''
+	complement = {'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A','N':'N','X':'X'}
+	seq = dna.upper()
+	return ''.join(complement[base] for base in reversed(seq))
+
+def wiggleReader( f ):
+	'''
+	Iterator yielding chrom, start, end, strand, value.
+	Values are zero-based, half-open.
+	Regions which lack scores are ignored.
+	'''
+	current_chrom = None
+	current_pos = None
+	current_step = None
+
+	# always for wiggle data
+	strand = '+'
+
+	mode = "bed"
+	for line in ireader.reader(f):
+		if line.isspace() or line.startswith( "track" ) or line.startswith( "#" ) or line.startswith( "browser" ):
+			continue
+		elif line.startswith( "variableStep" ):
+			header = parse_header( line )
+			current_chrom = header['chrom']
+			current_pos = None
+			current_step = None
+			if 'span' in header: current_span = int( header['span'] )
+			else: current_span = 1
+			mode = "variableStep"
+		elif line.startswith( "fixedStep" ):
+			header = parse_header( line )
+			current_chrom = header['chrom']
+			current_pos = int( header['start'] ) - 1
+			current_step = int( header['step'] )
+			if 'span' in header: current_span = int( header['span'] )
+			else: current_span = 1
+			mode = "fixedStep"
+		elif mode == "bed":
+			fields = line.split()
+			if len( fields ) > 3:
+				if len( fields ) > 5:
+					yield fields[0], int( fields[1] ), int( fields[2] ), fields[5], float( fields[3] )
+				else:
+					yield fields[0], int( fields[1] ), int( fields[2] ), strand, float( fields[3] )
+		elif mode == "variableStep":
+			fields = line.split()
+			pos = int( fields[0] ) - 1
+			yield current_chrom, pos, pos + current_span, strand, float( fields[1] )
+		elif mode == "fixedStep":
+			yield current_chrom, current_pos, current_pos + current_span, strand, float( line.split()[0] )
+			current_pos += current_step
+		else:
+			raise Exception("Unexpected input line: %s" % line.strip())
+
+def bigwigReader(infile, bin_size = 2000):
+	'''
+	infile: bigwig format file
+	bin_size: tiling step used to walk over each chromosome
+	return: (chrom, start, end, value) tuples, 0-based half-open
+	'''
+
+	bw = pyBigWig.open(infile)
+	chrom_sizes = bw.chroms()
+	#print (chrom_sizes)
+
+	for chr_name, chr_size in list(chrom_sizes.items()):
+		for chrom, st, end in BED.tillingBed(chrName = chr_name,chrSize = chr_size,stepSize = bin_size):
+			try:
+				a = bw.stats(chrom,st,end)
+			except:
+				continue
+
+			if bw.stats(chrom,st,end)[0] is None:
+				continue
+			sig_list = bw.values(chrom,st,end)
+			sig_list = np.nan_to_num(sig_list) #change 'nan' into '0.'
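+			# The loop below run-length-encodes the per-base values: consecutive
+			# positions sharing the same score collapse into a single
+			# (chrom, start, end, score) record; note that the trailing run of a
+			# bin is not yielded when the loop ends.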
+ low_bound = st + point_num = 1 + score = sig_list[0] + for value in (sig_list[1:]): + if value == score: + point_num += 1 + else: + yield(( chrom, low_bound, low_bound + point_num, score)) + score = value + low_bound = low_bound + point_num + point_num = 1 + + +def check_bed12(bedline): + ''' + check if bed12 format is correct or not + ''' + fields = bedline.strip().split() + if len(fields) !=12: + return False + if fields[5] not in ['+','-','.']: + return False + try: + chromStart = int(fields[1]) + chromEnd = int(fields[2]) + thickStart = int(fields[6]) + thickEnd = int(fields[7]) + blockCount = int(fields[9]) + blockSizes = [int(i) for i in fields[10].rstrip(',').split(',')] + blockStarts = [int(i) for i in fields[11].rstrip(',').split(',')] + except: + return False + if chromStart > chromEnd or thickStart > thickEnd: + return False + if thickStart < chromStart or thickEnd > chromEnd: + return False + if len(blockSizes) != blockCount: + return False + if len(blockStarts) != blockCount: + return False + if blockCount <1: + return False + for i in blockSizes: + if i < 0: return False + for i in blockStarts: + if i < 0: return False + return True + +def intersectBed(xxx_todo_changeme, xxx_todo_changeme1): + ''' + return intersection of two bed regions + ''' + (chr1, st1, end1) = xxx_todo_changeme + (chr2, st2, end2) = xxx_todo_changeme1 + if int(st1) > int(end1) or int(st2) > int(end2): + raise Exception ("Start cannot be larger than end") + if chr1 != chr2: + return None + if int(st1) > int(end2) or int(end1) < int(st2): + return None + return (chr1, max(st1, st2), min(end1,end2)) + +def read_chain_file (chain_file, print_table=False): + ''' + input chain_file could be either plain text, compressed file (".gz", ".Z", ".z", ".bz", ".bz2", ".bzip2"), + or a URL pointing to the chain file ("http://", "https://", "ftp://"). If url was used, chain file must be plain text + ''' + + printlog(["Read chain_file: ", chain_file]), + maps={} + target_chromSize={} + source_chromSize={} + if print_table: + blocks=[] + + for line in ireader.reader(chain_file): + # Example: chain 4900 chrY 58368225 + 25985403 25985638 chr5 151006098 - 43257292 43257528 1 + if not line.strip(): + continue + line=line.strip() + if line.startswith(('#',' ')):continue + fields = line.split() + + if fields[0] == 'chain' and len(fields) in [12, 13]: + score = int(fields[1]) # Alignment score + source_name = fields[2] # E.g. chrY + source_size = int(fields[3]) # Full length of the chromosome + source_strand = fields[4] # Must be + + if source_strand != '+': + raise Exception("Source strand in an .over.chain file must be +. (%s)" % line) + source_start = int(fields[5]) # Start of source region + source_end = int(fields[6]) # End of source region + + target_name = fields[7] # E.g. chr5 + target_size = int(fields[8]) # Full length of the chromosome + target_strand = fields[9] # + or - + target_start = int(fields[10]) + target_end = int(fields[11]) + target_chromSize[target_name]= target_size + source_chromSize[source_name] = source_size + + if target_strand not in ['+', '-']: + raise Exception("Target strand must be - or +. 
(%s)" % line) + #if target_strand == '+': + # target_start = int(fields[10]) + # target_end = int(fields[11]) + #if target_strand == '-': + # target_start = target_size - target_end + # target_end = target_size - target_start + chain_id = None if len(fields) == 12 else fields[12] + if source_name not in maps: + maps[source_name] = Intersecter() + + sfrom, tfrom = source_start, target_start + + # Now read the alignment chain from the file and store it as a list (source_from, source_to) -> (target_from, target_to) + elif fields[0] != 'chain' and len(fields) == 3: + size, sgap, tgap = int(fields[0]), int(fields[1]), int(fields[2]) + if print_table: + if target_strand == '+': blocks.append((source_name,sfrom, sfrom+size, source_strand, target_name, tfrom, tfrom+size, target_strand)) + elif target_strand == '-': blocks.append((source_name,sfrom, sfrom+size, source_strand, target_name, target_size - (tfrom+size), target_size - tfrom, target_strand)) + + if target_strand == '+': + maps[source_name].add_interval( Interval(sfrom, sfrom+size,(target_name,tfrom, tfrom+size,target_strand))) + elif target_strand == '-': + maps[source_name].add_interval( Interval(sfrom, sfrom+size,(target_name,target_size - (tfrom+size), target_size - tfrom, target_strand))) + + sfrom += size + sgap + tfrom += size + tgap + + elif fields[0] != 'chain' and len(fields) == 1: + size = int(fields[0]) + if print_table: + if target_strand == '+': blocks.append((source_name,sfrom, sfrom+size, source_strand, target_name, tfrom, tfrom+size, target_strand)) + elif target_strand == '-': blocks.append((source_name,sfrom, sfrom+size, source_strand, target_name, target_size - (tfrom+size), target_size - tfrom, target_strand)) + + if target_strand == '+': + maps[source_name].add_interval( Interval(sfrom, sfrom+size,(target_name,tfrom, tfrom+size,target_strand))) + elif target_strand == '-': + maps[source_name].add_interval( Interval(sfrom, sfrom+size,(target_name,target_size - (tfrom+size), target_size - tfrom, target_strand))) + else: + raise Exception("Invalid chain format. (%s)" % line) + if (sfrom + size) != source_end or (tfrom + size) != target_end: + raise Exception("Alignment blocks do not match specified block sizes. 
(%s)" % header) + + if print_table: + for i in blocks: + print('\t'.join([str(n) for n in i])) + + return (maps,target_chromSize, source_chromSize) + + +def map_coordinates(mapping, q_chr, q_start, q_end, q_strand='+', print_match=False): + ''' + Map coordinates from source assembly to target assembly + ''' + + matches=[] + complement ={'+':'-','-':'+'} + if q_chr not in mapping: + return None + else: + targets = mapping[q_chr].find(q_start, q_end) + if len(targets)==0: + return None + elif len(targets)==1: + s_start = targets[0].start + s_end = targets[0].end + t_chrom = targets[0].value[0] + t_start = targets[0].value[1] + t_end = targets[0].value[2] + t_strand = targets[0].value[3] + + (chr, real_start, real_end) = intersectBed((q_chr,q_start,q_end),(q_chr,s_start,s_end)) + l_offset = abs(real_start - s_start) + #r_offset = real_end - s_end + size = abs(real_end - real_start) + + matches.append( (chr, real_start, real_end,q_strand)) + if t_strand == '+': + i_start = t_start + l_offset + if q_strand == '+': + matches.append( (t_chrom, i_start, i_start + size, t_strand)) + else: + matches.append( (t_chrom, i_start, i_start + size, complement[t_strand])) + elif t_strand == '-': + i_start = t_end - l_offset - size + if q_strand == '+': + matches.append( (t_chrom, i_start, i_start + size, t_strand)) + else: + matches.append( (t_chrom, i_start, i_start + size, complement[t_strand])) + else: + raise Exception("Unknown strand: %s. Can only be '+' or '-'." % q_strand) + + elif len(targets) > 1: + for t in targets: + s_start = t.start + s_end = t.end + t_chrom = t.value[0] + t_start = t.value[1] + t_end = t.value[2] + t_strand = t.value[3] + + (chr, real_start, real_end) = intersectBed((q_chr,q_start,q_end),(q_chr,s_start,s_end)) + + l_offset = abs(real_start - s_start) + #r_offset = abs(real_end - s_end) + size = abs(real_end - real_start) + + matches.append( (chr, real_start, real_end,q_strand) ) + if t_strand == '+': + i_start = t_start + l_offset + if q_strand == '+': + matches.append( (t_chrom, i_start, i_start + size, t_strand)) + else: + matches.append( (t_chrom, i_start, i_start + size, complement[t_strand])) + elif t_strand == '-': + i_start = t_end - l_offset - size + if q_strand == '+': + matches.append( (t_chrom, i_start, i_start + size, t_strand)) + else: + matches.append( (t_chrom, i_start, i_start + size, complement[t_strand])) + else: + raise Exception("Unknown strand: %s. Can only be '+' or '-'." % q_strand) + + if print_match: + print(matches) + # input: 'chr1',246974830,247024835 + # output: [('chr1', 246974830, 246974833, '+' ), ('chr1', 248908207, 248908210, '+' ), ('chr1', 247024833, 247024835, '+'), ('chr1', 249058210, 249058212,'+')] + # [('chr1', 246974830, 246974833), ('chr1', 248908207, 248908210)] + + return matches + +def crossmap_vcf_file(mapping, infile,outfile, liftoverfile, refgenome): + ''' + Convert genome coordinates in VCF format. + ''' + + #index refegenome file if it hasn't been done + if not os.path.exists(refgenome + '.fai'): + printlog(["Creating index for", refgenome]) + pysam.faidx(refgenome) + + refFasta = pysam.Fastafile(refgenome) + + FILE_OUT = open(outfile ,'w') + UNMAP = open(outfile + '.unmap','w') + + total = 0 + fail = 0 + withChr = False # check if the VCF data lines use 'chr1' or '1' + + for line in ireader.reader(infile): + if not line.strip(): + continue + line=line.strip() + + #deal with meta-information lines. 
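+		# Routing: generic meta lines are copied to both the lifted VCF and the
+		# .unmap file; source-assembly lines (##assembly, ##contig) go only to
+		# the .unmap file; the #CHROM line triggers rewriting the ##contig
+		# entries from the target reference before the data lines are converted.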
+ + #meta-information lines needed in both mapped and unmapped files + if line.startswith('##fileformat'): + print(line, file=FILE_OUT) + print(line, file=UNMAP) + elif line.startswith('##INFO'): + print(line, file=FILE_OUT) + print(line, file=UNMAP) + elif line.startswith('##FILTER'): + print(line, file=FILE_OUT) + print(line, file=UNMAP) + elif line.startswith('##FORMAT'): + print(line, file=FILE_OUT) + print(line, file=UNMAP) + elif line.startswith('##ALT'): + print(line, file=FILE_OUT) + print(line, file=UNMAP) + elif line.startswith('##SAMPLE'): + print(line, file=FILE_OUT) + print(line, file=UNMAP) + elif line.startswith('##PEDIGREE'): + print(line, file=FILE_OUT) + print(line, file=UNMAP) + + #meta-information lines needed in unmapped files + elif line.startswith('##assembly'): + print(line, file=UNMAP) + elif line.startswith('##contig'): + print(line, file=UNMAP) + if 'ID=chr' in line: + withChr = True + + #update contig information + elif line.startswith('#CHROM'): + printlog(["Updating contig field ... "]) + target_gsize = dict(list(zip(refFasta.references, refFasta.lengths))) + for chr_id in sorted(target_gsize): + if chr_id.startswith('chr'): + if withChr is True: + print("##contig=" % (chr_id, target_gsize[chr_id], os.path.basename(refgenome)), file=FILE_OUT) + else: + print("##contig=" % (chr_id.replace('chr',''), target_gsize[chr_id], os.path.basename(refgenome)), file=FILE_OUT) + else: + if withChr is True: + print("##contig=" % ('chr' + chr_id, target_gsize[chr_id], os.path.basename(refgenome)), file=FILE_OUT) + else: + print("##contig=" % (chr_id, target_gsize[chr_id], os.path.basename(refgenome)), file=FILE_OUT) + + print("##liftOverProgram=CrossMap(https://sourceforge.net/projects/crossmap/)", file=FILE_OUT) + print("##liftOverFile=" + liftoverfile, file=FILE_OUT) + print("##new_reference_genome=" + refgenome, file=FILE_OUT) + print("##liftOverTime=" + datetime.date.today().strftime("%B%d,%Y"), file=FILE_OUT) + print(line, file=FILE_OUT) + print(line, file=UNMAP) + + else: + if line.startswith('#'):continue + fields = str.split(line,maxsplit=7) + total += 1 + + if fields[0].startswith('chr'): + chrom = fields[0] + else: + chrom = 'chr' + fields[0] + + start = int(fields[1])-1 # 0 based + end = start + len(fields[3]) + a = map_coordinates(mapping, chrom, start, end,'+') + if a is None: + print(line, file=UNMAP) + fail += 1 + continue + + if len(a) == 2: + # update chrom + target_chr = str(a[1][0]) #target_chr is from chain file, could be 'chr1' or '1' + target_start = a[1][1] + target_end = a[1][2] + if withChr is False: + fields[0] = target_chr.replace('chr','') + else: + fields[0] = target_chr + + # update start coordinate + fields[1] = target_start + 1 + + if refFasta.references[0].startswith('chr'): + if target_chr.startswith('chr'): + fields[3] = refFasta.fetch(target_chr,target_start,target_end).upper() + else: + fields[3] = refFasta.fetch('chr'+target_chr,target_start,target_end).upper() + else: + if target_chr.startswith('chr'): + fields[3] = refFasta.fetch(target_chr.replace('chr',''),target_start,target_end).upper() + else: + fields[3] = refFasta.fetch(target_chr,target_start,target_end).upper() + + + + if fields[3] != fields[4]: + print('\t'.join(map(str, fields)), file=FILE_OUT) + else: + print(line, file=UNMAP) + fail += 1 + else: + print(line, file=UNMAP) + fail += 1 + continue + FILE_OUT.close() + UNMAP.close() + printlog (["Total entries:", str(total)]) + printlog (["Failed to map:", str(fail)]) + + +def crossmap_bed_file(mapping, inbed,outfile=None): + 
''' + Convert genome coordinates (in bed format) between assemblies. + BED format: http://genome.ucsc.edu/FAQ/FAQformat.html#format1 + ''' + + # check if 'outfile' was set. If not set, print to screen, if set, print to file + if outfile is not None: + FILE_OUT = open(outfile,'w') + UNMAP = open(outfile + '.unmap','w') + else: + pass + + for line in ireader.reader(inbed): + if line.startswith('#'):continue + if line.startswith('track'):continue + if line.startswith('browser'):continue + if not line.strip():continue + line=line.strip() + fields=line.split() + strand = '+' + + # filter out line less than 3 columns + if len(fields)<3: + print("Less than 3 fields. skip " + line, file=sys.stderr) + if outfile: + print(line, file=UNMAP) + continue + try: + int(fields[1]) + except: + print("Start corrdinate is not an integer. skip " + line, file=sys.stderr) + if outfile: + print(line, file=UNMAP) + continue + try: + int(fields[2]) + except: + print("End corrdinate is not an integer. skip " + line, file=sys.stderr) + if outfile: + print(line, file=UNMAP) + continue + if int(fields[1]) > int(fields[2]): + print("\"Start\" is larger than \"End\" corrdinate is not an integer. skip " + line, file=sys.stderr) + if outfile: + print(line, file=UNMAP) + continue + + # deal with bed less than 12 columns + if len(fields)<12: + + # try to reset strand + try: + for f in fields: + if f in ['+','-']: + strand = f + except: + pass + + a = map_coordinates(mapping, fields[0], int(fields[1]),int(fields[2]),strand) + # example of a: [('chr1', 246974830, 246974833,'+'), ('chr1', 248908207, 248908210,'+')] + + try: + if len(a) % 2 != 0: + if outfile is None: + print(line + '\tFail') + else: + print(line, file=UNMAP) + continue + if len(a) == 2: + + #reset fields + fields[0] = a[1][0] + fields[1] = a[1][1] + fields[2] = a[1][2] + for i in range(0,len(fields)): #update the strand information + if fields[i] in ['+','-']: + fields[i] = a[1][3] + + if outfile is None: + print(line + '\t->\t' + '\t'.join([str(i) for i in fields])) + else: + print('\t'.join([str(i) for i in fields]), file=FILE_OUT) + if len(a) >2 : + count=0 + for j in range(1,len(a),2): + count += 1 + fields[0] = a[j][0] + fields[1] = a[j][1] + fields[2] = a[j][2] + for i in range(0,len(fields)): #update the strand information + if fields[i] in ['+','-']: + fields[i] = a[j][3] + + if outfile is None: + print(line + '\t'+ '(split.' + str(count) + ':' + ':'.join([str(i) for i in a[j-1]]) + ')\t' + '\t'.join([str(i) for i in fields])) + else: + print('\t'.join([str(i) for i in fields]), file=FILE_OUT) + except: + if outfile is None: + print(line + '\tFail') + else: + print(line, file=UNMAP) + continue + + # deal with bed12 and bed12+8 (genePred format) + if len(fields)==12 or len(fields)==20: + strand = fields[5] + if strand not in ['+','-']: + raise Exception("Unknown strand: %s. Can only be '+' or '-'." % strand) + fail_flag=False + exons_old_pos = annoGene.getExonFromLine(line) #[[chr,st,end],[chr,st,end],...] + #print exons_old_pos + exons_new_pos = [] + for e_chr, e_start, e_end in exons_old_pos: + a = map_coordinates(mapping, e_chr, e_start, e_end, strand) + if a is None: + fail_flag =True + break + + if len(a) == 2: + exons_new_pos.append(a[1]) + # a has two elements, first is query, 2nd is target. 
# [('chr1', 246974830, 246974833,'+'), ('chr1', 248908207, 248908210,'+')] + else: + fail_flag =True + break + + if not fail_flag: + # check if all exons were mapped to the same chromosome the same strand + chr_id = set() + exon_strand = set() + + for e_chr, e_start, e_end, e_strand in exons_new_pos: + chr_id.add(e_chr) + exon_strand.add(e_strand) + if len(chr_id) != 1 or len(exon_strand) != 1: + fail_flag = True + + if not fail_flag: + # build new bed + cds_start_offset = int(fields[6]) - int(fields[1]) + cds_end_offset = int(fields[2]) - int(fields[7]) + new_chrom = exons_new_pos[0][0] + new_chrom_st = exons_new_pos[0][1] + new_chrom_end = exons_new_pos[-1][2] + new_name = fields[3] + new_score = fields[4] + new_strand = exons_new_pos[0][3] + new_thickStart = new_chrom_st + cds_start_offset + new_thickEnd = new_chrom_end - cds_end_offset + new_ittemRgb = fields[8] + new_blockCount = len(exons_new_pos) + new_blockSizes = ','.join([str(o - n) for m,n,o,p in exons_new_pos]) + new_blockStarts = ','.join([str(n - new_chrom_st) for m,n,o,p in exons_new_pos]) + + new_bedline = '\t'.join(str(i) for i in (new_chrom,new_chrom_st,new_chrom_end,new_name,new_score,new_strand,new_thickStart,new_thickEnd,new_ittemRgb,new_blockCount,new_blockSizes,new_blockStarts)) + if check_bed12(new_bedline) is False: + fail_flag = True + else: + if outfile is None: + print(line + '\t->\t' + new_bedline) + else: + print(new_bedline, file=FILE_OUT) + + if fail_flag: + if outfile is None: + print(line + '\tFail') + else: + print(line, file=UNMAP) + +def crossmap_gff_file(mapping, ingff,outfile=None): + ''' + Convert genome coordinates (in GFF/GTF format) between assemblies. + GFF (General Feature Format) lines have nine required fields that must be Tab-separated: + + 1. seqname - The name of the sequence. Must be a chromosome or scaffold. + 2. source - The program that generated this feature. + 3. feature - The name of this type of feature. Some examples of standard feature types + are "CDS", "start_codon", "stop_codon", and "exon". + 4. start - The starting position of the feature in the sequence. The first base is numbered 1. + 5. end - The ending position of the feature (inclusive). + 6. score - A score between 0 and 1000. If the track line useScore attribute is set to 1 + for this annotation data set, the score value will determine the level of gray in + which this feature is displayed (higher numbers = darker gray). If there is no score + value, enter ".". + 7. strand - Valid entries include '+', '-', or '.' (for don't know/don't care). + 8. frame - If the feature is a coding exon, frame should be a number between 0-2 that + represents the reading frame of the first base. If the feature is not a coding exon, + the value should be '.'. + 9. group - All lines with the same group are linked together into a single item. + + GFF format: http://genome.ucsc.edu/FAQ/FAQformat.html#format3 + + GTF (Gene Transfer Format) is a refinement to GFF that tightens the specification. The + first eight GTF fields are the same as GFF. The group field has been expanded into a + list of attributes. Each attribute consists of a type/value pair. Attributes must end + in a semi-colon, and be separated from any following attribute by exactly one space. + + GTF format: http://genome.ucsc.edu/FAQ/FAQformat.html#format4 + + We do NOT check if features (exon, CDS, etc) belonging to the same gene originally were + converted into the same chromosome/strand. 
+ ''' + + if outfile is not None: + FILE_OUT = open(outfile,'w') + UNMAP = open(outfile + '.unmap', 'w') + + for line in ireader.reader(ingff): + if line.startswith('#'):continue + if line.startswith('track'):continue + if line.startswith('browser'):continue + if line.startswith('visibility'):continue + if not line.strip():continue + + line=line.strip() + fields=line.split('\t') + # check GFF file + #if len(fields) != 9: + # print >>sys.stderr, 'GFF file must has 9 columns. Skip ' + line + # continue + try: + start = int(fields[3]) - 1 #0-based + end = int(fields[4])/1 + feature_size = end - start + except: + print('Cannot recognize \"start\" and \"end\" coordinates. Skip ' + line, file=sys.stderr) + if outfile: + print(line, file=UNMAP) + continue + if fields[6] not in ['+','-','.']: + print('Cannot recognize \"strand\". Skip ' + line, file=sys.stderr) + if outfile: + print(line, file=UNMAP) + continue + + strand = '-' if fields[6] == '-' else '+' + + chrom = fields[0] + a = map_coordinates(mapping, chrom,start,end,strand) + if a is None: + if outfile is None: + print(line + '\tfail (no match to target assembly)') + else: + print(line, file=UNMAP) + continue + if len(a) !=2: + if outfile is None: + print(line + '\tfail (multpile match to target assembly)') + else: + print(line, file=UNMAP) + else: + if (int(a[1][2]) - int(a[1][1])) != feature_size: # check if it is exact match + if outfile is None: + print(line + '\tfail (not exact match)') + else: + print(line, file=UNMAP) + fields[0] = a[1][0] # chrom + fields[3] = a[1][1] + 1 # start, 1-based + fields[4] = a[1][2] + fields[6] = a[1][3] + + if outfile is None: + print(line + '\t->\t' + '\t'.join([str(i) for i in fields])) + else: + print('\t'.join([str(i) for i in fields]), file=FILE_OUT) + +def crossmap_bam_file(mapping, chainfile, infile, outfile_prefix, chrom_size, IS_size=200, IS_std=30, fold=3, addtag = True): + ''' + Convert genome coordinates (in BAM/SAM format) between assemblies. + BAM/SAM format: http://samtools.sourceforge.net/ + chrom_size is target chromosome size + + if addtag is set to True, will add tags to each alignmnet: + Q = QC (QC failed) + N = unmapped (originally unmapped or originally mapped but failed to liftover to new assembly) + M = multiple mapped (alignment can be liftover to multiple places) + U = unique mapped (alignment can be liftover to only 1 place) + + tags for pair-end sequencing include: + + QF: QC failed + + NN: both read1 and read2 unmapped + NU: read1 unmapped, read2 unique mapped + NM: read1 unmapped, multiple mapped + + UN: read1 uniquely mapped, read2 unmap + UU: both read1 and read2 uniquely mapped + UM: read1 uniquely mapped, read2 multiple mapped + + MN: read1 multiple mapped, read2 unmapped + MU: read1 multiple mapped, read2 unique mapped + MM: both read1 and read2 multiple mapped + + tags for single-end sequencing include: + + QF: QC failed + SN: unmaped + SM: multiple mapped + SU: uniquely mapped + + ''' + + # determine the input file format (BAM or SAM) + try: + samfile = pysam.Samfile(infile,'rb') + if len(samfile.header) == 0: + print("BAM file has no header section. Exit!", file=sys.stderr) + sys.exit(1) + bam_format = True + except: + samfile = pysam.Samfile(infile,'r') + if len(samfile.header) == 0: + print("SAM file has no header section. 
Exit!", file=sys.stderr) + sys.exit(1) + bam_format = False + + # add comments into BAM/SAM header section + if bam_format: + comments=['Liftover from original BAM file: ' + infile] + else: + comments=['Liftover from original SAM file: ' + infile] + comments.append('Liftover is based on the chain file: ' + chainfile) + + sam_ori_header = samfile.header + (new_header, name_to_id) = sam_header.bam_header_generator(orig_header = sam_ori_header, chrom_size = chrom_size, prog_name="CrossMap",prog_ver = __version__, format_ver=1.0,sort_type = 'coordinate',co=comments) + + # write to file + if outfile_prefix is not None: + if bam_format: + OUT_FILE = pysam.Samfile( outfile_prefix + '.bam', "wb", header = new_header ) + #OUT_FILE_UNMAP = pysam.Samfile( outfile_prefix + '.unmap.bam', "wb", template=samfile ) + printlog (["Liftover BAM file:", infile, '==>', outfile_prefix + '.bam']) + else: + OUT_FILE = pysam.Samfile( outfile_prefix + '.sam', "wh", header = new_header ) + #OUT_FILE_UNMAP = pysam.Samfile( outfile_prefix + '.unmap.sam', "wh", template=samfile ) + printlog (["Liftover SAM file:", infile, '==>', outfile_prefix + '.sam']) + # write to screen + else: + if bam_format: + OUT_FILE = pysam.Samfile( '-', "wb", header = new_header ) + #OUT_FILE_UNMAP = pysam.Samfile( infile.replace('.bam','') + '.unmap.bam', "wb", template=samfile ) + printlog (["Liftover BAM file:", infile]) + else: + OUT_FILE = pysam.Samfile( '-', "w", header = new_header ) + #OUT_FILE_UNMAP = pysam.Samfile( infile.replace('.sam','') + '.unmap.sam', "wh", template=samfile ) + printlog (["Liftover SAM file:", infile]) + + QF = 0 + NN = 0 + NU = 0 + NM = 0 + UN = 0 + UU = 0 + UM = 0 + MN = 0 + MU = 0 + MM = 0 + SN = 0 + SM = 0 + SU = 0 + total_item = 0 + try: + while(1): + total_item += 1 + new_alignment = pysam.AlignedRead() # create AlignedRead object + old_alignment = next(samfile) + + # qcfailed reads will be written to OUT_FILE_UNMAP for both SE and PE reads + if old_alignment.is_qcfail: + QF += 1 + if addtag: old_alignment.set_tag(tag="QF", value=0) + OUT_FILE.write(old_alignment) + continue + # new_alignment.qqual = old_alignment.qqual # quality string of read, exclude soft clipped part + # new_alignment.qlen = old_alignment.qlen # length of the "aligned part of read" + # new_alignment. aligned_pairs = old_alignment. aligned_pairs #read coordinates <=> ref coordinates + # new_alignment_aend = old_alignment.aend # reference End position of the aligned part (of read) + + #new_alignment.qname = old_alignment.qname # 1st column. read name. + #new_alignment.flag = old_alignment.flag # 2nd column. subject to change. flag value + #new_alignment.tid = old_alignment.tid # 3rd column. samfile.getrname(tid) == chrom name + #new_alignment.pos = old_alignment.pos # 4th column. reference Start position of the aligned part (of read) [0-based] + #new_alignment.mapq = old_alignment.mapq # 5th column. mapping quality + #new_alignment.cigar= old_alignment.cigar # 6th column. subject to change. + #new_alignment.rnext = old_alignment.rnext # 7th column. tid of the reference (mate read mapped to) + #new_alignment.pnext = old_alignment.pnext # 8th column. position of the reference (0 based, mate read mapped to) + #new_alignment.tlen = old_alignment.tlen # 9th column. insert size + #new_alignment.seq = old_alignment.seq # 10th column. read sequence. all bases. + #new_alignment.qual = old_alignment.qual # 11th column. read sequence quality. all bases. 
+ #new_alignment.tags = old_alignment.tags # 12 - columns + + new_alignment.qname = old_alignment.qname # 1st column. read name. + new_alignment.seq = old_alignment.seq # 10th column. read sequence. all bases. + new_alignment.qual = old_alignment.qual # 11th column. read sequence quality. all bases. + new_alignment.tags = old_alignment.tags # 12 - columns + + + # by default pysam will change RG:Z to RG:A, which can cause downstream failures with GATK and freebayes + # Thanks Wolfgang Resch identified this bug and provided solution. + + try: + rg, rgt = old_alignment.get_tag("RG", with_value_type=True) + except KeyError: + pass + else: + new_alignment.set_tag("RG", str(rg), rgt) + + if old_alignment.is_paired: + new_alignment.flag = 0x0001 #pair-end + if old_alignment.is_read1: + new_alignment.flag = new_alignment.flag | 0x40 + elif old_alignment.is_read2: + new_alignment.flag = new_alignment.flag | 0x80 + + #================================== + # R1 is originally unmapped + #================================== + if old_alignment.is_unmapped: + #This line will set the unmapped read flag for the first read in pair (in case it is unmapped). + #Thanks Amir Keivan Mohtashami reporting this bug. + new_alignment.flag = new_alignment.flag | 0x4 + + #------------------------------------ + # both R1 and R2 unmapped + #------------------------------------ + if old_alignment.mate_is_unmapped: + NN += 1 + if addtag: old_alignment.set_tag(tag="NN", value=0) + OUT_FILE.write(old_alignment) + continue + else: # originally, read-1 unmapped, read-2 is mapped + try: + read2_chr = samfile.getrname(old_alignment.rnext) + read2_strand = '-' if old_alignment.mate_is_reverse else '+' + read2_start = old_alignment.pnext + read2_end = read2_start + 1 + read2_maps = map_coordinates(mapping, read2_chr, read2_start, read2_end, read2_strand) + # [('chr1', 246974830, 246974833, '+' ), ('chr1', 248908207, 248908210, '+' )] + except: + read2_maps = None + + #------------------------------------ + # both R1 and R2 unmapped + #------------------------------------ + if read2_maps is None: + NN += 1 + if addtag: old_alignment.set_tag(tag="NN", value=0) + OUT_FILE.write(old_alignment) + continue + + #------------------------------------ + # R1 unmapped, R2 unique + #------------------------------------ + elif len(read2_maps) == 2: + # 2 + if read2_maps[1][3] == '-': new_alignment.flag = new_alignment.flag | 0x20 + # 3 + new_alignment.tid = name_to_id[read2_maps[1][0]] #recommend to set the RNAME of unmapped read to its mate's + # 4 + new_alignment.pos = read2_maps[1][1] #recommend to set the POS of unmapped read to its mate's + # 5 + new_alignment.mapq = old_alignment.mapq + # 6 + new_alignment.cigar = old_alignment.cigar + # 7 + new_alignment.rnext = name_to_id[read2_maps[1][0]] + # 8 + new_alignment.pnext = read2_maps[1][1] #start + # 9 + new_alignment.tlen = 0 + + NU += 1 + if addtag: new_alignment.set_tag(tag="NU", value=0) + OUT_FILE.write(new_alignment) + continue + + #------------------------------------ + # R1 unmapped, R2 multiple + #------------------------------------ + else: + # 2 + if read2_maps[1][3] == '-': new_alignment.flag = new_alignment.flag | 0x20 + # 2 + new_alignment.flag = new_alignment.flag | 0x100 + # 3 + new_alignment.tid = name_to_id[read2_maps[1][0]] + # 4 + new_alignment.pos = read2_maps[1][1] + # 5 + new_alignment.mapq = old_alignment.mapq + # 6 + new_alignment.cigar = old_alignment.cigar + # 7 + new_alignment.rnext = name_to_id[read2_maps[1][0]] + # 8 + new_alignment.pnext = read2_maps[1][1] 
#start + # 9 + new_alignment.tlen = 0 + + NM += 1 + if addtag: new_alignment.set_tag(tag="NM", value=0) + OUT_FILE.write(new_alignment) + continue + + #================================== + # R1 is originally mapped + #================================== + else: + try: + read1_chr = samfile.getrname(old_alignment.tid) + read1_strand = '-' if old_alignment.is_reverse else '+' + read1_start = old_alignment.pos + read1_end = old_alignment.aend + read1_maps = map_coordinates(mapping, read1_chr, read1_start, read1_end, read1_strand) + except: + read1_maps = None + + if not old_alignment.mate_is_unmapped: + try: + read2_chr = samfile.getrname(old_alignment.rnext) + read2_strand = '-' if old_alignment.mate_is_reverse else '+' + read2_start = old_alignment.pnext + read2_end = read2_start + 1 + read2_maps = map_coordinates(mapping, read2_chr, read2_start, read2_end, read2_strand) + # [('chr1', 246974830, 246974833, '+' ), ('chr1', 248908207, 248908210, '+' )] + except: + read2_maps = None + + + #------------------------------------ + # R1 unmapped (failed to liftover) + #------------------------------------ + if read1_maps is None: + # 2 update flag (0x4: segment unmapped) + new_alignment.flag = new_alignment.flag | 0x4 + # 3 + new_alignment.tid = -1 + # 4 + new_alignment.pos = 0 + # 5 + new_alignment.mapq = 255 + # 6 + new_alignment.cigar = old_alignment.cigar + # 7 + new_alignment.rnext = -1 + # 8 + new_alignment.pnext = 0 + # 9 + new_alignment.tlen = 0 + + + # (1) read2 is unmapped before conversion + if old_alignment.mate_is_unmapped: + NN += 1 + if addtag: new_alignment.set_tag(tag="NN", value=0) + OUT_FILE.write(new_alignment) + continue + + # (2) read2 is unmapped after conversion + elif read2_maps is None: + # 2 + new_alignment.flag = new_alignment.flag | 0x8 + + NN += 1 + if addtag: new_alignment.set_tag(tag="NN", value=0) + OUT_FILE.write(new_alignment) + continue + + # (3) read2 is unique mapped + elif len(read2_maps) == 2: + # 2 + if read2_maps[1][3] == '-': new_alignment.flag = new_alignment.flag | 0x20 + # 3 + new_alignment.tid = name_to_id[read2_maps[1][0]] #recommend to set the RNAME of unmapped read to its mate's + # 4 + new_alignment.pos = read2_maps[1][1] #recommend to set the POS of unmapped read to its mate's + # 5 + new_alignment.mapq = old_alignment.mapq + # 6 + new_alignment.cigar = old_alignment.cigar + # 7 + new_alignment.rnext = name_to_id[read2_maps[1][0]] + # 8 + new_alignment.pnext = read2_maps[1][1] #start + # 9 + new_alignment.tlen = 0 + + NU += 1 + if addtag: new_alignment.set_tag(tag="NU", value=0) + OUT_FILE.write(new_alignment) + continue + + # (4) read2 is multiple mapped + else: + # 2 + if read2_maps[1][3] == '-': new_alignment.flag = new_alignment.flag | 0x20 + # 2 + new_alignment.flag = new_alignment.flag | 0x100 + # 3 + new_alignment.tid = name_to_id[read2_maps[1][0]] + # 4 + new_alignment.pos = read2_maps[1][1] + # 5 + new_alignment.mapq = 255 # mapq not available + # 6 + new_alignment.cigar = old_alignment.cigar + # 7 + new_alignment.rnext = name_to_id[read2_maps[1][0]] + # 8 + new_alignment.pnext = read2_maps[1][1] #start + # 9 + new_alignment.tlen = 0 + + NM += 1 + if addtag:new_alignment.set_tag(tag="NM", value=0) + OUT_FILE.write(new_alignment) + continue + + #------------------------------------ + # R1 uniquely mapped + #------------------------------------ + elif len(read1_maps) == 2: + + if read1_maps[1][3] == '-': + new_alignment.flag = new_alignment.flag | 0x10 + + if read1_maps[0][3] != read1_maps[1][3]: # opposite strand + # 6 + 
new_alignment.cigar = old_alignment.cigar[::-1] #reverse cigar tuple + # 10 + new_alignment.seq = revcomp_DNA(old_alignment.seq) #reverse complement read sequence + # 11 + new_alignment.qual = old_alignment.qual[::-1] #reverse quality string + elif read1_maps[0][3] == read1_maps[1][3]: # same strand + # 6 + new_alignment.cigar = old_alignment.cigar + + # 3 + new_alignment.tid = name_to_id[read1_maps[1][0]] #chrom + # 4 + new_alignment.pos = read1_maps[1][1] #start + # 5 + new_alignment.mapq = old_alignment.mapq + + # (1) R2 unmapped before or after conversion + if (old_alignment.mate_is_unmapped) or (read2_maps is None): + # 2 + new_alignment.flag = new_alignment.flag | 0x8 + # 7 + new_alignment.rnext = name_to_id[read1_maps[1][0]] + # 8 + new_alignment.pnext = read1_maps[1][1] + # 9 + new_alignment.tlen = 0 + + UN += 1 + if addtag: new_alignment.set_tag(tag="UN", value=0) + OUT_FILE.write(new_alignment) + continue + + # (2) R2 is unique mapped + elif len(read2_maps)==2: + # 2 + if read2_maps[1][3] == '-': new_alignment.flag = new_alignment.flag | 0x20 + # 7 + new_alignment.rnext = name_to_id[read2_maps[1][0]] #chrom + # 8 + new_alignment.pnext = read2_maps[1][1] + # 9 + new_alignment.tlen = abs(new_alignment.pos - new_alignment.pnext) -1 - old_alignment.alen + # 2 + if (read2_maps[1][3] != read1_maps[1][3]) and (new_alignment.tlen <= IS_size + fold * IS_std) and (new_alignment.tlen >= IS_size - fold * IS_std): + new_alignment.flag = new_alignment.flag | 0x2 + + UU += 1 + if addtag: new_alignment.set_tag(tag="UU", value=0) + OUT_FILE.write(new_alignment) + continue + + # (3) R2 is multiple mapped + else: + # 2 (strand) + if read2_maps[1][3] == '-': new_alignment.flag = new_alignment.flag | 0x20 + # 2 (secondary alignment) + new_alignment.flag = new_alignment.flag | 0x100 + # 7 + new_alignment.rnext = name_to_id[read2_maps[1][0]] #chrom + # 8 + new_alignment.pnext = read2_maps[1][1] #start + # 9 + new_alignment.tlen = abs(new_alignment.pos - new_alignment.pnext) -1 - old_alignment.alen + + UM += 1 + if addtag: new_alignment.set_tag(tag="UM", value=0) + OUT_FILE.write(new_alignment) + continue + + #------------------------------------ + # R1 multiple mapped + #----------------------------------- + elif len(read1_maps) > 2 and len(read1_maps) % 2 ==0: + # 2 + new_alignment.flag = new_alignment.flag | 0x100 + # 2 + if read1_maps[1][3] == '-': + new_alignment.flag = new_alignment.flag | 0x10 + + if read1_maps[0][3] != read1_maps[1][3]: + # 6 + new_alignment.cigar = old_alignment.cigar[::-1] #reverse cigar tuple + # 10 + new_alignment.seq = revcomp_DNA(old_alignment.seq) #reverse complement read sequence + # 11 + new_alignment.qual = old_alignment.qual[::-1] #reverse quality string + elif read1_maps[0][3] == read1_maps[1][3]: + new_alignment.cigar = old_alignment.cigar + # 3 + new_alignment.tid = name_to_id[read1_maps[1][0]] #chrom + # 4 + new_alignment.pos = read1_maps[1][1] #start + # 5 + new_alignment.mapq = 255 + + + # (1) R2 is unmapped + if (old_alignment.mate_is_unmapped) or (read2_maps is None): + # 2 + new_alignment.flag = new_alignment.flag | 0x8 + # 7 + new_alignment.rnext = name_to_id[read1_maps[1][0]] + # 8 + new_alignment.pnext = read1_maps[1][1] + # 9 + new_alignment.tlen = 0 + + MN += 1 + if addtag: new_alignment.set_tag(tag="MN", value=0) + OUT_FILE.write(new_alignment) + continue + + # (2) read2 is unique mapped + elif len(read2_maps)==2: + # 2 + if read2_maps[1][3] == '-': new_alignment.flag = new_alignment.flag | 0x20 + # 7 + new_alignment.rnext = name_to_id[read2_maps[1][0]] 
#chrom + # 8 + new_alignment.pnext = read2_maps[1][1] + # 9 + new_alignment.tlen = abs(new_alignment.pos - new_alignment.pnext) -1 - old_alignment.alen + + MU += 1 + if addtag: new_alignment.set_tag(tag="MU", value=0) + OUT_FILE.write(new_alignment) + continue + + # (3) R2 is multiple mapped + else: + # 2 (strand) + if read2_maps[1][3] == '-': new_alignment.flag = new_alignment.flag | 0x20 + # 2 (secondary alignment) + new_alignment.flag = new_alignment.flag | 0x100 + # 7 + new_alignment.rnext = name_to_id[read2_maps[1][0]] #chrom + # 8 + new_alignment.pnext = read2_maps[1][1] #start + # 9 + new_alignment.tlen = abs(new_alignment.pos - new_alignment.pnext) -1 - old_alignment.alen + + MM += 1 + if addtag: new_alignment.set_tag(tag="MM", value=0) + OUT_FILE.write(new_alignment) + continue + else: + #old_alignment.tid = name_to_id[samfile.getrname(old_alignment.tid)] + OUT_FILE_UNMAP.write(old_alignment) + failed += 1 + + # Singel end sequencing + else: + # (1) originally unmapped + if old_alignment.is_unmapped: + SN += 1 + if addtag: old_alignment.set_tag(tag="SN",value=0) + OUT_FILE.write(old_alignment) + continue + else: + new_alignment.flag = 0x0 + read_chr = samfile.getrname(old_alignment.tid) + read_strand = '-' if old_alignment.is_reverse else '+' + read_start = old_alignment.pos + read_end = old_alignment.aend + read_maps = map_coordinates(mapping, read_chr, read_start, read_end, read_strand) + + # (2) unmapped afte liftover + if read_maps is None: + # 1 + new_alignment.qname = old_alignment.qname + # 2 + new_alignment.flag = new_alignment.flag | 0x4 + # 3 + new_alignment.tid = -1 + # 4 + new_alignment.pos = 0 + # 5 + new_alignment.mapq = 255 + # 6 + new_alignment.cigar= old_alignment.cigar + # 7 + new_alignment.rnext = -1 + # 8 + new_alignment.pnext = 0 + # 9 + new_alignment.tlen = 0 + # 10 + new_alignment.seq = old_alignment.seq + # 11 + new_alignment.qual = old_alignment.qual + # 12 + new_alignment.tags = old_alignment.tags + + SN += 1 + if addtag: new_alignment.set_tag(tag="SN",value=0) + OUT_FILE.write(new_alignment) + continue + + # (3) unique mapped + if len(read_maps)==2: + # 1 + new_alignment.qname = old_alignment.qname + # 2 + if read_maps[1][3] == '-': + new_alignment.flag = new_alignment.flag | 0x10 + if read_maps[0][3] != read_maps[1][3]: + # 6 + new_alignment.cigar = old_alignment.cigar[::-1] #reverse cigar tuple + # 10 + new_alignment.seq = revcomp_DNA(old_alignment.seq) #reverse complement read sequence + # 11 + new_alignment.qual = old_alignment.qual[::-1] #reverse quality string + else: + # 6 + new_alignment.cigar = old_alignment.cigar + # 10 + new_alignment.seq = old_alignment.seq + # 11 + new_alignment.qual = old_alignment.qual + + # 3 + new_alignment.tid = name_to_id[read_maps[1][0]] + # 4 + new_alignment.pos = read_maps[1][1] + # 5 + new_alignment.mapq = old_alignment.mapq + # 7 + new_alignment.rnext = -1 + # 8 + new_alignment.pnext = 0 + # 9 + new_alignment.tlen = 0 + + SU += 1 + if addtag: new_alignment.set_tag(tag="SU",value=0) + OUT_FILE.write(new_alignment) + continue + + # (4) multiple mapped + if len(read_maps) > 2 and len(read_maps) % 2 ==0: + + # 1 + new_alignment.qname = old_alignment.qname + new_alignment.flag = new_alignment.flag | 0x100 + # 2 + if read_maps[1][3] == '-': + new_alignment.flag = new_alignment.flag | 0x10 + if read_maps[0][3] != read_maps[1][3]: + # 6 + new_alignment.cigar = old_alignment.cigar[::-1] #reverse cigar tuple + # 10 + new_alignment.seq = revcomp_DNA(old_alignment.seq) #reverse complement read sequence + # 11 + 
new_alignment.qual = old_alignment.qual[::-1] #reverse quality string + else: + # 6 + new_alignment.cigar = old_alignment.cigar + # 10 + new_alignment.seq = old_alignment.seq + # 11 + new_alignment.qual = old_alignment.qual + + # 3 + new_alignment.tid = name_to_id[read_maps[1][0]] + # 4 + new_alignment.pos = read_maps[1][1] + # 5 + new_alignment.mapq = old_alignment.mapq + # 7 + new_alignment.rnext = -1 + # 8 + new_alignment.pnext = 0 + # 9 + new_alignment.tlen = 0 + + SM += 1 + if addtag: new_alignment.set_tag(tag="SM",value=0) + OUT_FILE.write(new_alignment) + continue + except StopIteration: + printlog(["Done!"]) + OUT_FILE.close() + + if outfile_prefix is not None: + if bam_format: + try: + printlog (['Sort "%s" and save as "%s"' % (outfile_prefix + '.bam', outfile_prefix + '.sorted.bam')]) + pysam.sort("-o", outfile_prefix + '.sorted.bam', outfile_prefix + '.bam') + except: + printlog(["Warning: ","output BAM file was NOT sorted"]) + try: + printlog (['Index "%s" ...' % (outfile_prefix + '.sorted.bam')]) + pysam.index(outfile_prefix + '.sorted.bam',outfile_prefix + '.sorted.bam.bai') + except: + printlog(["Warning: ","output BAM file was NOT indexed."]) + + print("Total alignments:" + str(total_item-1)) + print("\tQC failed: " + str(QF)) + if max(NN,NU, NM, UN, UU, UM, MN, MU, MM) > 0: + + print("\tR1 unique, R2 unique (UU): " + str(UU)) + print("\tR1 unique, R2 unmapp (UN): " + str(UN)) + print("\tR1 unique, R2 multiple (UM): " + str(UM)) + + print("\tR1 multiple, R2 multiple (MM): " + str(MM)) + print("\tR1 multiple, R2 unique (MU): " + str(MU)) + print("\tR1 multiple, R2 unmapped (MN): " + str(MN)) + + print("\tR1 unmap, R2 unmap (NN): " + str(NN)) + print("\tR1 unmap, R2 unique (NU): " + str(NU)) + print("\tR1 unmap, R2 multiple (NM): " + str(NM)) + if max(SN,SU,SM) > 0: + print("\tUniquley mapped (SU): " + str(SU)) + print("\tMultiple mapped (SM): " + str(SM)) + print("\tUnmapped (SN): " + str(SN)) + + +def crossmap_wig_file(mapping, in_file, out_prefix, taget_chrom_size, in_format, binSize=100000): + ''' + Convert genome coordinates (in wiggle/bigwig format) between assemblies. + wiggle format: http://genome.ucsc.edu/goldenPath/help/wiggle.html + bigwig format: http://genome.ucsc.edu/goldenPath/help/bigWig.html + ''' + + OUT_FILE1 = open(out_prefix + '.bgr','w') # original bgr file + OUT_FILE2 = open(out_prefix + '.sorted.bgr','w') # sorted bgr file + OUT_FILE3 = pyBigWig.open(out_prefix + '.bw', "w") # bigwig file + + if in_format.upper() == "WIGGLE": + printlog (["Liftover wiggle file:", in_file, '==>', out_prefix + '.bgr']) + + for chr, start, end, strand, score in wiggleReader (in_file): + #print chr,start,end,score + # map_coordinates(mapping, q_chr, q_start, q_end, q_strand = '+', print_match=False): + maps = map_coordinates(mapping, chr, start, end, '+') + if maps is None: + continue + if len(maps) == 2: + print('\t'.join([str(i) for i in [maps[1][0],maps[1][1],maps[1][2], score]]), file=OUT_FILE1) + else: + continue + maps[:]=[] + OUT_FILE1.close() + + printlog (["Merging overlapped entries in bedGraph file ..."]) + for (chr, start, end, score) in bgrMerge.merge(out_prefix + '.bgr'): + print('\t'.join([str(i) for i in (chr, start, end, score )]), file=OUT_FILE2) + OUT_FILE2.close() + + os.remove(out_prefix + '.bgr') #remove .bgr, keep .sorted.bgr + + # add header to bigwig file + printlog (["Writing header to \"%s\" ..." 
% (out_prefix + '.bw') ]) + target_chroms_sorted = [(k,taget_chrom_size[k]) for k in sorted(taget_chrom_size.keys())] + OUT_FILE3.addHeader(target_chroms_sorted) + + printlog (["Writing entries to \"%s\" ..." % (out_prefix + '.bw') ]) + # add entries to bigwig file + for line in ireader.reader(out_prefix + '.sorted.bgr'): + r_chr,r_st,r_end,r_value = line.split() + OUT_FILE3.addEntries([r_chr], [int(r_st)], ends=[int(r_end)], values=[float(r_value)]) + + OUT_FILE3.close() + + elif in_format.upper() == "BIGWIG": + + printlog (["Liftover bigwig file:", in_file, '==>', out_prefix + '.bgr']) + for chr, start, end, score in bigwigReader(in_file, bin_size = binSize ): + maps = map_coordinates(mapping, chr, start, end, '+') + try: + if maps is None: continue + if len(maps) == 2: + print('\t'.join([str(i) for i in [maps[1][0],maps[1][1],maps[1][2], score]]), file=OUT_FILE1) + else: + continue + except: + continue + maps[:]=[] + OUT_FILE1.close() + + printlog (["Merging overlapped entries in bedGraph file ..."]) + for (chr, start, end, score) in bgrMerge.merge(out_prefix + '.bgr'): + print('\t'.join([str(i) for i in (chr, start, end, score )]), file=OUT_FILE2) + OUT_FILE2.close() + os.remove(out_prefix + '.bgr') #remove .bgr, keep .sorted.bgr + + # add header to bigwig file + printlog (["Writing header to \"%s\" ..." % (out_prefix + '.bw') ]) + target_chroms_sorted = [(k,taget_chrom_size[k]) for k in sorted(taget_chrom_size.keys())] + OUT_FILE3.addHeader(target_chroms_sorted) + + # add entries to bigwig file + printlog (["Writing entries to \"%s\" ..." % (out_prefix + '.bw') ]) + for line in ireader.reader(out_prefix + '.sorted.bgr'): + r_chr,r_st,r_end,r_value = line.split() + OUT_FILE3.addEntries([r_chr], [int(r_st)], ends=[int(r_end)], values=[float(r_value)]) + + OUT_FILE3.close() + else: + raise Exception("Unknown foramt. Must be 'wiggle' or 'bigwig'") + + +def general_help(): + desc="""CrossMap is a program for convenient conversion of genome coordinates and genome \ +annotation files between assemblies (eg. lift from human hg18 to hg19 or vice versa). \ +It supports file in BAM, SAM, BED, Wiggle, BigWig, GFF, GTF and VCF format.""" + + print("Program: %s (v%s)" % ("CrossMap", __version__), file=sys.stderr) + print("\nDescription: \n%s" % '\n'.join(' '+i for i in wrap(desc,width=80)), file=sys.stderr) + print("\nUsage: CrossMap.py [options]\n", file=sys.stderr) + for k in sorted(commands): + print(' ' + k + '\t' + commands[k], file=sys.stderr) + print(file=sys.stderr) + +def bed_help(): + msg =[ + ('Usage:', "CrossMap.py bed input_chain_file input_bed_file [output_file]"), + ('Description:', "\"input_chain_file\" and \"input_bed_file\" can be regular or compressed (*.gz, *.Z, *.z, *.bz, *.bz2, *.bzip2) file, local file or URL (http://, https://, ftp://) pointing to remote file. BED format file must have at least 3 columns (chrom, start, end) and no more than 12 columns. If no \"output_file\" is specified, output will be directed to the screen (console). 
BED format: http://genome.ucsc.edu/FAQ/FAQformat.html#format1"), + ('Example:', "CrossMapy.py bed hg18ToHg19.over.chain.gz test.hg18.bed test.hg19.bed # write output to \"test.hg19.bed\""), + ('Example:', "CrossMapy.py bed hg18ToHg19.over.chain.gz test.hg18.bed # write output to screen"), + ] + for i,j in msg: + print('\n' + i + '\n' + '\n'.join([' ' + k for k in wrap(j,width=80)]), file=sys.stderr) + +def gff_help(): + msg =[ + ('Usage:', "CrossMap.py gff input_chain_file input_gff_file output_file"), + ('Description:', "\"input_chain_file\" can be regular or compressed (*.gz, *.Z, *.z, *.bz, *.bz2, *.bzip2) file, local file or URL (http://, https://, ftp://) pointing to remote file. Input file must be in GFF or GTF format. GFF format: http://genome.ucsc.edu/FAQ/FAQformat.html#format3 GTF format: http://genome.ucsc.edu/FAQ/FAQformat.html#format4"), + ('Example:', "CrossMap.py gff hg19ToHg18.over.chain.gz test.hg19.gtf test.hg18.gtf # write output to \"test.hg18.gff\""), + ('Example:', "CrossMap.py gff hg19ToHg18.over.chain.gz test.hg19.gtf # write output to screen"), + ] + for i,j in msg: + print('\n' + i + '\n' + '\n'.join([' ' + k for k in wrap(j,width=80)]), file=sys.stderr) + +def wig_help(): + msg =[ + ('Usage:', "CrossMap.py wig input_chain_file input_wig_file output_prefix"), + ('Description:', "\"input_chain_file\" can be regular or compressed (*.gz, *.Z, *.z, *.bz, *.bz2, *.bzip2) file, local file or URL (http://, https://, ftp://) pointing to remote file. Both \"variableStep\" and \"fixedStep\" wiggle lines are supported. Wiggle format: http://genome.ucsc.edu/goldenPath/help/wiggle.html"), + ('Example:', "CrossMapy.py wig hg18ToHg19.over.chain.gz test.hg18.wig test.hg19"), + ] + for i,j in msg: + print('\n' + i + '\n' + '\n'.join([' ' + k for k in wrap(j,width=80)]), file=sys.stderr) +def bigwig_help(): + msg =[ + ('Usage:', "CrossMap.py bigwig input_chain_file input__bigwig_file output_prefix"), + ('Description:', "\"input_chain_file\" can be regular or compressed (*.gz, *.Z, *.z, *.bz, *.bz2, *.bzip2) file, local file or URL (http://, https://, ftp://) pointing to remote file. Bigwig format: http://genome.ucsc.edu/goldenPath/help/bigWig.html"), + ('Example:', "CrossMapy.py bigwig hg18ToHg19.over.chain.gz test.hg18.bw test.hg19"), + ] + for i,j in msg: + print('\n' + i + '\n' + '\n'.join([' ' + k for k in wrap(j,width=80)]), file=sys.stderr) + +def bam_help(): + usage="CrossMap.py bam -a input_chain_file input_bam_file output_file [options] " + import optparse + parser.print_help() + +def vcf_help(): + msg =[ + ("usage:","CrossMap.py vcf input_chain_file input_VCF_file ref_genome_file output_file"), + ("Description:", "\"input_chain_file\" and \"input_VCF_file\" can be regular or compressed (*.gz, *.Z, *.z, *.bz, *.bz2, *.bzip2) file, local file or URL (http://, https://, ftp://) pointing to remote file. 
\"ref_genome_file\" is genome sequence file of 'target assembly' in FASTA format."), + ("Example:", " CrossMap.py vcf hg19ToHg18.over.chain.gz test.hg19.vcf hg18.fa test.hg18.vcf"), + ] + for i,j in msg: + print('\n' + i + '\n' + '\n'.join([' ' + k for k in wrap(j,width=80)]), file=sys.stderr) + + +if __name__=='__main__': + + # test read_chain_file() + #(mapTree,targetChromSizes, sourceChromSizes) = read_chain_file(sys.argv[1], print_table=True) + + #q_chr, q_start, q_end, q_strand = '+', print_match=False): + #map_coordinates(mapTree, 'chr1', 45953452, 45953456, '+', print_match=True) + + + #a= sam_header.bam_header_generator(chrom_size = targetChromSizes, prog_name="CrossMap",prog_ver = __version__, format_ver=1.0,sort_type = 'coordinate',co=['a','b','c']) + #crossmap_bed_file(mapTree, 'test.hg18.bed3',outfile='aa') + #crossmap_bed_file(mapTree, 'test.bed') + #crossmap_gff_file(mapTree,'test.hg18.gtf',outfile = 'aa') + + + #crossmap_bam_file(mapTree, targetChromSizes, sys.argv[1], 'eg.abnormal.sam','out') + #printlog (['Sort output bam file ...']) + #pysam.sort('-m','1000000000','out.bam','out.sorted') + #printlog (['Index bam file ...']) + #pysam.index('out.sorted.bam','out.sorted.bam.bai') + + #crossmap_wig_file(mapTree, 'GSM686950_reverse.hg18.bw', 'GSM686950_reverse.hg19', sourceChromSizes, targetChromSizes, in_format = 'bigwig') + + #parser = optparse.OptionParser() + #parser.add_option("-i","--input-sam",action="store",type="string",dest="input_sam_file", help="BAM/SAM format file.") + #parser.add_option("-o","--out-file",action="store",type="string",dest="output_sam_file", help="Prefix of output files. Foramt is determined from input: SAM <=> SAM, BAM <=> BAM") + #parser.add_option("-m","--insert-mean",action="store",type="float",dest="insert_size", default=200.0, help="Average insert size of pair-end sequencing (bp). Used to tell wheather a mapped pair is \"proper pair\" or not. [default=%default]") + #parser.add_option("-s","--insert-stdev",action="store",type="float",dest="insert_size_stdev", default=30.0, help="Stanadard deviation of insert size. [default=%default]" ) + #parser.add_option("-t","--times-of-stdev",action="store",type="float",dest="insert_size_fold", default=3.0, help="A mapped pair is considered as \"proper pair\" if both ends mapped to different strand and the distance between them is less then '-t' * stdev from the mean. [default=%default]") + #(options,args)=parser.parse_args() + + #print options + #print args + + + commands = { + 'bed':'convert genome cooridnate or annotation file in BED, bedGraph or other BED-like format.', + 'bam':'convert alignment file in BAM or SAM format.', + 'gff':'convert genome cooridnate or annotation file in GFF or GTF format.', + 'wig':'convert genome coordinate file in Wiggle, or bedGraph format.', + 'bigwig':'convert genome coordinate file in BigWig format.', + 'vcf':'convert genome coordinate file in VCF format.' 
+ } + kwds = list(commands.keys()) + + + if len(sys.argv) == 1: + general_help() + sys.exit(0) + elif len(sys.argv) >=2: + # deal with bed input + if sys.argv[1].lower() == 'bed': + if len(sys.argv) == 4: + chain_file = sys.argv[2] + in_file = sys.argv[3] + out_file = None + (mapTree,targetChromSizes, sourceChromSizes) = read_chain_file(chain_file, print_table = False) + crossmap_bed_file(mapTree, in_file, out_file) + + elif len(sys.argv) == 5: + chain_file = sys.argv[2] + in_file = sys.argv[3] + out_file = sys.argv[4] + (mapTree,targetChromSizes, sourceChromSizes) = read_chain_file(chain_file) + crossmap_bed_file(mapTree, in_file, out_file) + else: + bed_help() + sys.exit(0) + elif sys.argv[1].lower() == 'gff': + if len(sys.argv) == 4: + chain_file = sys.argv[2] + in_file = sys.argv[3] + (mapTree,targetChromSizes, sourceChromSizes) = read_chain_file(chain_file) + crossmap_gff_file(mapTree, in_file, None) + elif len(sys.argv) == 5: + chain_file = sys.argv[2] + in_file = sys.argv[3] + out_file = sys.argv[4] + (mapTree,targetChromSizes, sourceChromSizes) = read_chain_file(chain_file) + crossmap_gff_file(mapTree, in_file, out_file) + else: + gff_help() + sys.exit(0) + elif sys.argv[1].lower() == 'wig': + if len(sys.argv) == 5: + chain_file = sys.argv[2] + in_file = sys.argv[3] + out_file = sys.argv[4] + (mapTree,targetChromSizes, sourceChromSizes) = read_chain_file(chain_file) + crossmap_wig_file(mapTree, in_file, out_file, targetChromSizes, in_format = 'wiggle') + else: + wig_help() + sys.exit(0) + elif sys.argv[1].lower() == 'bigwig': + if len(sys.argv) == 5: + chain_file = sys.argv[2] + + in_file = sys.argv[3] + try: + bw = pyBigWig.open(in_file) + except: + print ("\nPlease check if \"%s\" is in bigWig format!\n" % in_file, file=sys.stderr) + sys.exit(0) + + out_file = sys.argv[4] + (mapTree,targetChromSizes, sourceChromSizes) = read_chain_file(chain_file) + crossmap_wig_file(mapTree, in_file, out_file, targetChromSizes, in_format = 'bigwig') + else: + bigwig_help() + sys.exit(0) + elif sys.argv[1].lower() == 'bam': + #def crossmap_bam_file(mapping, chainfile, infile, outfile_prefix, chrom_size, IS_size=200, IS_std=30, fold=3): + usage="CrossMap.py bam input_chain_file input_bam_file output_file [options]\nNote: If output_file == STDOUT or -, CrossMap will write BAM file to the screen" + parser = optparse.OptionParser(usage, add_help_option=False) + parser.add_option("-m", "--mean", action="store",type="float",dest="insert_size", default=200.0, help="Average insert size of pair-end sequencing (bp). [default=%default]") + parser.add_option("-s", "--stdev", action="store",type="float",dest="insert_size_stdev", default=30.0, help="Stanadard deviation of insert size. [default=%default]" ) + parser.add_option("-t", "--times", action="store",type="float",dest="insert_size_fold", default=3.0, help="A mapped pair is considered as \"proper pair\" if both ends mapped to different strand and the distance between them is less then '-t' * stdev from the mean. 
[default=%default]") + parser.add_option("-a","--append-tags",action="store_true",dest="add_tags",help="Add tag to each alignment.") + (options,args)=parser.parse_args() + + if len(args) >= 3: + print("Insert size = %f" % (options.insert_size), file=sys.stderr) + print("Insert size stdev = %f" % (options.insert_size_stdev), file=sys.stderr) + print("Number of stdev from the mean = %f" % (options.insert_size_fold), file=sys.stderr) + if options.add_tags: + print("Add tags to each alignment = %s" % ( options.add_tags), file=sys.stderr) + else: + print("Add tags to each alignment = %s" % ( False), file=sys.stderr) + chain_file = args[1] + in_file = args[2] + out_file = args[3] if len(args) >= 4 else None + (mapTree,targetChromSizes, sourceChromSizes) = read_chain_file(chain_file) + + if out_file in ["STDOUT","-"]: + out_file = None + if options.add_tags: + crossmap_bam_file(mapping = mapTree, chainfile = chain_file, infile = in_file, outfile_prefix = out_file, chrom_size = targetChromSizes, IS_size=options.insert_size, IS_std=options.insert_size_stdev, fold=options.insert_size_fold,addtag=True) + else: + crossmap_bam_file(mapping = mapTree, chainfile = chain_file, infile = in_file, outfile_prefix = out_file, chrom_size = targetChromSizes, IS_size=options.insert_size, IS_std=options.insert_size_stdev, fold=options.insert_size_fold,addtag=False) + else: + parser.print_help() + elif sys.argv[1].lower() == 'vcf': + if len(sys.argv) == 6: + chain_file = sys.argv[2] + in_file = sys.argv[3] + genome_file = sys.argv[4] + out_file = sys.argv[5] + (mapTree,targetChromSizes, sourceChromSizes) = read_chain_file(chain_file) + crossmap_vcf_file(mapping = mapTree, infile= in_file, outfile = out_file, liftoverfile = sys.argv[2], refgenome = genome_file) + else: + vcf_help() + sys.exit(0) + else: + general_help() + sys.exit(0) + + + + + diff --git a/bin/Modules/CrossMapper.py b/bin/Modules/CrossMapper.py new file mode 100644 index 0000000..74ae97e --- /dev/null +++ b/bin/Modules/CrossMapper.py @@ -0,0 +1,40 @@ +import urllib.request +import os +from Modules import CrossMap + + +class CrossMapper: + def __init__(self, org, wd, out, is_dir): + self.org = org + if is_dir: + self.infile = os.path.join( wd + "/temp/" + org + ".gtf") + else: + self.infile = os.path.join(wd+"/data/temp/"+org+".gtf") + self.outfile = os.path.join(out+"/" + org + "_mapped.gtf") + self.chainfile = self.get_chain_file(org, wd, is_dir) + (mapTree, targetChromSizes, sourceChromSizes) = CrossMap.read_chain_file(self.chainfile) + CrossMap.crossmap_gff_file(mapTree, self.infile, self.outfile) + + def get_chain_file(self, org, wd, isdir): + + # Defines the Chain files for different conversions. 
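+        # hg19 results are lifted over from hg38 via hg38ToHg19.over.chain.gz,
+        # mm9 results from mm10 via mm10ToMm9.over.chain.gz; both chain files
+        # are downloaded from the UCSC goldenPath liftOver directories.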
+
+        if org == "hg19":
+            if isdir:
+                file_link = os.path.join(wd+"/temp/hg38tohg19.over.chain.gz")
+            else:
+                file_link = os.path.join(wd + "/data/temp/hg38tohg19.over.chain.gz" )
+            urllib.request.urlretrieve("http://hgdownload.soe.ucsc.edu/goldenPath/hg38/"
+                                       "liftOver/hg38ToHg19.over.chain.gz",
+                                       file_link)
+            return file_link
+
+        elif org == "mm9":
+            if isdir:
+                file_link = os.path.join(wd+"/temp/mm10ToMm9.over.chain.gz")
+            else:
+                file_link = os.path.join(wd + "/data/temp/mm10ToMm9.over.chain.gz" )
+            urllib.request.urlretrieve("http://hgdownload.soe.ucsc.edu/goldenPath/mm10/"
+                                       "liftOver/mm10ToMm9.over.chain.gz",
+                                       file_link)
+            return file_link
diff --git a/bin/Modules/SaveResults.py b/bin/Modules/SaveResults.py
index e13aa6a..5425bd3 100644
--- a/bin/Modules/SaveResults.py
+++ b/bin/Modules/SaveResults.py
@@ -4,14 +4,19 @@ class ResultSaver:
 
-    def __init__(self, results, organism, tissue):
+    def __init__(self, results, organism, wd, mapped, is_data_dir, tissue, out):
         print("Save results to File !")
         self.path = ""
-        if tissue:
-            self.path = os.path.join("./"+organism+"_filtered.gtf")
+        if mapped:
+            if is_data_dir:
+                self.path = os.path.join(wd + "temp/" + organism + ".gtf")
+            else:
+                self.path = os.path.join( wd + "data/temp/" + organism + ".gtf" )
+        elif tissue:
+            self.path = os.path.join(out+"/"+organism+"_filtered.gtf")
         else:
-            self.path = os.path.join("./"+organism+".gtf")
+            self.path = os.path.join(out+"/"+organism+".gtf")
 
         with open(self.path, "w") as file:
             write_it = csv.writer(file, delimiter='\t')
diff --git a/bin/Modules/ucsc/bigBedToBed b/bin/Modules/ucsc/bigBedToBed
old mode 100644
new mode 100755
diff --git a/bin/RegGTFExtractor.py b/bin/RegGTFExtractor.py
index 6f0e11c..131b853 100755
--- a/bin/RegGTFExtractor.py
+++ b/bin/RegGTFExtractor.py
@@ -1,23 +1,37 @@
-#!/usr/bin/env python3
-
 import argparse
 from Modules.Ensembl.Ensembl import Ensembl
 from Modules.ucsc.ucsc import UcscGtf
 from Modules.Uniquifier import UniqueFilter
 from Modules.SaveResults import ResultSaver
+from Modules.CrossMapper import CrossMapper
 import os
 import json
 
 
 def check_for_local_folder(wd):
 
-    if not os.path.isdir(os.path.join(wd+"/EnsemblData")):
+    if not os.path.isdir(os.path.join(wd+"/data/EnsemblData")):
         os.mkdir(os.path.join(wd+"/data/EnsemblData"))
 
     if not os.path.isdir(os.path.join(wd+"/data/UCSCData")):
         os.mkdir(os.path.join(wd+"/data/UCSCData"))
 
+    if not os.path.isdir(os.path.join(wd+"/data/temp")):
+        os.mkdir(os.path.join(wd+"/data/temp"))
+
+
+def check_for_data_dir(data_dir):
+
+    if not os.path.isdir(os.path.join(data_dir+"/EnsemblData")):
+        os.mkdir(os.path.join(data_dir+"/EnsemblData"))
+
+    if not os.path.isdir(os.path.join(data_dir+"/UCSCData")):
+        os.mkdir(os.path.join(data_dir + "/UCSCData"))
+
+    if not os.path.isdir(os.path.join(data_dir+"/temp")):
+        os.mkdir(os.path.join(data_dir+"/temp"))
+
 
 def check_filter(tissue_cmd, org, wd):
     path_to_config = os.path.join(wd + "/config/celltypes_" + org + ".json")
@@ -36,9 +50,29 @@ def check_filter(tissue_cmd, org, wd):
     return False
 
 
-def main_script(org, wd, data_dir, tissuetype=None):
+def cross_map(data_dir, org):
+    pass
+
+
+def check_organism(org):
+    if org == "hg38":
+        return "homo_sapiens", False
+    if org == "hg19":
+        print("Older assembly Version detected: hg19 -> Crossmapping result from hg38")
+        return "homo_sapiens", True
+    elif org == "mm10":
+        return "mus_musculus", False
+    elif org == "mm9":
+        print("Older assembly Version detected: mm9 -> Crossmapping result from mm10")
+        return "mus_musculus", True
+
+
+def main_script(organism, wd,
data_dir, out, tissuetype=None): + (org, x_mappable) = check_organism(organism) if not data_dir: check_for_local_folder(wd) + else: + check_for_data_dir(data_dir) if check_filter(tissuetype, org, wd): tissues = tissuetype print("Filter detected !") @@ -50,19 +84,30 @@ def main_script(org, wd, data_dir, tissuetype=None): ense = Ensembl(org, wd, data_dir) print("Getting Unique Results") unique_filter = UniqueFilter(ense.get_gtf(), ucsc.get_gtf(), tissues) - ResultSaver(unique_filter.get_results(), org, tissues) + if data_dir: + ResultSaver(unique_filter.get_results(), organism, data_dir, x_mappable, True, tissues, out) + if x_mappable: + CrossMapper(organism, data_dir, out, True) + else: + ResultSaver(unique_filter.get_results(), organism, wd, x_mappable, False, tissues, out) + if x_mappable: + CrossMapper(organism, wd, out, False) if __name__ == '__main__': parser = argparse.ArgumentParser(description='GTF-Generator from UCSC Table Browser and Ensembl Regulatory Build' ) - parser.add_argument('organism', help='Source organism [ homo_sapiens or mus_musculus ]', action='store', nargs='?', type=str) + parser.add_argument('organism', help='Source organism [ hg19 | hg38 or mm9 | mm10 ]', action='store', nargs='?', type=str) parser.add_argument('--tissue', help='Tissue- or Celltype(s)', action='store', nargs='*', type=str) parser.add_argument('--wd', help='Working directory. default: "."', action='store', default=os.getcwd(), type=str) parser.add_argument('--dir', help='Data directory. default: "working_directory"', action='store', default="", type=str) + parser.add_argument('--out', help='Output directory: default: "."', action='store', default=".", type=str) args = vars(parser.parse_args()) if args["organism"]: - print("Working Dir: " + args["wd"]) - main_script(args["organism"], args["wd"], args["dir"], args["tissue"]) + if args["organism"] in ["hg19", "hg38", "mm9", "mm10"]: + print("Working Dir: " + args["wd"]) + main_script(args["organism"], args["wd"], args["dir"], args["out"], args["tissue"]) + else: + print("Invalid Organism: " + args["organism"] + " see -h for help") else: print("No Arguments found -> See python3 ./RegGTFExtractor.py -h for help.") diff --git a/masterenv.yml b/masterenv.yml index a977fe1..e0708db 100644 --- a/masterenv.yml +++ b/masterenv.yml @@ -24,4 +24,5 @@ dependencies: - biopython - pybedtools - matplotlib - - seaborn \ No newline at end of file + - seaborn + - crossmap \ No newline at end of file From a3c452de8b0f5f5fa7baf66de4042d0f291accee Mon Sep 17 00:00:00 2001 From: basti Date: Mon, 10 Dec 2018 16:29:49 +0100 Subject: [PATCH 100/158] Updated README. --- README.md | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 203d55b..62695a7 100644 --- a/README.md +++ b/README.md @@ -83,15 +83,16 @@ Optional arguments: --interation INT Number of iterations done by glam2. More Interations: better results, higher runtime. (Default: 10000) --tomtom_treshold float Threshold for similarity score. 
(Default: 0.01) - Motif clustering: + Motif clustering: --edge_weight INT Minimum weight of edges in motif-cluster-graph (Default: 5) - --motif_similarity_thresh FLOAT threshold for motif similarity score (Default: 0.00001) + --motif_similarity_thresh FLOAT threshold for motif similarity score (Default: 0.00001) - Creating GTF: - --tissues STRING Filter for tissue/category activity categorys as in JSON config (Default: None) + Creating GTF: + --tissues STRING Filter for tissue/category activity categorys as in JSON config (Default: None) --wd PATH current working directory for the script (default: ".") --dir PATH directory for saved Ensembl / UCSC file (default: "./data/") --out PATH directory for the output GTF (default: ".") + All arguments can be set in the configuration files. ``` From 55bd528f97fa8f81db04019eb89d8b11f93ff81d Mon Sep 17 00:00:00 2001 From: SebastianBeyvers Date: Mon, 10 Dec 2018 16:30:54 +0100 Subject: [PATCH 101/158] Update Readme --- README.md | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/README.md b/README.md index 62695a7..ee8d49a 100644 --- a/README.md +++ b/README.md @@ -82,16 +82,16 @@ Optional arguments: --motif_max_len INT Maximum length of Motif (Default: 20) --interation INT Number of iterations done by glam2. More Interations: better results, higher runtime. (Default: 10000) --tomtom_treshold float Threshold for similarity score. (Default: 0.01) - - Motif clustering: + + Motif clustering: --edge_weight INT Minimum weight of edges in motif-cluster-graph (Default: 5) - --motif_similarity_thresh FLOAT threshold for motif similarity score (Default: 0.00001) + --motif_similarity_thresh FLOAT threshold for motif similarity score (Default: 0.00001) - Creating GTF: - --tissues STRING Filter for tissue/category activity categorys as in JSON config (Default: None) - --wd PATH current working directory for the script (default: ".") - --dir PATH directory for saved Ensembl / UCSC file (default: "./data/") - --out PATH directory for the output GTF (default: ".") + Creating GTF: + --tissues STRING Filter for tissue/category activity categorys as in JSON config (Default: None) + --wd PATH current working directory for the script (default: ".") + --dir PATH directory for saved Ensembl / UCSC file (default: "./data/") + --out PATH directory for the output GTF (default: ".") All arguments can be set in the configuration files. ``` From 479ddad98da87b08aadda480986a4dc35b0865ea Mon Sep 17 00:00:00 2001 From: basti Date: Mon, 10 Dec 2018 17:50:34 +0100 Subject: [PATCH 102/158] Added more comments! --- bin/Modules/CrossMapper.py | 9 +++++ bin/Modules/Ensembl/ActivityTable.py | 6 ++-- bin/Modules/Ensembl/ActivityTableGenerator.py | 4 +++ bin/Modules/Ensembl/Ensembl.py | 5 +++ bin/Modules/Ensembl/FTPHandling/FTPEntry.py | 7 ++-- .../Ensembl/FTPHandling/URLRetrieve.py | 4 +++ bin/Modules/Ensembl/GTFGen.py | 14 +++++++- bin/Modules/Ensembl/checksums.py | 12 +++---- bin/Modules/SaveResults.py | 8 +++++ bin/Modules/Uniquifier.py | 12 +++++++ bin/Modules/ucsc/ucsc.py | 11 +++++++ bin/RegGTFExtractor.py | 33 +++++++++++++++++-- 12 files changed, 108 insertions(+), 17 deletions(-) diff --git a/bin/Modules/CrossMapper.py b/bin/Modules/CrossMapper.py index 74ae97e..d401c79 100644 --- a/bin/Modules/CrossMapper.py +++ b/bin/Modules/CrossMapper.py @@ -4,6 +4,14 @@ class CrossMapper: + + """ + + Class to download chain_files for chrossmapping hg38 or mm10 to older assembly versions. + Utilizes CrossMap.py. see wiki for more information. 
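+    Chain files are fetched on demand from the UCSC goldenPath liftOver directories.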
+ + """ + def __init__(self, org, wd, out, is_dir): self.org = org if is_dir: @@ -12,6 +20,7 @@ def __init__(self, org, wd, out, is_dir): self.infile = os.path.join(wd+"/data/temp/"+org+".gtf") self.outfile = os.path.join(out+"/" + org + "_mapped.gtf") self.chainfile = self.get_chain_file(org, wd, is_dir) + # Execute Crossmapper for gff/gtf files (mapTree, targetChromSizes, sourceChromSizes) = CrossMap.read_chain_file(self.chainfile) CrossMap.crossmap_gff_file(mapTree, self.infile, self.outfile) diff --git a/bin/Modules/Ensembl/ActivityTable.py b/bin/Modules/Ensembl/ActivityTable.py index 040d15b..288bf7d 100644 --- a/bin/Modules/Ensembl/ActivityTable.py +++ b/bin/Modules/Ensembl/ActivityTable.py @@ -6,8 +6,8 @@ class ActivityTable: """ Class for checking activity_table and generating them. - ActivityTable = Byte Representation of Activity Status - corresponding to the generator Schema default: + activityTable = byte Representation of activity status + corresponding to the generator schema default: 0, "activity=ACTIVE", 1, "activity=POISED", 2, "activity=REPRESSED", @@ -28,6 +28,7 @@ def __init__(self, organism, current_release, wd, data_dir): "activity=NA"]) def check_and_generate_activity_table(self): + # checks if file already exists and generates new one if missing for subfolder in self.folders: folder_link = os.path.join(self.link, subfolder) sf_link = os.path.join(folder_link, "table.bin") @@ -37,6 +38,7 @@ def check_and_generate_activity_table(self): print("All ActivityTables found, proceeding") def generate_table(self, link): + # generates the table and saves it as table.bin file for root, dirs, files in os.walk(link): for file in files: if file.endswith(".gff.gz"): diff --git a/bin/Modules/Ensembl/ActivityTableGenerator.py b/bin/Modules/Ensembl/ActivityTableGenerator.py index 2c2cff6..f110236 100644 --- a/bin/Modules/Ensembl/ActivityTableGenerator.py +++ b/bin/Modules/Ensembl/ActivityTableGenerator.py @@ -3,6 +3,10 @@ class ATGenerator: + """ + Reads saved activity binary files (table.bin) + """ + def __init__(self, repre): self.representation = repre diff --git a/bin/Modules/Ensembl/Ensembl.py b/bin/Modules/Ensembl/Ensembl.py index de5839d..88adeb1 100644 --- a/bin/Modules/Ensembl/Ensembl.py +++ b/bin/Modules/Ensembl/Ensembl.py @@ -6,6 +6,11 @@ class Ensembl: + """ + Main class for handling Ensembl Regulatory data + Checks for local files and downloads if files are missing + """ + def __init__(self, organism, wd, data_dir): print("Starting Ensembl") self.updater = FTPRetriever(organism, wd, data_dir) diff --git a/bin/Modules/Ensembl/FTPHandling/FTPEntry.py b/bin/Modules/Ensembl/FTPHandling/FTPEntry.py index 0a90080..0d7bdc3 100644 --- a/bin/Modules/Ensembl/FTPHandling/FTPEntry.py +++ b/bin/Modules/Ensembl/FTPHandling/FTPEntry.py @@ -1,11 +1,12 @@ import ftplib -# Class to determine if a Ftp-path is file or directory. - - class FTPEntry: + """ + Class to determine if a ftp-path is file or directory. 
+ """ + def __init__(self, filename, ftpobj, startingdir=None): self.filename = filename if startingdir is None: diff --git a/bin/Modules/Ensembl/FTPHandling/URLRetrieve.py b/bin/Modules/Ensembl/FTPHandling/URLRetrieve.py index 43feb54..6b45598 100644 --- a/bin/Modules/Ensembl/FTPHandling/URLRetrieve.py +++ b/bin/Modules/Ensembl/FTPHandling/URLRetrieve.py @@ -4,6 +4,10 @@ class FTPHandler: + """ + Class to browse through ftp folders and download entries to local file + """ + def __init__(self, url, wd): self.ftp = ftplib.FTP(url) self.ftp.login() diff --git a/bin/Modules/Ensembl/GTFGen.py b/bin/Modules/Ensembl/GTFGen.py index 6a1a63b..0ba69bf 100644 --- a/bin/Modules/Ensembl/GTFGen.py +++ b/bin/Modules/Ensembl/GTFGen.py @@ -5,12 +5,20 @@ class GTFGen: + + """ + + Class to generate Ensembl GTF-data with activity + """ + def __init__(self, organism, release, wd, data_dir): self.gff_lines = self.get_organism_as_gff(organism, release, wd, data_dir) self.value_map = {0: "ACTIVE", 1: "POISED", 2: "REPRESSED", 3: "INACTIVE", 4: "NA"} def get_organism_as_gff(self, organism, release, wd, data_dir): + + # reads the original gff file for organism if data_dir: directory = os.path.join(data_dir + "/EnsemblData/", release, organism) else: @@ -25,6 +33,8 @@ def get_organism_as_gff(self, organism, release, wd, data_dir): def reformat_to_gff(self, activity, release): + # Reformats gff to gtf and appends activity-data for config specified celltype-categories + gtf_return = [] for index, line in enumerate(self.gff_lines): @@ -57,16 +67,18 @@ def reformat_to_gff(self, activity, release): @staticmethod def generate_additional_information(id, activity): + # helper method to concat activity information to string return "; ".join([id, "activity="+", ".join(activity)]) def generate_activity_list(self, activity, index): + # generates activity list activity_list = [] for key, value in activity.items(): activity_list.append(key+">"+self.value_map[value[index]]) return activity_list def get_gtf(self, release, activity): - + # returns the resulting gtf-formatted-list return self.reformat_to_gff(activity, release) diff --git a/bin/Modules/Ensembl/checksums.py b/bin/Modules/Ensembl/checksums.py index 55209f9..8f8b92a 100644 --- a/bin/Modules/Ensembl/checksums.py +++ b/bin/Modules/Ensembl/checksums.py @@ -2,6 +2,10 @@ # Python implementation of linux sum (BSD 16-bit Checksum) commandline tool. +""" +Unused script with checksum implementations in Python +""" + def bsdchecksum(infile): with open(infile, 'rb') as f: file_bytes = f.read() @@ -17,11 +21,3 @@ def md5_checksum(infile): with open(infile, 'rb') as f: file_bytes = f.read() return hashlib.md5(file_bytes).hexdigest() - - -if __name__ == '__main__': - # print(bsdchecksum("/home/basti/Schreibtisch/tests/homo_sapiens.GRCh38.HMEC.Regulatory_Build.regulatory_activity.20161111.gff.gz")) - print(md5_checksum("/home/basti/Schreibtisch/tests/" - "mus_musculus.GRCm38.forebrain_embryonic_16_5_days." - "Regulatory_Build.regulatory_activity.20180516.gff.gz")) - diff --git a/bin/Modules/SaveResults.py b/bin/Modules/SaveResults.py index 5425bd3..64ed514 100644 --- a/bin/Modules/SaveResults.py +++ b/bin/Modules/SaveResults.py @@ -4,6 +4,14 @@ class ResultSaver: + + """ + + class to save the results. Path is dependent on the data_dir, tissuetype and mapped = True or False. + The output is saved to the temp directory in the data folder if crossmapping is necessary. 
+ + """ + def __init__(self, results, organism, wd, mapped, is_data_dir, tissue, out): print("Save results to File !") diff --git a/bin/Modules/Uniquifier.py b/bin/Modules/Uniquifier.py index 7d4925d..16d5538 100644 --- a/bin/Modules/Uniquifier.py +++ b/bin/Modules/Uniquifier.py @@ -1,5 +1,11 @@ class UniqueFilter: + """ + + Class to get unique GTF-results, filtered by specified cell-/tissuetypes + + """ + def __init__(self, ense, ucsc, org_filter=None): self.results = self.get_filtered_results(org_filter, ense, ucsc) @@ -7,6 +13,9 @@ def get_results(self): return self.results def get_filtered_results(self, org_filter, ense, ucsc): + + # Apply Filter + unfiltered_results = self.concat_without_duplicates(ense, ucsc) if org_filter: filterstrings = [x+">ACTIVE" for x in org_filter] @@ -21,6 +30,9 @@ def get_filtered_results(self, org_filter, ense, ucsc): @staticmethod def concat_without_duplicates(ense, ucsc): + + # Concat UCSC and Ensembl data without duplicates + results = ense+ucsc for ens in ense: for uc in ucsc: diff --git a/bin/Modules/ucsc/ucsc.py b/bin/Modules/ucsc/ucsc.py index c8ad466..cd55976 100644 --- a/bin/Modules/ucsc/ucsc.py +++ b/bin/Modules/ucsc/ucsc.py @@ -7,6 +7,11 @@ class UcscGtf: + """ + Class to gather ucsc refSeq-FuncElem data. + + """ + def __init__(self, org, wd, data_dir): self.organism_id = self.get_organism_id(org) self.link = "http://hgdownload.soe.ucsc.edu/gbdb/"+self.organism_id+"/ncbiRefSeq/refSeqFuncElems.bb" @@ -23,10 +28,12 @@ def __init__(self, org, wd, data_dir): print("UCSC finished !") def generate_gff_file(self): + # Call bigBedToBed binary to get a Bed file in the UCSCData folder callstring = [self.path_to_bin, self.link, self.output] subprocess.call(callstring) def read_gff_to_gtf(self): + # Reads Bed File and return a gtf-formatted list of elements. gtf_lines = [] with open(self.output, 'r') as csvfile: tsvreader = csv.reader(csvfile, delimiter='\t') @@ -46,11 +53,13 @@ def read_gff_to_gtf(self): return gtf_lines def find_ID(self, line): + # Find RefSeq ID in Line pattern = re.compile(r'ID:[0-9]{,9}|$') return re.search( pattern, line).group() def get_activity(self, line): + # Find activity categories in bed file key_status = [] for key, value in self.ucsc_categories.items(): if value: @@ -64,6 +73,7 @@ def get_activity(self, line): @staticmethod def get_organism_id(org): + # convert intern name e.g. "homo_sapiens" to ucsc name "hg38". if org == "homo_sapiens": return "hg38" elif org == "mus_musculus": @@ -71,6 +81,7 @@ def get_organism_id(org): @staticmethod def get_activity_categories(organism, wd): + # Method to get ucsc-celltype categories from JSON config path_to_config = os.path.join(wd+"/config/celltypes_" + organism + ".json") categories = {} with open(path_to_config) as input_file: diff --git a/bin/RegGTFExtractor.py b/bin/RegGTFExtractor.py index 131b853..14acaa5 100755 --- a/bin/RegGTFExtractor.py +++ b/bin/RegGTFExtractor.py @@ -1,3 +1,13 @@ +""" + +RegGTFExtractor.py extracts regulatory-data from Ensembl and UCSC databases +and converts output to GTF-formatted file. 
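+Supported assemblies are hg38/hg19 and mm10/mm9; hg19 and mm9 results are
+produced by cross-mapping the hg38 and mm10 output via CrossMap.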
+ +@author: Sebastian Beyvers +@contact: sebastian.beyvers@med.uni-giessen.de + +""" + import argparse from Modules.Ensembl.Ensembl import Ensembl from Modules.ucsc.ucsc import UcscGtf @@ -10,6 +20,8 @@ def check_for_local_folder(wd): + # Check if local folder exists and create if missing when no data_dir is specified + if not os.path.isdir(os.path.join(wd+"/data/EnsemblData")): os.mkdir(os.path.join(wd+"/data/EnsemblData")) @@ -23,6 +35,8 @@ def check_for_local_folder(wd): def check_for_data_dir(data_dir): + # Check if local folder exists and create if missing when data_dir as parameter is specified + if not os.path.isdir(os.path.join(data_dir+"/EnsemblData")): os.mkdir(os.path.join(data_dir+"/EnsemblData")) @@ -34,6 +48,9 @@ def check_for_data_dir(data_dir): def check_filter(tissue_cmd, org, wd): + + # Checks if filter-celltype is in Json types for organism + path_to_config = os.path.join(wd + "/config/celltypes_" + org + ".json") tissues_config = [] if not tissue_cmd: @@ -50,11 +67,10 @@ def check_filter(tissue_cmd, org, wd): return False -def cross_map(data_dir, org): - pass +def check_organism(org): + # Checks the organism input and decides if chrossmapping is necessary -def check_organism(org): if org == "hg38": return "homo_sapiens", False if org == "hg19": @@ -68,6 +84,9 @@ def check_organism(org): def main_script(organism, wd, data_dir, out, tissuetype=None): + + # Main function + (org, x_mappable) = check_organism(organism) if not data_dir: check_for_local_folder(wd) @@ -80,7 +99,9 @@ def main_script(organism, wd, data_dir, out, tissuetype=None): tissues = None print("Filter not detected !") + # Get UCSC Data ucsc = UcscGtf(org, wd, data_dir) + # Gen Ensembl Data ense = Ensembl(org, wd, data_dir) print("Getting Unique Results") unique_filter = UniqueFilter(ense.get_gtf(), ucsc.get_gtf(), tissues) @@ -95,6 +116,9 @@ def main_script(organism, wd, data_dir, out, tissuetype=None): if __name__ == '__main__': + + # Argumentparser + parser = argparse.ArgumentParser(description='GTF-Generator from UCSC Table Browser and Ensembl Regulatory Build' ) parser.add_argument('organism', help='Source organism [ hg19 | hg38 or mm9 | mm10 ]', action='store', nargs='?', type=str) parser.add_argument('--tissue', help='Tissue- or Celltype(s)', action='store', nargs='*', type=str) @@ -102,6 +126,9 @@ def main_script(organism, wd, data_dir, out, tissuetype=None): parser.add_argument('--dir', help='Data directory. 
default: "working_directory"', action='store', default="", type=str) parser.add_argument('--out', help='Output directory: default: "."', action='store', default=".", type=str) args = vars(parser.parse_args()) + + # Check if organism exists + if args["organism"]: if args["organism"] in ["hg19", "hg38", "mm9", "mm10"]: print("Working Dir: " + args["wd"]) From 22a36a5ab1a185a1446b39c6632a4fa669912a89 Mon Sep 17 00:00:00 2001 From: basti Date: Mon, 10 Dec 2018 17:59:05 +0100 Subject: [PATCH 103/158] Changed Readme --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index ee8d49a..6853199 100644 --- a/README.md +++ b/README.md @@ -49,7 +49,6 @@ Required arguments: --bed Path to BED-file with peaks of interest corresponding to the BigWig file --genome_fasta Path to genome in FASTA-format --jaspar_db Path to motif-database in MEME-format - --organism STRING Source organism: [ hg19 | hg 38 or mm9 | mm10 ] Optional arguments: @@ -92,6 +91,7 @@ Optional arguments: --wd PATH current working directory for the script (default: ".") --dir PATH directory for saved Ensembl / UCSC file (default: "./data/") --out PATH directory for the output GTF (default: ".") + --organism STRING Source organism: [ hg19 | hg 38 or mm9 | mm10 ] (Default: hg38) All arguments can be set in the configuration files. ``` From 69434e2671fe055250e3408d2cf3a8322dca564c Mon Sep 17 00:00:00 2001 From: basti Date: Tue, 11 Dec 2018 12:34:10 +0100 Subject: [PATCH 104/158] Changed Readme and updated mm_config --- README.md | 4 +- bin/config/celltypes_mus_musculus.json | 270 +++++++++++-------------- 2 files changed, 123 insertions(+), 151 deletions(-) diff --git a/README.md b/README.md index 6853199..713312f 100644 --- a/README.md +++ b/README.md @@ -87,10 +87,8 @@ Optional arguments: --motif_similarity_thresh FLOAT threshold for motif similarity score (Default: 0.00001) Creating GTF: - --tissues STRING Filter for tissue/category activity categorys as in JSON config (Default: None) + --tissue STRING Filter for one or more tissue/category activity, categories as in JSON config (Default: None) --wd PATH current working directory for the script (default: ".") - --dir PATH directory for saved Ensembl / UCSC file (default: "./data/") - --out PATH directory for the output GTF (default: ".") --organism STRING Source organism: [ hg19 | hg 38 or mm9 | mm10 ] (Default: hg38) All arguments can be set in the configuration files. 
diff --git a/bin/config/celltypes_mus_musculus.json b/bin/config/celltypes_mus_musculus.json index 0f02cde..79a574c 100644 --- a/bin/config/celltypes_mus_musculus.json +++ b/bin/config/celltypes_mus_musculus.json @@ -1,179 +1,153 @@ [ { - "type": "A549", - "alias_ucsc": [], - "alias_ensembl": ["A549"] - }, - { - "type": "Aorta", - "alias_ucsc": ["branchial arch"], - "alias_ensembl": ["Aorta"] - }, - { - "type": "Thymus", - "alias_ucsc": [], - "alias_ensembl": ["Fetal_Thymus", "Thymus"] - }, - { - "type": "B-Cells", - "alias_ucsc": [], - "alias_ensembl": ["B_cells_PB_Roadmap", "naive_B_cell_VB", "GM12878"] - }, - { - "type": "T-Cell", - "alias_ucsc": [], - "alias_ensembl": ["CD4_ab_T_cell_VB", "CM_CD4_ab_T_cell_VB", "CD8_ab_T_cell_CB", "T_cells_PB_Roadmap"] - }, - { - "type": "Monocyte", - "alias_ucsc": [], - "alias_ensembl": ["CD14CD16__monocyte_CB", "CD14CD16__monocyte_VB", "Monocytes_CD14", "Monocytes_CD14_PB_Roadmap"] - }, - { - "type": "Neutrophil", - "alias_ucsc": [], - "alias_ensembl": ["neutrophil_CB", "neutrophil_myelocyte_BM", "neutrophil_VB"] - }, - { - "type": "Eosinophil", - "alias_ucsc": [], - "alias_ensembl": ["eosinophil_VB"] - }, - { - "type": "Macrophage", - "alias_ucsc": [], - "alias_ensembl": [ - "M0_macrophage_CB", - "M0_macrophage_VB", - "M1_macrophage_CB", - "M1_macrophage_VB", - "M2_macrophage_CB", - "M2_macrophage_VB" - ] - }, - { - "type": "Erythroblast", - "alias_ucsc": [], - "alias_ensembl": ["erythroblast_CB"] + "type": "Heart", + "alias_ucsc": ["heart"], + "alias_ensembl": ["heart_adult_8_weeks", "heart_embryonic_10_5_days", + "heart_embryonic_11_5_days", "heart_embryonic_12_5_days", + "heart_embryonic_13_5_days", "heart_embryonic_14_5_days", + "heart_embryonic_15_5_days", "heart_embryonic_16_5_days", + "heart_postnatal_0_days"] + }, + { + "type": "Facial_Prominence", + "alias_ucsc": [], + "alias_ensembl": ["embryonic_facial_prominence_embryonic_10_5_days", + "embryonic_facial_prominence_embryonic_11_5_days", + "embryonic_facial_prominence_embryonic_12_5_days", + "embryonic_facial_prominence_embryonic_13_5_days", + "embryonic_facial_prominence_embryonic_14_5_days", + "embryonic_facial_prominence_embryonic_15_5_days"] + }, + { + "type": "Forebrain", + "alias_ucsc": ["forebrain"], + "alias_ensembl": ["forebrain_embryonic_10_5_days", + "forebrain_embryonic_11_5_days", + "forebrain_embryonic_12_5_days", + "forebrain_embryonic_13_5_days", + "forebrain_embryonic_14_5_days", + "forebrain_embryonic_15_5_days", + "forebrain_embryonic_16_5_days", + "forebrain_postnatal_0_days"] + }, + { + "type": "Hindbrain", + "alias_ucsc": ["hindbrain (rhombencephalon)"], + "alias_ensembl": ["hindbrain_embryonic_10_5_days", + "hindbrain_embryonic_11_5_days", + "hindbrain_embryonic_12_5_days", + "hindbrain_embryonic_13_5_days", + "hindbrain_embryonic_14_5_days", + "hindbrain_embryonic_15_5_days", + "hindbrain_embryonic_16_5_days", + "hindbrain_postnatal_0_days" + ] + }, + { + "type": "Midbrain", + "alias_ucsc": ["midbrain (mesencephalon)"], + "alias_ensembl": ["midbrain_embryonic_10_5_days", + "midbrain_embryonic_11_5_days", + "midbrain_embryonic_12_5_days", + "midbrain_embryonic_13_5_days", + "midbrain_embryonic_14_5_days", + "midbrain_embryonic_15_5_days", + "midbrain_embryonic_16_5_days", + "midbrain_postnatal_0_days" + ] + }, + { + "type": "NeuralTube", + "alias_ucsc": ["neural tube"], + "alias_ensembl": ["neural_tube_embryonic_11_5_days", + "neural_tube_embryonic_12_5_days", + "neural_tube_embryonic_13_5_days", + "neural_tube_embryonic_14_5_days", + 
"neural_tube_embryonic_15_5_days"] }, { "type": "Intestine", "alias_ucsc": [], - "alias_ensembl": ["Fetal_Intestine_Large", "Fetal_Intestine_Small", "Small_Intestine"] - }, - { - "type": "AdrenalGland", - "alias_ucsc": [], - "alias_ensembl": ["Fetal_Adrenal_Gland"] - }, - { - "type": "Muscle", - "alias_ucsc": ["limb"], - "alias_ensembl": ["Fetal_Muscle_Leg", "Fetal_Muscle_Trunk", "Psoas_Muscle", "HSMM", "HSMMtube"] - }, - { - "type": "Gastric", - "alias_ucsc": [], - "alias_ensembl": ["Fetal_Stomach", "Gastric"] - }, - { - "type": "Endothelial", - "alias_ucsc": ["blood vessels"], - "alias_ensembl": ["EPC_VB", "HMEC", "HUVEC", "HUVEC_prol_CB", "NHEK"] + "alias_ensembl": ["intestine_embryonic_14_5_days", + "intestine_embryonic_15_5_days", + "intestine_embryonic_16_5_days", + "intestine_postnatal_0_days"] }, { - "type": "StemCells", + "type": "Kidney", "alias_ucsc": [], - "alias_ensembl": ["H1ESC", "H1_mesenchymal", "H1_neuronal_progenitor", "H1_trophoblast", "H9", "MSC_VB", "iPS_20b", "iPS_DF_6_9", "iPS_DF_19_11"] - }, - { - "type": "Lung", - "alias_ucsc": [], - "alias_ensembl": ["Lung", "IMR90", "NHLF"] - }, - { - "type": "Pancreas", - "alias_ucsc": ["pancreas"], - "alias_ensembl": ["Pancreas"] + "alias_ensembl": ["kidney_adult_8_weeks", + "kidney_embryonic_14_5_days", + "kidney_embryonic_15_5_days", + "kidney_embryonic_16_5_days", + "kidney_postnatal_0_days"] }, { "type": "Liver", "alias_ucsc": ["liver"], - "alias_ensembl": [] - }, - { - "type": "Ovary", - "alias_ucsc": [], - "alias_ensembl": ["Ovary"] + "alias_ensembl": ["liver_embryonic_11_5_days", + "liver_embryonic_12_5_days", + "liver_embryonic_13_5_days", + "liver_embryonic_14_5_days", + "liver_embryonic_15_5_days", + "liver_embryonic_16_5_days", + "liver_adult_8_weeks", + "liver_postnatal_0_days"] }, { - "type": "Placenta", - "alias_ucsc": [], - "alias_ensembl": ["Placenta"] - }, - { - "type": "Spleen", - "alias_ucsc": [], - "alias_ensembl": ["Spleen"] - }, - { - "type": "Heart", - "alias_ucsc": ["heart"], - "alias_ensembl": ["Right_Atrium", "Left_Ventricle"] - }, - { - "type": "Osteoblast", - "alias_ucsc": [], - "alias_ensembl": ["Osteobl"] - }, - { - "type": "Fibroblast", - "alias_ucsc": [], - "alias_ensembl": ["NHDF_AD"] - }, - { - "type": "NK-Cells", - "alias_ucsc": [], - "alias_ensembl": ["Natural_Killer_cells_PB"] - }, - { - "type": "Cancers", - "alias_ucsc": [], - "alias_ensembl": ["HeLa_S3", "HepG2", "DND_41", "K562"] + "type": "Limb", + "alias_ucsc": ["limb"], + "alias_ensembl": ["limb_embryonic_10_5_days", + "limb_embryonic_11_5_days", + "limb_embryonic_12_5_days", + "limb_embryonic_13_5_days", + "limb_embryonic_14_5_days", + "limb_embryonic_15_5_days"] }, { - "type": "Brain", - "alias_ucsc": ["midbrain (mesencephalon)", "trigeminal V (ganglion, cranial)", "forebrain", "neural tube", "hindbrain (rhombencephalon)", "dorsal root ganglion", "cranial nerve"], - "alias_ensembl": ["NH_A"] + "type": "Lung", + "alias_ucsc": [], + "alias_ensembl": ["lung_embryonic_14_5_days", + "lung_embryonic_15_5_days", + "lung_embryonic_16_5_days", + "lung_postnatal_0_days" + ] }, { - "type": "Mesenchym", - "alias_ucsc": ["mesenchyme derived from neural crest", "facial mesenchyme"], - "alias_ensembl": [] + "type": "Stomach", + "alias_ucsc": [], + "alias_ensembl": ["stomach_embryonic_14_5_days", + "stomach_embryonic_15_5_days", + "stomach_embryonic_16_5_days", + "stomach_postnatal_0_days"] }, { - "type": "Embryonal", - "alias_ucsc": ["somite", "genital tubercle"], - "alias_ensembl": [] + "type": "MEL", + "alias_ucsc": [], + "alias_ensembl": 
["MEL_cell_line"] }, { - "type": "Eye", - "alias_ucsc": ["eye"], - "alias_ensembl": [] + "type": "Spleen", + "alias_ucsc": [], + "alias_ensembl": ["spleen_adult_8_weeks"] }, { - "type": "Nose", - "alias_ucsc": ["nose"], - "alias_ensembl": [] + "type": "Bruce", + "alias_ucsc": [], + "alias_ensembl": ["ES_Bruce4_embryonic"] }, { - "type": "Tail", - "alias_ucsc": ["tail"], - "alias_ensembl": [] + "type": "Ganglion", + "alias_ucsc": ["dorsal root ganglion", "trigeminal V (ganglion, cranial)", "cranial nerve"], + "alias_ensembl": [] }, { - "type": "Melanocytes", - "alias_ucsc": ["melanocytes"], - "alias_ensembl": [] + "type": "Other", + "alias_ucsc": ["nose", "branchial arch", + "ear", "somite", "tail", "eye", + "facial mesenchyme", "other", "melanocytes", + "blood vessels", "genital tubercle"], + "alias_ensembl": [] } + ] From 5f849b82cd0ed312121e2769944194881919ebce Mon Sep 17 00:00:00 2001 From: basti Date: Tue, 11 Dec 2018 12:52:49 +0100 Subject: [PATCH 105/158] data error fixed --- bin/RegGTFExtractor.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/bin/RegGTFExtractor.py b/bin/RegGTFExtractor.py index 14acaa5..355d4da 100755 --- a/bin/RegGTFExtractor.py +++ b/bin/RegGTFExtractor.py @@ -22,6 +22,10 @@ def check_for_local_folder(wd): # Check if local folder exists and create if missing when no data_dir is specified + if not os.path.isdir(os.path.join(wd+"/data/")): + + os.mkdir(os.path.join(wd+"/data/")) + if not os.path.isdir(os.path.join(wd+"/data/EnsemblData")): os.mkdir(os.path.join(wd+"/data/EnsemblData")) @@ -37,6 +41,9 @@ def check_for_data_dir(data_dir): # Check if local folder exists and create if missing when data_dir as parameter is specified + if not os.path.isdir(os.path.join(data_dir)): + os.mkdir(os.path.join(data_dir)) + if not os.path.isdir(os.path.join(data_dir+"/EnsemblData")): os.mkdir(os.path.join(data_dir+"/EnsemblData")) From 476c25d420bd297512a6ecb8090e3f3177dcd4e4 Mon Sep 17 00:00:00 2001 From: basti Date: Tue, 11 Dec 2018 13:48:29 +0100 Subject: [PATCH 106/158] error in output fixed --- bin/Modules/SaveResults.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bin/Modules/SaveResults.py b/bin/Modules/SaveResults.py index 64ed514..d285c25 100644 --- a/bin/Modules/SaveResults.py +++ b/bin/Modules/SaveResults.py @@ -18,9 +18,9 @@ def __init__(self, results, organism, wd, mapped, is_data_dir, tissue, out): self.path = "" if mapped: if is_data_dir: - self.path = os.path.join(wd + "temp/" + organism + ".gtf") + self.path = os.path.join(wd + "/temp/" + organism + ".gtf") else: - self.path = os.path.join( wd + "data/temp/" + organism + ".gtf" ) + self.path = os.path.join( wd + "/data/temp/" + organism + ".gtf" ) elif tissue: self.path = os.path.join(out+"/"+organism+"_filtered.gtf") else: From 7357dc2cf70fc39060bae1e7d7808ad1c19b69be Mon Sep 17 00:00:00 2001 From: basti Date: Wed, 12 Dec 2018 12:20:57 +0100 Subject: [PATCH 107/158] Fixed small bug in output formatting for mm10/mm9 --- bin/Modules/Ensembl/GTFGen.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/bin/Modules/Ensembl/GTFGen.py b/bin/Modules/Ensembl/GTFGen.py index 0ba69bf..9753d39 100644 --- a/bin/Modules/Ensembl/GTFGen.py +++ b/bin/Modules/Ensembl/GTFGen.py @@ -66,9 +66,13 @@ def reformat_to_gff(self, activity, release): return gtf_return @staticmethod - def generate_additional_information(id, activity): + def generate_additional_information(gene_id, activity): + + if gene_id.startswith("ID=regulatory_region:"): + gene_id = 
"ID="+gene_id.split(":")[1] + # helper method to concat activity information to string - return "; ".join([id, "activity="+", ".join(activity)]) + return "; ".join([gene_id, "activity="+", ".join(activity)]) def generate_activity_list(self, activity, index): # generates activity list From 367dd33e5e05272fb2e17fb34a6ed27ccd9f7901 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ren=C3=A9=20Wiegandt?= Date: Wed, 12 Dec 2018 17:53:00 -0500 Subject: [PATCH 108/158] updated README --- README.md | 33 +++++++++++++++++---------------- 1 file changed, 17 insertions(+), 16 deletions(-) diff --git a/README.md b/README.md index 713312f..e6b69e9 100644 --- a/README.md +++ b/README.md @@ -18,7 +18,7 @@ export PATH=[meme-suite instalation path]/bin:$PATH ``` -Download all files from the [GitHub repository](https://github.molgen.mpg.de/loosolab/masterJLU2018). +Download all files from the [GitHub repository](https://github.molgen.mpg.de/loosolab/masterJLU2018). The Nextflow-script needs a conda enviroment to run. Nextflow can create the needed enviroment from the given yaml-file. On some systems Nextflow exits the run with following error: ``` @@ -35,23 +35,26 @@ Run the following commands to create the enviroment: path=[Path to given masterenv.yml file] conda env create --name masterenv -f=$path ``` -When the enviroment is created, set the variable 'path_env' in the configuration file as the path to it. +When the enviroment is created, set the variable 'path_env' in the configuration file as the path to it. ## Quick Start ```console -nextflow run pipeline.nf --input [BigWig-file] --bed [BED-file] --genome_fasta [FASTA-file] --jaspar_db [MEME-file] +nextflow run pipeline.nf --bigwig [BigWig-file] --bed [BED-file] --genome_fasta [FASTA-file] --motif_db [MEME-file] ``` ## Parameters For a detailed overview for all parameters follow this [link](https://github.molgen.mpg.de/loosolab/masterJLU2018/wiki/Configuration). ``` Required arguments: - --input Path to BigWig-file with scores on the peaks of interest + --bigwig Path to BigWig-file with scores on the peaks of interest --bed Path to BED-file with peaks of interest corresponding to the BigWig file --genome_fasta Path to genome in FASTA-format - --jaspar_db Path to motif-database in MEME-format + --motif_db Path to motif-database in MEME-format Optional arguments: + + --tfbs_path Path to directory with output BED-files from tfbsscan. If given tfbsscan will not be run. + Footprint extraction: --window_length INT (Default: 200) a length of a window --step INT (Default: 100) an interval to slide the window @@ -77,24 +80,22 @@ Optional arguments: --strand INT Align +/+ & +/- (= 1). Or align only +/+ (= 0). (Default: 0) Motif estimation: - --motif_min_len INT Minimum length of Motif (Default: 8) - --motif_max_len INT Maximum length of Motif (Default: 20) - --interation INT Number of iterations done by glam2. More Interations: better results, higher runtime. (Default: 10000) + --motif_min_key INT Maximum number of key positions (aligned columns) (Default: 8) + --motif_max_key INT Maximum number of key positions (aligned columns) (Default: 20) + --iteration INT Number of iterations done by glam2. More Iterations: better results, higher runtime. (Default: 10000) --tomtom_treshold float Threshold for similarity score. 
(Default: 0.01) - + Motif clustering: --edge_weight INT Minimum weight of edges in motif-cluster-graph (Default: 5) - --motif_similarity_thresh FLOAT threshold for motif similarity score (Default: 0.00001) + --motif_similarity_thresh FLOAT threshold for motif similarity score (Default: 0.00001) + + Creating GTF: + --tissue STRING Filter for one or more tissue/category activity, categories as in JSON config (Default: None) + --organism STRING Source organism: [ hg19 | hg38 or mm9 | mm10 ] (Default: hg38) - Creating GTF: - --tissue STRING Filter for one or more tissue/category activity, categories as in JSON config (Default: None) - --wd PATH current working directory for the script (default: ".") - --organism STRING Source organism: [ hg19 | hg 38 or mm9 | mm10 ] (Default: hg38) - All arguments can be set in the configuration files. ``` For further information read the [documentation](https://github.molgen.mpg.de/loosolab/masterJLU2018/wiki) - From 9026ad9a1f7e0755eae14582a31c9c09bcf605e8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ren=C3=A9=20Wiegandt?= Date: Wed, 12 Dec 2018 17:53:29 -0500 Subject: [PATCH 109/158] minor bugfix --- bin/compareBed.sh | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/bin/compareBed.sh b/bin/compareBed.sh index ee65fb4..f635283 100644 --- a/bin/compareBed.sh +++ b/bin/compareBed.sh @@ -68,6 +68,11 @@ case $key in shift shift ;; + -p|--path) + path="$2" + shift + shift + ;; -h|--help) help=true he=true @@ -188,7 +193,7 @@ do fi echo $i done -else +else declare -a motiffiles=(`echo $motifs | sed 's/,/ /g'`) for i in ${motiffiles[@]} do @@ -215,7 +220,7 @@ cat "$workdir"/pass1Fa.bed | wc -l echo calc maxScore #2. compute absolut maxscore position -Rscript --vanilla /bin/maxScore.R "$workdir"/pass1Fa.bed +Rscript --vanilla $path/maxScore.R "$workdir"/pass1Fa.bed #3 subtract overlapping regions for additional motifs echo "also get subsequences with no overlap of overlapping sequences" @@ -254,13 +259,13 @@ if [ $help == false ] then cat "$workdir"/pass1FaHelp.bed > "$workdir"/pass2Tr.bed else - cat "$workdir"/pass1Fa.bed > "$workdir"/pass2Tr.bed + cat "$workdir"/pass1Fa.bed > "$workdir"/pass2Tr.bed fi -#4. remove short/long motivs, make unique ids (relevant for some splitted tfbs from subtract) and handle maxScorePosition +#4. remove short/long motivs, make unique ids (relevant for some splitted tfbs from subtract) and handle maxScorePosition # also creates a small output file with information about the comparison -Rscript --vanilla /bin/merge.R $min $max "$workdir" $data +Rscript --vanilla $path/merge.R $min $max "$workdir" $data #5. 
add fasta sequences to bed and create fasta file @@ -269,4 +274,3 @@ bedtools getfasta -name -fi $fasta -bed "$output" -fo "$output".fasta #6 clean up #rm "$workdir"/pass1Fa.bed "$workdir"/pass1Tr.bed "$workdir"/pass2Tr.bed "$workdir"/merged.bed "$workdir"/pass1FaHelp.bed "$workdir"/pass1TrHelp.bed - From 023bf564bbee758681eb436b7437698b03a7082f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ren=C3=A9=20Wiegandt?= Date: Wed, 12 Dec 2018 17:54:22 -0500 Subject: [PATCH 110/158] added T parameter for sorting --- bin/tfbsscan.py | 100 ++++++++++++++++++++++++------------------------ 1 file changed, 51 insertions(+), 49 deletions(-) diff --git a/bin/tfbsscan.py b/bin/tfbsscan.py index 2bae662..f17ebf6 100644 --- a/bin/tfbsscan.py +++ b/bin/tfbsscan.py @@ -68,7 +68,7 @@ def check_directory(directory): if not os.path.exists(directory): os.makedirs(directory) logger.info('a new directory ' + directory + ' was created') - + #merge the whole genome with the regions mentioned in .bed file def merge(genome, bed_file, output_directory): logger.info('the merging of files ' + genome + ' and ' + bed_file + ' will end soon, the result file is output_merge.fa') @@ -79,9 +79,9 @@ def merge(genome, bed_file, output_directory): #split the motifs each in other file def split_motifs(motifs, output_directory, usage): logger.info("the file with motifs " + motifs + " will be checked for motifs and if needed splitted in files each containing only one motif") - + first_line = subprocess.getoutput("head -1 " + motifs) #find the first line of the input file - + if usage == "moods": if first_line.startswith(">"): #the motif file probably has the .pfm format, try to read and split it @@ -94,7 +94,7 @@ def split_motifs(motifs, output_directory, usage): if first_line.startswith("MEME version"): #the motifs file has probably the MEME format, try to read and split it splitted_motifs = read_meme(motifs, output_directory) - + #if the there was a convertion before, delete all the .pfm files as we don't need them for filename in os.listdir(output_directory): if filename.endswith(".pfm"): @@ -128,13 +128,13 @@ def read_pfm(motifs, output_directory): motif_alternate_name = check_name(re.split(' ', line)[1].rstrip()) motif_id = re.split(' ', line[1:])[0] #[1:] meands do not use the first character motif_name = os.path.join(output_directory, motif_alternate_name + '_' + motif_id + '.pfm') - + splitted_motifs.append(motif_name) written_file = open(motif_name, 'w') - + if lines >= 1 and lines <= 4: #one motif has 5 lines, the first consists the name, the next 4 - the information we need to proceed the data within moods motif.append(line) - + lines = lines + 1 written_file.write(''.join(motif)) written_file.close() @@ -169,7 +169,7 @@ def read_meme(motifs, output_directory): motif_alternate_name = check_name(re.split(' ', line)[2].rstrip()) motif_id = re.split(' ', line)[1] motif_name = os.path.join(output_directory, motif_alternate_name + '_' + motif_id + '.meme') - else: + else: motif_alternate_name = check_name(re.split(' ', line)[1].rstrip()) motif_name = os.path.join(output_directory, motif_alternate_name + '.meme') @@ -182,7 +182,7 @@ def read_meme(motifs, output_directory): motif.append(line) lines = lines + 1 - + #write the last motif written_file.write(''.join(motif)) written_file.close() @@ -190,7 +190,7 @@ def read_meme(motifs, output_directory): read_file.close() return splitted_motifs - + def convert_meme_to_pfm(motifs, output_directory): #i can only convert the file to pfm if the motifs file is in MEME format @@ -206,7 +206,7 
@@ def convert_meme_to_pfm(motifs, output_directory): else: #search for motifs and save each to another file if line.startswith("MOTIF"): - + if 'written_file' in locals(): for row in rows: written_file.write('\t'.join(row) + '\n') @@ -214,22 +214,22 @@ def convert_meme_to_pfm(motifs, output_directory): rows = [[] for row in range(4)] written_file.close() - + #the alternate name will be checked for validity and the invalid chars will be replaced with '_' if len(re.split(' ', line.rstrip())) == 3: #in the input motif file the motif id and the alternate name are splitted using the tab, otherwise they are splitted using _, but we do not want to change it if so motif_alternate_name = check_name(re.split(' ', line)[2].rstrip()) motif_id = re.split(' ', line)[1] motif_name = os.path.join(output_directory, motif_alternate_name + '_' + motif_id + '.pfm') - else: + else: motif_alternate_name = check_name(re.split(' ', line)[1].rstrip()) motif_name = os.path.join(output_directory, motif_alternate_name + '.pfm') - + #make a list with all the motif names to know which files to iterate when fimo is called splitted_motifs.append(motif_name) written_file = open(motif_name, 'w') - + elif line.startswith("letter-probability matrix"): columns = int(re.split(' ', re.split('w= ', line)[1])[0]) #find the number of columns from the line out of the motifs file nsites = int(re.split(' ', re.split('nsites= ', line)[1])[0]) #find the nsites to count the frequency count for .pfm file @@ -239,10 +239,10 @@ def convert_meme_to_pfm(motifs, output_directory): rows[i].append(str(round(float(re.findall(r'\S+', line)[i])*nsites))) #split the line, do not mention how much whitespaces are in between, multiply it with nsites and save it to the corresponding row lines = lines + 1 - + #write the last motif for row in rows: - written_file.write('\t'.join(row) + '\n') + written_file.write('\t'.join(row) + '\n') written_file.close() read_file.close() @@ -286,26 +286,26 @@ def call_fimo(fimo_data, p_value, one_motif, genome, output_directory): fimo_output_unsorted = os.path.join(output_directory, "fimo_output_unsorted_" + os.path.splitext(os.path.basename(one_motif))[0] + ".txt") #check if user needs special options for the fimo - if fimo_data != None: + if fimo_data != None: fimo_data = fimo_data + " --thresh " + str(p_value) + " " - else: + else: fimo_data = "--thresh " + str(p_value) + " " #add the passed p value to the fimo options - #call fimo for this motif and save the output to a temporary file + #call fimo for this motif and save the output to a temporary file send_to_fimo = "fimo --text --no-qvalue " + fimo_data + one_motif + " " + genome + " > " + fimo_output_unsorted logger.info('fimo proceed the data using this call ' + send_to_fimo) fimo_stdout = subprocess.getoutput(send_to_fimo) - + #the normal output from fimo starts with Using motif ..., so print it from the logger, otherwise print what else fimo says if fimo_stdout.startswith("Using") and re.split('\n', fimo_stdout)[1]: logger.info('info from fimo: ' + re.split('\n', fimo_stdout)[0].rstrip()) logger.info('info from fimo: ' + re.split('\n', fimo_stdout)[1].rstrip()) else: #there were some problems with fimo, so we want to see what they were logger.info('info from fimo: ' + fimo_stdout) - - + + if not os.path.isfile(fimo_output_unsorted): logger.info('the usage of fimo was crashed, there is no required output file, the exit is forced') sys.exit() @@ -315,12 +315,13 @@ def call_fimo(fimo_data, p_value, one_motif, genome, output_directory): return 
fimo_output_unsorted else: #if the file was converted from pfm, the second column contains the positions, so we want to sort using this column, and not the next one + tmp_path = os.path.join(output_directory, "tmp/") second_line = subprocess.getoutput("sed -n '2{p;q}' " + fimo_output_unsorted) if re.split('\t', second_line)[2].startswith("chr"): #the re.split[1] is a ' ', so take the [2] - os.system("cat " + fimo_output_unsorted + " | sort -k 2 -V > " + fimo_output_file) + os.system("cat " + fimo_output_unsorted + " | sort -T " + tmp_path + " -k 2 -V > " + fimo_output_file) else: #we are sorting after the third column, which looks like chr1:123-126, -V means it will sort the digitals and not the strings - os.system("cat " + fimo_output_unsorted + " | sort -k 3 -V > " + fimo_output_file) + os.system("cat " + fimo_output_unsorted + " | sort -T " + tmp_path + " -k 3 -V > " + fimo_output_file) #make sure the output of fimo exists if not os.path.isfile(fimo_output_file): @@ -373,7 +374,7 @@ def call_moods(one_motif, genome, output_directory, p_value, moods_bg): with open(genome) as handle: seq_iterator = bio.SimpleFastaParser(handle) - + for header, seq in seq_iterator: header_splitted = re.split(r':', header) @@ -393,7 +394,7 @@ def call_moods(one_motif, genome, output_directory, p_value, moods_bg): fr = results[:len(matrix_names)] #forward strand rr = results[len(matrix_names):] #reverse strand - results = [[(r.pos, r.score, '+', ()) for r in fr[i]] + + results = [[(r.pos, r.score, '+', ()) for r in fr[i]] + [(r.pos, r.score, '-', ()) for r in rr[i]] for i in range(len(matrix_names))] #use + and - to indicate strand for (matrix, matrix_name, result) in zip(matrices, matrix_names, results): @@ -415,16 +416,17 @@ def call_moods(one_motif, genome, output_directory, p_value, moods_bg): if key_for_bed_dict != '': start = pos + 1 end = pos + len(hitseq) - chromosom = key_for_bed_dict #instead of only the name of chromosom write the key to search in the bed_file + chromosom = key_for_bed_dict #instead of only the name of chromosom write the key to search in the bed_file else: start = int(positions[0]) + pos + 1 end = start + len(hitseq) - 1 - + #moods_output_file_unsorted.write('\t'.join([motif_id, motif_alternate_name, chromosom, str(start), str(end), strand, str(score)]) + '\n') moods_output_file_unsorted.write('\t'.join([motif_id, motif_alternate_name, chromosom, str(start), str(end), strand, score]) + '\n') #now sort the output of moods - os.system("cat " + moods_output_unsorted_name + " | sort -k 1 -V > " + moods_output_name) + tmp_path = os.path.join(output_directory, "tmp/") + os.system("cat " + moods_output_unsorted_name + " | sort -T " + tmp_path + " -k 1 -V > " + moods_output_name) moods_output_file_unsorted.close() moods_output_file.close() @@ -469,10 +471,10 @@ def write_output_file(input_file, bed_dictionary, resolve_overlaps): logger.info('writing the output file ' + output_file_name) output_file = open(output_file_name, 'w') - + #calculate the real positions of TFs and write the information in the output file with open(input_file) as read_file: - + overlap = [] printed_line = [] last_line = [] @@ -480,11 +482,11 @@ def write_output_file(input_file, bed_dictionary, resolve_overlaps): for line in read_file: if not line.startswith('#'): - + line_to_write = [] - + line_array = re.split(r'\t', line.rstrip('\n')) - + chromosom_and_positions = re.split(r':', line_array[2]) if len(chromosom_and_positions) == 1: #the whole genome was given, there is nothing to split chromosom = line_array[2] 
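The "sort -T" added above points sort's temporary files to a directory inside the output folder instead of the system /tmp, which can fill up when sorting large FIMO or MOODS result files. A standalone sketch of the same pattern (assuming only coreutils sort; function and variable names are illustrative, not from the patch):

```python
import os
import subprocess

def natural_sort_to_file(infile, outfile, column, tmp_dir):
    # -T: write sort's temporary chunks to tmp_dir instead of /tmp
    # -V: version sort, so chr2 orders before chr10 and positions sort naturally
    os.makedirs(tmp_dir, exist_ok=True)
    with open(outfile, "w") as out:
        subprocess.run(["sort", "-T", tmp_dir, "-k", str(column), "-V", infile],
                       stdout=out, check=True)
```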
@@ -494,8 +496,8 @@ def write_output_file(input_file, bed_dictionary, resolve_overlaps): positions = re.split(r'-', chromosom_and_positions[-1]) chromosom = chromosom_and_positions[0] start = str(int(positions[0]) + int(line_array[3])) - end = str(int(positions[0]) + int(line_array[4])) - key_for_bed_dict = line_array[2] #use only if there is a bed_dictionary in input + end = str(int(positions[0]) + int(line_array[4])) + key_for_bed_dict = line_array[2] #use only if there is a bed_dictionary in input #------- these are 5 needed columns to succesfully proceed the data @@ -526,7 +528,7 @@ def write_output_file(input_file, bed_dictionary, resolve_overlaps): write_line_not_overlap(output_file, line_to_write) - if not last_line[0].startswith('write'): #it could be that the write flag was deleted from the last_line so append it back + if not last_line[0].startswith('write'): #it could be that the write flag was deleted from the last_line so append it back overlap.insert(0, "write") #if there is already any printed line, check if the saved last line was already printed. Otherwise print it @@ -549,14 +551,14 @@ def check_overlap(line_to_write, overlap, printed_line, output_file): if not overlap: #if the overlap list is empty is_overlap = False - + else: #if the overlap list is not empty if not overlap[0].startswith('write'): #it could be that the write flag was deleted from the overlap so append it back to make sure the next if clauses run right overlap.insert(0, "write") if line_to_write[1] == overlap[1] and float(line_to_write[2]) < float(overlap[3]): #the current line could overlap the previous because the start of the current line is smaller than the end of the previous one and they are both on the one chromosom - #if p value in the current line is smaller than p value in the previous line (or score is bigger), save the current line as possible overlap for future + #if p value in the current line is smaller than p value in the previous line (or score is bigger), save the current line as possible overlap for future if float(line_to_write[5]) > float(overlap[5]): is_overlap = False else: @@ -589,7 +591,7 @@ def clean_directory(cleans, output_directory, motif, tool_output_file): fimo_output_unsorted = os.path.join(tool_output_file.replace("fimo_output", "fimo_output_unsorted")) moods_output_unsorted = os.path.join(tool_output_file.replace("moods_output", "moods_output_unsorted")) - + for clean in cleans: if clean == 'all': remove_file(motif) @@ -648,7 +650,7 @@ def multiprocess(motifs, genome, output_directory, cleans, fimo_data, p_value, b pool.close() pool.join() #make sure all the processes are done and exit - + #the processes should not delete the merged genome file #so make sure if this file is needed, otherwise delete it for clean in cleans: @@ -671,7 +673,7 @@ def make_bed_dictionary(bed_file): key = bed_line_array[0] + ":" + bed_line_array[1] + "-" + bed_line_array[2] value = [] for i in range(3, len(bed_line_array)): - value.append(bed_line_array[i]) + value.append(bed_line_array[i]) bed_dictionary[key] = value else: #this is not a bed file, force exit @@ -721,7 +723,7 @@ def main(): args = parse_args() - + if args.use == "fimo": check_fimo_version() @@ -736,20 +738,20 @@ def main(): check_directory(args.output_directory) splitted_motifs = split_motifs(args.motifs, args.output_directory, args.use) - + #check if there is a .bed file to merge the genome file with. 
If so, merge them if args.bed_file: bed_dictionary = make_bed_dictionary(args.bed_file) args.genome = merge(args.genome, args.bed_file, args.output_directory) else: bed_dictionary = {} - + #if the usage is moods, call moods, otherwise call fimo multiprocess(splitted_motifs, args.genome, args.output_directory, args.cleans, args.fimo, args.p_value, bed_dictionary, args.cores, args.resolve_overlaps, args.use, args.moods_bg) - + for handler in logger.handlers: handler.close() logger.removeFilter(handler) - + if __name__ == "__main__": - main() \ No newline at end of file + main() From 8c939dedb0640c3c2a255bac1186cb37db097b63 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ren=C3=A9=20Wiegandt?= Date: Wed, 12 Dec 2018 17:54:59 -0500 Subject: [PATCH 111/158] removed parameter --- config/create_gtf.config | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/config/create_gtf.config b/config/create_gtf.config index d1bbb12..c0718a6 100644 --- a/config/create_gtf.config +++ b/config/create_gtf.config @@ -1,4 +1,4 @@ params{ - organism="homo_sapiens" + organism="hg38" tissue="" } From d6f98989c1292b3750e1899ce9b9d6b30ad18081 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ren=C3=A9=20Wiegandt?= Date: Wed, 12 Dec 2018 17:55:37 -0500 Subject: [PATCH 112/158] tfbsscan optional; minor fixes/changes --- pipeline.nf | 87 ++++++++++++++++++++++++++++++++++++----------------- 1 file changed, 60 insertions(+), 27 deletions(-) diff --git a/pipeline.nf b/pipeline.nf index 516b7da..18b6745 100644 --- a/pipeline.nf +++ b/pipeline.nf @@ -1,11 +1,12 @@ #!/usr/bin/env nextflow //setting default values -params.input="" +params.bigwig="" params.bed="" params.genome_fasta="" -params.jaspar_db="" +params.motif_db="" params.config="" +params.tfbs_path="" //peak_calling params.window_length = 200 @@ -36,9 +37,9 @@ params.config="" params.min_seq = 10 // Minimum number of sequences in the fasta-files for glam2 //glam2 - params.motif_min_len = 8 // Minimum length of Motifs - params.motif_max_len = 20 // Maximum length of Motifs - params.interation = 10000 // Number of Iterations done by glam2. A high iteration number equals a more accurate result but with an higher runtime. + params.motif_min_key = 8 // Minimum number of key positions (aligned columns) + params.motif_max_key = 20 // Maximum number of key positions (aligned columns) + params.iteration = 10000 // Number of Iterations done by glam2. A high iteration number equals a more accurate result but with an higher runtime. //tomtom params.tomtom_treshold = 0.01 // threshold for similarity score. 
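	//(for orientation: with the defaults above, the glam2 calls later in the
	//pipeline take roughly this shape, the FASTA file name being a placeholder:
	//  glam2 n cluster_1.FASTA -O . -a 8 -b 20 -z 5 -n 10000 )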
@@ -50,22 +51,26 @@ params.config="" params.best_motif = 3 // Top n motifs per cluster //creating_gtf - params.organism="homo_sapiens" + params.organism="hg38" params.tissue="" + params.tmpdir = "/mnt/workspace1/rene.wiegandt/tmp/" -if (params.input == "" || params.bed == "" || params.genome_fasta == "" || params.jaspar_db == "" || params.config == ""){ +if (params.bigwig == "" || params.bed == "" || params.genome_fasta == "" || params.motif_db == "" || params.config == ""){ log.info """ -Usage: nextflow run pipeline.nf --input [BigWig-file] --bed [BED-file] --genome_fasta [FASTA-file] --jaspar_db [MEME-file] +Usage: nextflow run pipeline.nf --bigwig [BigWig-file] --bed [BED-file] --genome_fasta [FASTA-file] --motif_db [MEME-file] Required arguments: - --input Path to BigWig-file + --bigwig Path to BigWig-file --bed Path to BED-file --genome_fasta Path to genome in FASTA-format --jaspar_db Path to motif-database in MEME-format Optional arguments: + + --tfbs_path Path to directory with output from tfbsscan. If given tfbsscan will not be run. + Footprint extraction: --window_length INT (Default: 200) --step INT (Default: 100) @@ -93,7 +98,7 @@ Optional arguments: Motif estimation: --motif_min_len INT Minimum length of Motif (Default: 8) --motif_max_len INT Maximum length of Motif (Default: 20) - --interation INT Number of iterations done by glam2. More Interations: better results, higher runtime. (Default: 10000) + --iteration INT Number of iterations done by glam2. More Iterations: better results, higher runtime. (Default: 10000) --tomtom_treshold float Threshold for similarity score. (Default: 0.01) Moitf clustering: @@ -101,19 +106,20 @@ Optional arguments: --motif_similarity_thresh FLOAT Threshold for motif similarity score (Default: 0.00001) Creating GTF: - --organism [homo_sapiens | mus_musculus] - --tissues + --organism [homo_sapiens | mus_musculus] + --tissues All arguments can be set in the configuration files. """ } else { - Channel.fromPath(params.input).map {it -> [it.simpleName, it]}.set {bigwig_input} + Channel.fromPath(params.bigwig).map {it -> [it.simpleName, it]}.set {bigwig_input} Channel.fromPath(params.bed).set {bed_input} Channel.fromPath(params.genome_fasta).into {fa_overlap; fa_scan; fa_overlap_2} - Channel.fromPath(params.jaspar_db).into {db_for_motivscan; db_for_tomtom} + Channel.fromPath(params.motif_db).into {db_for_motivscan; db_for_tomtom} Channel.fromPath(params.config).set {config} } +path_bin = path_bin?.endsWith('/') ? path_bin.substring( 0, path_bin.length() -1 ) : path_bin bigwig_input.combine(bed_input).set{footprint_in} /* @@ -123,8 +129,8 @@ process footprint_extraction { conda "${path_env}" tag{name} - publishDir "${out}", mode: 'copy', pattern: '*.bed' - publishDir "${out}/log", mode: 'copy', pattern: '*.log' + publishDir "${out}/footprint_extraction/", mode: 'copy', pattern: '*.bed' + publishDir "${out}/footprint_extraction/log", mode: 'copy', pattern: '*.log' input: set name, file (bigWig), file (bed) from footprint_in @@ -143,23 +149,38 @@ process footprint_extraction { */ process extract_known_TFBS { + conda "${path_env}" + publishDir "${out}/known_TFBS/", mode: 'copy', pattern: '*.bed' + input: file (fasta) from fa_overlap file (db) from db_for_motivscan output: - file ('*.bed') into known_TFBS_for_overlap + val ('done') into known_TFBS_for_overlap + //file ('*.bed') into known_TFBS_for_overlap + + when: + params.tfbs_path == "" script: """ - python ${path_bin}/tfbsscan.py --use moods --core ${params.threads} -m ${db} -g ${fasta} -o . 
+ python ${path_bin}/tfbsscan.py --use moods --core ${params.threads} -m ${db} -g ${fasta} -o ${params.tmpdir} """ } -bed_for_overlap_with_TFBS.combine(known_TFBS_for_overlap).combine(fa_overlap_2).set {for_overlap} +if(params.tfbs_path == "") { + bed_for_overlap_with_TFBS.combine(known_TFBS_for_overlap.toList()).combine(fa_overlap_2).set {for_overlap} + motif_path = params.tmpdir +} else { + Channel.from('skipped').set {workaround} + bed_for_overlap_with_TFBS.combine(workaround).combine(fa_overlap_2).set {for_overlap} + tfbs = params.tfbs_path?.endsWith('/') ? params.tfbs_path: params.tfbs_path.substring( 0, params.tfbs_path.length() -1 ) + motif_path = tfbs +} /* @@ -167,19 +188,22 @@ bed_for_overlap_with_TFBS.combine(known_TFBS_for_overlap).combine(fa_overlap_2). process overlap_with_known_TFBS { conda "${path_env}" + publishDir "${out}/unknown_overlap/", mode: 'copy', pattern: '*.bed' + input: set name, file (bed_footprints), val (bed_motifs), file (fasta) from for_overlap output: - set name, file ('*.bed') into bed_for_reducing + set name, file ("${name}.bed") into bed_for_reducing + set name, file ("${name}.bed") into testetst script: motif_list = bed_motifs.toString().replaceAll(/\s|\[|\]/,"") """ - ${path_bin}/compareBed.sh --data ${bed_footprints} --motifs ${motif_list} --fasta ${fasta} -o ${name}.bed -min ${params.min_size_fp} -max ${params.max_size_fp} + ${path_bin}/compareBed.sh --data ${bed_footprints} --motifs ${motif_path} --fasta ${fasta} -o ${name}.bed -min ${params.min_size_fp} -max ${params.max_size_fp} -p ${path_bin} """ } - +testetst.println() /* */ @@ -204,6 +228,8 @@ process reduce_bed { process clustering { conda "${path_env}" + publishDir "${out}/cluster/", mode: 'copy', pattern: '*.bed' + input: set name, file (bed) from bed_for_clustering @@ -221,6 +247,7 @@ process clustering { Converting BED-File to one FASTA-File per cluster */ process bed_to_clustered_fasta { + conda "${path_env}" tag{name} @@ -262,7 +289,7 @@ process glam2 { script: """ - glam2 n ${fasta} -O . -a ${params.motif_min_len} -b ${params.motif_max_len} -z 5 -n ${params.interation} + glam2 n ${fasta} -O . -a ${params.motif_min_key} -b ${params.motif_max_key} -z 5 -n ${params.iteration} """ } @@ -312,6 +339,7 @@ files_for_merge_fasta = motif_similarity.combine(fasta_for_motif_cluster) process merge_fasta { + conda "${path_env}" input: set file (motiv_sim), val (fasta_list) from files_for_merge_fasta @@ -334,18 +362,20 @@ motif_clustered_fasta_flat = motif_clustered_fasta_list.flatten() process clustered_glam2 { + publishDir "${out}/esimated_motifs/${name}/", mode: 'copy' + input: file (fasta) from motif_clustered_fasta_flat output: set name, file ('*.meme') into clustered_meme_for_tomtom - set name, file ('*.meme') into clustered_meme_for_filter - file('*') + set name, file ('*.meme') into clustered_meme_for_filter + file('*') script: name = fasta.getBaseName() """ - glam2 n ${fasta} -O . -a ${params.motif_min_len} -b ${params.motif_max_len} -z 5 -n ${params.interation} + glam2 n ${fasta} -O . -a ${params.motif_min_key} -b ${params.motif_max_key} -z 5 -n ${params.iteration} """ } @@ -357,6 +387,7 @@ Tomtom searches motifs in databases. process tomtom { tag{name} + publishDir "${out}/esimated_motifs/tomtom/", mode: 'copy' input: set name, file (meme) from clustered_meme_for_tomtom @@ -373,7 +404,7 @@ process tomtom { //Joining channels with meme and tsv files. Filter joined channel on line count. //Only meme-files which corresponding tsv files have linecount <= 1 are writen to next channel. 
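 //(sketch of the joined tuples, assuming one glam2 run per cluster:
 // [cluster_name, glam2.meme, tomtom.tsv]; a tsv with at most one line means
 // tomtom found no database match, i.e. a potentially unknown motif)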
-for_filter = meme_for_filter.join( tsv_for_filter ) +for_filter = clustered_meme_for_filter.join( tsv_for_filter ) for_filter .filter { name, meme, tsv -> long count = tsv.readLines().size() @@ -403,6 +434,8 @@ process check_for_unknown_motifs { process get_best_motif { conda "${path_env}" + publishDir "${out}/esimated_motifs/unknown_motifs/", mode: 'copy' + input: set name, file(meme), file(tsv) from meme_for_scan From ff52b92265fb232c95a05eca11b3cd5dc4823e03 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ren=C3=A9=20Wiegandt?= Date: Wed, 12 Dec 2018 17:56:04 -0500 Subject: [PATCH 113/158] fixed yml-file --- masterenv.yml | 3 --- 1 file changed, 3 deletions(-) diff --git a/masterenv.yml b/masterenv.yml index a94a8cf..211f4e1 100644 --- a/masterenv.yml +++ b/masterenv.yml @@ -1,8 +1,5 @@ name: masterenv -channels: - - bioconda - - conda-forge dependencies: - r-seqinr - numpy From 9ff4281061d88be0ac627daddd59b824a1e94027 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ren=C3=A9=20Wiegandt?= Date: Wed, 12 Dec 2018 17:56:56 -0500 Subject: [PATCH 114/158] minor changes --- nextflow.config | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nextflow.config b/nextflow.config index 47dc610..e35ca57 100644 --- a/nextflow.config +++ b/nextflow.config @@ -1,6 +1,6 @@ wd = "/mnt/agnerds/Rene.Wiegandt/10_Master/masterJLU2018" createTimeout = 60 -params.threads=1 //Parameter for for scripts! Not for nextflow processes. +params.threads=60 //Parameter for for scripts! Not for nextflow processes. params.config="${wd}/config/uropa.config" env { From 725d7c051cb9102034dd9b6a09bcb2319bace305 Mon Sep 17 00:00:00 2001 From: renewiegandt Date: Wed, 12 Dec 2018 23:59:03 +0100 Subject: [PATCH 115/158] Update README.md --- README.md | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index e6b69e9..278cd05 100644 --- a/README.md +++ b/README.md @@ -53,7 +53,7 @@ Required arguments: Optional arguments: - --tfbs_path Path to directory with output BED-files from tfbsscan. If given tfbsscan will not be run. + --tfbs_path Path to directory with output BED-files from tfbsscan. If given tfbsscan will not be run. Footprint extraction: --window_length INT (Default: 200) a length of a window @@ -87,11 +87,11 @@ Optional arguments: Motif clustering: --edge_weight INT Minimum weight of edges in motif-cluster-graph (Default: 5) - --motif_similarity_thresh FLOAT threshold for motif similarity score (Default: 0.00001) + --motif_similarity_thresh FLOAT threshold for motif similarity score (Default: 0.00001) - Creating GTF: - --tissue STRING Filter for one or more tissue/category activity, categories as in JSON config (Default: None) - --organism STRING Source organism: [ hg19 | hg38 or mm9 | mm10 ] (Default: hg38) + Creating GTF: + --tissue STRING Filter for one or more tissue/category activity, categories as in JSON config (Default: None) + --organism STRING Source organism: [ hg19 | hg38 or mm9 | mm10 ] (Default: hg38) All arguments can be set in the configuration files. 
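+
+Example call with placeholder file names (a sketch of the quick start):
+  nextflow run pipeline.nf --bigwig footprints.bigWig --bed peaks.bed --genome_fasta genome.fa --motif_db motifs.meme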
```

From 5275f1802b430f7ebcad8ce76129c11a9ec084aa Mon Sep 17 00:00:00 2001
From: anastasiia
Date: Thu, 13 Dec 2018 17:36:35 +0100
Subject: [PATCH 116/158] we do not need this env

---
 bin/call_peaks.yaml | 9 ---------
 1 file changed, 9 deletions(-)
 delete mode 100644 bin/call_peaks.yaml

diff --git a/bin/call_peaks.yaml b/bin/call_peaks.yaml
deleted file mode 100644
index 4d3eabd..0000000
--- a/bin/call_peaks.yaml
+++ /dev/null
@@ -1,9 +0,0 @@
-name: call_peaks
-
-channels:
-  - bioconda
-  - conda-forge
-
-dependencies:
-  - numpy
-  - pybigwig
\ No newline at end of file

From 1c5f769cadff9679de286eac70f06f63838fdcb8 Mon Sep 17 00:00:00 2001
From: renewiegandt
Date: Thu, 13 Dec 2018 21:09:05 +0100
Subject: [PATCH 117/158] Update README.md

---
 README.md | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/README.md b/README.md
index 278cd05..fe310d7 100644
--- a/README.md
+++ b/README.md
@@ -37,6 +37,8 @@ conda env create --name masterenv -f=$path
 ```
 When the enviroment is created, set the variable 'path_env' in the configuration file as the path to it.

+**Important Note:** For conda the channel bioconda needs to be set as highest priority! This required due two differnt packages with the same name in different channels. For the pipeline the package jellyfish from the channel bioconda is needed and **NOT** the jellyfisch package from the channel conda-forge!
+
 ## Quick Start
 ```console
 nextflow run pipeline.nf --bigwig [BigWig-file] --bed [BED-file] --genome_fasta [FASTA-file] --motif_db [MEME-file]

From 7d05d052e3c5f6ed65ca413d85d431c4491f4d0a Mon Sep 17 00:00:00 2001
From: JannikHamp
Date: Thu, 13 Dec 2018 21:14:55 +0100
Subject: [PATCH 118/158] Update merge.R

---
 bin/merge.R | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/bin/merge.R b/bin/merge.R
index c2015aa..8ee50e3 100644
--- a/bin/merge.R
+++ b/bin/merge.R
@@ -33,5 +33,5 @@ loss = formatC(1 - suma/sumb, digits=2)
 lengthb = formatC(mean(before$V3-before$V2), digits=4)
 lengtha = formatC(mean(splitted$length), digits=4)
 stats=data.frame(sum_nt_input=sumb, sum_nt_filtered=suma, factor=difference, loss=loss, mean_length_input=lengthb, mean_length_filtered=lengtha)
-write.table(stats, "../FilterMotifs.stats", row.names=FALSE, quote=FALSE, sep='\t')
+write.table(stats, "./FilterMotifs.stats", row.names=FALSE, quote=FALSE, sep='\t')

From dc2363efeb01ce2da5559bd729d84e8c09a56507 Mon Sep 17 00:00:00 2001
From: renewiegandt
Date: Thu, 13 Dec 2018 22:08:10 +0100
Subject: [PATCH 119/158] check parameters; minor fixes

---
 pipeline.nf | 71 +++++++++++++++++++++++++++++++++++++++++++++--------
 1 file changed, 61 insertions(+), 10 deletions(-)

diff --git a/pipeline.nf b/pipeline.nf
index 18b6745..6ad4814 100644
--- a/pipeline.nf
+++ b/pipeline.nf
@@ -46,7 +46,7 @@ params.tfbs_path=""

 	//cluster motifs
 	params.edge_weight = 5		// Minimum weight of edges in motif-cluster-graph
-	motif_similarity_thresh = 0.00001	// threshold for motif similarity score
+	params.motif_similarity_thresh = 0.00001	// threshold for motif similarity score

 	params.best_motif = 3		// Top n motifs per cluster

@@ -119,6 +119,56 @@ All arguments can be set in the configuration files.
 	Channel.fromPath(params.config).set {config}
 }

+/*
+Checking for parameter input!
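+(for example, a call with --min_size_fp ten would fail the integer check
+below and exit with 'ERROR: min_size_fp needs to be an Integer')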
+*/ +int_params = ["window_length", "step", "min_size_fp", "max_size_fp", "kmer", + "aprox_motif_len", "motif_occurence", "min_seq_length", "global", + "sequence_coverage", "memory", "throw_away_seq", "strand", + "min_seq", "motif_min_key", "motif_max_key", "iteration", + "edge_weight", "best_motif"] +req_params = ["bigwig", "bed", "genome_fasta", "motif_db", "config"] + +valid_organism = ["hg38", "hg19", "mm9", "mm10"] + +params.each { key, value -> + if(int_params.contains(key)) { + if (!("${value}" ==~ /\d+/ )){ + println("ERROR: $key needs to be an Integer") + System.exit(2) + } + } + if(req_params.contains(key)) { + if(!file(value).exists()) { + println("ERROR: $key not found. Please check the given path.") + System.exit(2) + } + } +} +if (!("${params.identity}" ==~ /^0\.[8-9][[0-9]*]?|^1(\.0)?/ )){ + println("ERROR: --identity needs to be float in range 0.8 to 1.0") + System.exit(2) +} + +if (!valid_organism.contains(params.organism)) { + println("ERROR: Invalid organism! Valid organisms: " + valid_organism) + System.exit(2) +} +if (!("${params.percentage}" ==~ /\d+/ ) || params.percentage < 0 || params.percentage > 100 ){ + println("ERROR: --percentage needs to be an Integer between 0 and 100") + System.exit(2) +} + +if (!("${params.tomtom_treshold}" ==~ /([0-9]*\.[0-9]+|[0-9]+)/)) { + println("ERROR: --tomtom_treshold needs to be an Integer or float, e.g. 0.01") + System.exit(2) +} +if (!("${params.motif_similarity_thresh}" ==~ /([0-9]*\.[0-9]+|[0-9]+)/)) { + println("ERROR: --motif_similarity_thresh needs to be an Integer or float, e.g. 0.01") + System.exit(2) +} + + path_bin = path_bin?.endsWith('/') ? path_bin.substring( 0, path_bin.length() -1 ) : path_bin bigwig_input.combine(bed_input).set{footprint_in} @@ -144,7 +194,7 @@ process footprint_extraction { """ } -//Abfrage ob ausgeführt werden muss. + /* */ @@ -160,7 +210,6 @@ process extract_known_TFBS { output: val ('done') into known_TFBS_for_overlap - //file ('*.bed') into known_TFBS_for_overlap when: params.tfbs_path == "" @@ -171,7 +220,9 @@ process extract_known_TFBS { """ } - +/* +Sets path to known tfbs BED-files. If tfbs_path is set the process extract_known_TFBS is skipped, so the paths are differnt. +*/ if(params.tfbs_path == "") { bed_for_overlap_with_TFBS.combine(known_TFBS_for_overlap.toList()).combine(fa_overlap_2).set {for_overlap} motif_path = params.tmpdir @@ -195,7 +246,6 @@ process overlap_with_known_TFBS { output: set name, file ("${name}.bed") into bed_for_reducing - set name, file ("${name}.bed") into testetst script: motif_list = bed_motifs.toString().replaceAll(/\s|\[|\]/,"") @@ -203,12 +253,13 @@ process overlap_with_known_TFBS { ${path_bin}/compareBed.sh --data ${bed_footprints} --motifs ${motif_path} --fasta ${fasta} -o ${name}.bed -min ${params.min_size_fp} -max ${params.max_size_fp} -p ${path_bin} """ } -testetst.println() + /* */ process reduce_bed { conda "${path_env}" + echo true input: set name, file (bed) from bed_for_reducing @@ -227,6 +278,7 @@ process reduce_bed { */ process clustering { conda "${path_env}" + echo true publishDir "${out}/cluster/", mode: 'copy', pattern: '*.bed' @@ -321,7 +373,6 @@ Output table has the information which clusters are similar to each other. 
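 (the -text output is tab separated; its first two columns, Query_ID and
 Target_ID, name each matched motif pair, following the standard MEME-suite
 tomtom output layout)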
*/ process find_similar_motifs { - input: file (merged_meme) from merged_meme @@ -369,8 +420,8 @@ process clustered_glam2 { output: set name, file ('*.meme') into clustered_meme_for_tomtom - set name, file ('*.meme') into clustered_meme_for_filter - file('*') + set name, file ('*.meme') into clustered_meme_for_filter + file('*') script: name = fasta.getBaseName() @@ -387,7 +438,7 @@ Tomtom searches motifs in databases. process tomtom { tag{name} - publishDir "${out}/esimated_motifs/tomtom/", mode: 'copy' + publishDir "${out}/esimated_motifs/tomtom/", mode: 'copy' input: set name, file (meme) from clustered_meme_for_tomtom From 1f45269bc54cc12a05396ade4f66a05a8865bc38 Mon Sep 17 00:00:00 2001 From: renewiegandt Date: Thu, 13 Dec 2018 23:24:18 +0100 Subject: [PATCH 120/158] Update README.md --- README.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index fe310d7..2452e7d 100644 --- a/README.md +++ b/README.md @@ -82,13 +82,14 @@ Optional arguments: --strand INT Align +/+ & +/- (= 1). Or align only +/+ (= 0). (Default: 0) Motif estimation: + --min_seq INT Minimum number of sequences required in the FASTA-files for GLAM2 (Default: 100) --motif_min_key INT Maximum number of key positions (aligned columns) (Default: 8) --motif_max_key INT Maximum number of key positions (aligned columns) (Default: 20) --iteration INT Number of iterations done by glam2. More Iterations: better results, higher runtime. (Default: 10000) --tomtom_treshold float Threshold for similarity score. (Default: 0.01) Motif clustering: - --edge_weight INT Minimum weight of edges in motif-cluster-graph (Default: 5) + --edge_weight INT Minimum weight of edges in motif-cluster-graph (Default: 50) --motif_similarity_thresh FLOAT threshold for motif similarity score (Default: 0.00001) Creating GTF: From b69682a933713c4246e7c82b19a67db0b834aa54 Mon Sep 17 00:00:00 2001 From: renewiegandt Date: Thu, 13 Dec 2018 23:25:03 +0100 Subject: [PATCH 121/158] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 2452e7d..69e9661 100644 --- a/README.md +++ b/README.md @@ -37,7 +37,7 @@ conda env create --name masterenv -f=$path ``` When the enviroment is created, set the variable 'path_env' in the configuration file as the path to it. -**Important Note:** For conda the channel bioconda needs to be set as highest priority! This required due two differnt packages with the same name in different channels. For the pipeline the package jellyfish from the channel bioconda is needed and **NOT** the jellyfisch package from the channel conda-forge! +**Important Note:** For conda the channel bioconda needs to be set as highest priority! This is required due to two differnt packages with the same name in different channels. For the pipeline the package jellyfish from the channel bioconda is needed and **NOT** the jellyfisch package from the channel conda-forge! ## Quick Start ```console From ee039515335cc5607ca89df267fa9e660103aaf5 Mon Sep 17 00:00:00 2001 From: basti Date: Fri, 14 Dec 2018 11:23:43 +0100 Subject: [PATCH 122/158] Fixed bug in formatting (gff vs gtf). Converted feature description to lowercase. 
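
(Illustrative before/after for the attribute column, ID and activity values
are placeholders based on the diff below:
  before: ID=ENSR00000000001; activity=liver, lung
  after:  ID "ENSR00000000001"; activity "liver, lung")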
--- bin/Modules/Ensembl/GTFGen.py | 8 +++++--- bin/Modules/SaveResults.py | 7 ++----- bin/Modules/ucsc/ucsc.py | 12 +++++++++--- 3 files changed, 16 insertions(+), 11 deletions(-) diff --git a/bin/Modules/Ensembl/GTFGen.py b/bin/Modules/Ensembl/GTFGen.py index 9753d39..ca57aa8 100644 --- a/bin/Modules/Ensembl/GTFGen.py +++ b/bin/Modules/Ensembl/GTFGen.py @@ -51,7 +51,7 @@ def reformat_to_gff(self, activity, release): # Add RegBuild_ + release templist.append("RegBuild_"+release) # Add Description from Description in last ; separated segment - templist.append(splitted_additional[4].split("=")[1]) + templist.append(splitted_additional[4].split("=")[1].lower()) # Add Start / End Data from original templist.extend(splitted[3:5]) # Add Score, Strand and Frame Data @@ -69,10 +69,12 @@ def reformat_to_gff(self, activity, release): def generate_additional_information(gene_id, activity): if gene_id.startswith("ID=regulatory_region:"): - gene_id = "ID="+gene_id.split(":")[1] + gene_id = 'ID "'+gene_id.split(':')[1]+'"' + + activity_string = 'activity "'+', '.join(activity)+'"' # helper method to concat activity information to string - return "; ".join([gene_id, "activity="+", ".join(activity)]) + return gene_id+'; '+activity_string def generate_activity_list(self, activity, index): # generates activity list diff --git a/bin/Modules/SaveResults.py b/bin/Modules/SaveResults.py index d285c25..66c2d97 100644 --- a/bin/Modules/SaveResults.py +++ b/bin/Modules/SaveResults.py @@ -1,10 +1,8 @@ -import csv import os class ResultSaver: - """ class to save the results. Path is dependent on the data_dir, tissuetype and mapped = True or False. @@ -26,7 +24,6 @@ def __init__(self, results, organism, wd, mapped, is_data_dir, tissue, out): else: self.path = os.path.join(out+"/"+organism+".gtf") - with open(self.path, "w") as file: - write_it = csv.writer(file, delimiter='\t') + with open(self.path, "w") as output_file: for line in results: - write_it.writerow(line) + output_file.write("\t".join(line)+"\n") diff --git a/bin/Modules/ucsc/ucsc.py b/bin/Modules/ucsc/ucsc.py index cd55976..a554a4d 100644 --- a/bin/Modules/ucsc/ucsc.py +++ b/bin/Modules/ucsc/ucsc.py @@ -41,13 +41,13 @@ def read_gff_to_gtf(self): sequence = [] sequence.append(row[0]) sequence.append("UCSC") - sequence.append(row[3]) + sequence.append(row[3].lower()) sequence.append(row[1]) sequence.append(row[2]) sequence.append(".") sequence.append(row[5]) sequence.append(".") - sequence.append("; ".join([self.find_ID("".join(row[11:])), ", ".join(self.get_activity(".".join(row[11:])))])) + sequence.append('; '.join([self.find_ID(''.join(row[11:])), 'activity \"'+", ".join(self.get_activity(''.join(row[11:]))) + '"'])) gtf_lines.append(sequence) return gtf_lines @@ -55,8 +55,14 @@ def read_gff_to_gtf(self): def find_ID(self, line): # Find RefSeq ID in Line pattern = re.compile(r'ID:[0-9]{,9}|$') + ref_id = re.search(pattern, line).group() + splitted = ref_id.split(":") + if len(splitted) == 2: + returnstring = str(splitted[0])+' "'+str(splitted[1])+'"' + else: + returnstring = 'ID '+'"NA"' - return re.search( pattern, line).group() + return returnstring def get_activity(self, line): # Find activity categories in bed file From 7aaf90c7ed9be0c9c654da61a4ed5895eb4ca1e4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ren=C3=A9=20Wiegandt?= Date: Fri, 14 Dec 2018 06:34:48 -0500 Subject: [PATCH 123/158] fixes --- README.md | 1 + bin/compareBed.sh | 12 +++--- bin/merge.R | 3 +- config/cluster.config | 2 +- config/motif_estimation.config | 5 ++- pipeline.nf | 79 
+++++++++++++++++++++++----------- 6 files changed, 66 insertions(+), 36 deletions(-) diff --git a/README.md b/README.md index e6b69e9..1e4c4be 100644 --- a/README.md +++ b/README.md @@ -86,6 +86,7 @@ Optional arguments: --tomtom_treshold float Threshold for similarity score. (Default: 0.01) Motif clustering: + --cluster_motif BOOLEAN IF its 1 motifs will be clustered (Default: 0) --edge_weight INT Minimum weight of edges in motif-cluster-graph (Default: 5) --motif_similarity_thresh FLOAT threshold for motif similarity score (Default: 0.00001) diff --git a/bin/compareBed.sh b/bin/compareBed.sh index f635283..426f3b4 100644 --- a/bin/compareBed.sh +++ b/bin/compareBed.sh @@ -148,7 +148,7 @@ then # mkdir workdir1337 #fi wo=true - workdir="." + workdir=$path fi if [ $ou == false ] @@ -223,7 +223,7 @@ echo calc maxScore Rscript --vanilla $path/maxScore.R "$workdir"/pass1Fa.bed #3 subtract overlapping regions for additional motifs -echo "also get subsequences with no overlap of overlapping sequences" +#echo "also get subsequences with no overlap of overlapping sequences" help=true if [ -d $"motifs" ] @@ -265,11 +265,13 @@ fi #4. remove short/long motivs, make unique ids (relevant for some splitted tfbs from subtract) and handle maxScorePosition # also creates a small output file with information about the comparison -Rscript --vanilla $path/merge.R $min $max "$workdir" $data - +Rscript --vanilla $path/merge.R $min $max $workdir $data +echo "------------------------------------------------------------" +echo $workdir +head -100 $workdir/merged.bed #5. add fasta sequences to bed and create fasta file -bedtools getfasta -fi $fasta -bed "$workdir"/merged.bed -bedOut > "$output" +bedtools getfasta -fi $fasta -bed $workdir/merged.bed -bedOut > $output bedtools getfasta -name -fi $fasta -bed "$output" -fo "$output".fasta #6 clean up diff --git a/bin/merge.R b/bin/merge.R index c2015aa..0d349b5 100644 --- a/bin/merge.R +++ b/bin/merge.R @@ -22,7 +22,7 @@ splitted$length=splitted$stop - splitted$start splitted=cbind(splitted, containsMaxpos=0) splitted$containsMaxpos[intersect(which(splitted$start <= splitted$maxpos), which(splitted$stop > splitted$maxpos))] = 1 splitted$maxpos = splitted$maxpos - splitted$start -fwrite(splitted, paste(folder, "/merged.bed", sep=''), row.names=FALSE, col.names=FALSE, quote=FALSE, sep='\t') +data.table::fwrite(splitted, paste(folder, "/merged.bed", sep=''), row.names=FALSE, col.names=FALSE, quote=FALSE, sep='\t') before = fread(args[4], header=FALSE) @@ -34,4 +34,3 @@ lengthb = formatC(mean(before$V3-before$V2), digits=4) lengtha = formatC(mean(splitted$length), digits=4) stats=data.frame(sum_nt_input=sumb, sum_nt_filtered=suma, factor=difference, loss=loss, mean_length_input=lengthb, mean_length_filtered=lengtha) write.table(stats, "../FilterMotifs.stats", row.names=FALSE, quote=FALSE, sep='\t') - diff --git a/config/cluster.config b/config/cluster.config index 1b444e2..1ba0b33 100644 --- a/config/cluster.config +++ b/config/cluster.config @@ -9,7 +9,7 @@ params{ global=0 identity=0.8 sequence_coverage=8 - memory=800 + memory=2000 throw_away_seq=9 strand=0 } diff --git a/config/motif_estimation.config b/config/motif_estimation.config index d12e13c..6d9712a 100644 --- a/config/motif_estimation.config +++ b/config/motif_estimation.config @@ -1,6 +1,6 @@ params { //bed_to_clustered_fasta - min_seq = 10 + min_seq = 50 //glam2 motif_min_len = 8 @@ -11,7 +11,8 @@ params { tomtom_treshold = 0.01 //motif Clustering - edge_weight = 5 + cluster_motif = 0 + edge_weight = 50 
motif_similarity_thresh = 0.00001 best_motif = 3 } diff --git a/pipeline.nf b/pipeline.nf index 18b6745..4631803 100644 --- a/pipeline.nf +++ b/pipeline.nf @@ -34,7 +34,7 @@ params.tfbs_path="" //motif_estimation //bed_to_clustered_fasta - params.min_seq = 10 // Minimum number of sequences in the fasta-files for glam2 + params.min_seq = 100 // Minimum number of sequences in the fasta-files for glam2 //glam2 params.motif_min_key = 8 // Minimum number of key positions (aligned columns) @@ -45,7 +45,8 @@ params.tfbs_path="" params.tomtom_treshold = 0.01 // threshold for similarity score. //cluster motifs - params.edge_weight = 5 // Minimum weight of edges in motif-cluster-graph + params.cluster_motif = 0 // Boolean if 1 motifs are clustered else they are not + params.edge_weight = 50 // Minimum weight of edges in motif-cluster-graph motif_similarity_thresh = 0.00001 // threshold for motif similarity score params.best_motif = 3 // Top n motifs per cluster @@ -129,8 +130,8 @@ process footprint_extraction { conda "${path_env}" tag{name} - publishDir "${out}/footprint_extraction/", mode: 'copy', pattern: '*.bed' - publishDir "${out}/footprint_extraction/log", mode: 'copy', pattern: '*.log' + publishDir "${params.out}/footprint_extraction/", mode: 'copy', pattern: '*.bed' + publishDir "${params.out}/footprint_extraction/log", mode: 'copy', pattern: '*.log' input: set name, file (bigWig), file (bed) from footprint_in @@ -152,7 +153,7 @@ process extract_known_TFBS { conda "${path_env}" - publishDir "${out}/known_TFBS/", mode: 'copy', pattern: '*.bed' + publishDir "${params.out}/known_TFBS/", mode: 'copy', pattern: '*.bed' input: file (fasta) from fa_overlap @@ -187,28 +188,30 @@ if(params.tfbs_path == "") { */ process overlap_with_known_TFBS { conda "${path_env}" + echo true - publishDir "${out}/unknown_overlap/", mode: 'copy', pattern: '*.bed' + publishDir "${params.out}/unknown_overlap/" input: set name, file (bed_footprints), val (bed_motifs), file (fasta) from for_overlap output: - set name, file ("${name}.bed") into bed_for_reducing - set name, file ("${name}.bed") into testetst + set name, file ("*.bed") into bed_for_reducing + //set name, file ("${name}_un.bed") into testetst script: motif_list = bed_motifs.toString().replaceAll(/\s|\[|\]/,"") """ - ${path_bin}/compareBed.sh --data ${bed_footprints} --motifs ${motif_path} --fasta ${fasta} -o ${name}.bed -min ${params.min_size_fp} -max ${params.max_size_fp} -p ${path_bin} + ${path_bin}/compareBed.sh --data ${bed_footprints} --motifs ${motif_path} --fasta ${fasta} -o ${name}_un.bed -min ${params.min_size_fp} -max ${params.max_size_fp} -p ${path_bin} """ } -testetst.println() +//testetst.println() /* */ process reduce_bed { conda "${path_env}" + echo true input: set name, file (bed) from bed_for_reducing @@ -227,8 +230,9 @@ process reduce_bed { */ process clustering { conda "${path_env}" + echo true - publishDir "${out}/cluster/", mode: 'copy', pattern: '*.bed' + publishDir "${params.out}/cluster/", mode: 'copy', pattern: '*.bed' input: set name, file (bed) from bed_for_clustering @@ -254,9 +258,6 @@ process bed_to_clustered_fasta { input: set name, file (bed) from bed_for_motif_esitmation - when: - params.fasta == false - output: file ('*.FASTA') into fasta_for_glam2 file ('*.FASTA') into fasta_for_motif_cluster @@ -280,16 +281,19 @@ Generating Motifs through alignment and scoring best local matches. 
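 //(flag sketch for the glam2 calls below: -a and -b bound the number of key
 //positions, i.e. aligned columns, -z sets the minimum number of sequences in
 //the alignment, and -n the number of iterations)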
process glam2 { tag{name} + publishDir "${params.out}/esimated_motifs/clustered_motifs/${name}/", mode: 'copy' input: set name, file (fasta) from fasta_for_glam2 output: file("${name}/*.meme") into meme_to_merge + set name, file("${name}/*.meme") into meme_for_tomtom + set name, file("${name}/*.meme") into meme_for_filter script: """ - glam2 n ${fasta} -O . -a ${params.motif_min_key} -b ${params.motif_max_key} -z 5 -n ${params.iteration} + glam2 n ${fasta} -O ./${name}/ -a ${params.motif_min_key} -b ${params.motif_max_key} -z 5 -n ${params.iteration} """ } @@ -299,12 +303,17 @@ The paths are sorted numerically depending on the cluster number. */ process merge_meme { + publishDir "${params.out}/esimated_motifs/merged_meme/", mode: 'copy' + input: val (memelist) from meme_to_merge.toList() output: file ('merged_meme.meme') into merged_meme + when: + params.cluster_motif == 1 + script: memes = memelist.collect{it.toString().replaceAll(/\/glam2.meme/,"") } meme_sorted = memes.sort(false) { it.toString().tokenize('_')[-1] as Integer } @@ -321,13 +330,16 @@ Output table has the information which clusters are similar to each other. */ process find_similar_motifs { - + publishDir "${params.out}/esimated_motifs/cluster_similarity/", mode: 'copy' input: file (merged_meme) from merged_meme output: file ('all_to_all.tsv') into motif_similarity + when: + params.cluster_motif == 1 + script: """ tomtom ${merged_meme} ${merged_meme} -thresh ${params.motif_similarity_thresh} -text --norc | sed '/^#/ d' | sed '/^\$/d' > all_to_all.tsv @@ -340,17 +352,21 @@ files_for_merge_fasta = motif_similarity.combine(fasta_for_motif_cluster) process merge_fasta { conda "${path_env}" + publishDir "${params.out}/esimated_motifs/merged_fasta/", mode: 'copy' input: set file (motiv_sim), val (fasta_list) from files_for_merge_fasta output: file ('*.fasta') into motif_clustered_fasta_list - file('*.png') + file('*.png') + + when: + params.cluster_motif == 1 script: - fa_sorted = fasta_list.sort(false) { it.getBaseName().tokenize('_')[-1] as Integer } - fastalist = fa_sorted.toString().replaceAll(/\s|\[|\]/,"") + fa_sorted = fasta_list.sort(false) { it.getBaseName().tokenize('_')[-1] as Integer } + fastalist = fa_sorted.toString().replaceAll(/\s|\[|\]/,"") """ Rscript ${path_bin}/merge_similar_clusters.R ${motiv_sim} ${fastalist} ${params.edge_weight} """ @@ -362,7 +378,7 @@ motif_clustered_fasta_flat = motif_clustered_fasta_list.flatten() process clustered_glam2 { - publishDir "${out}/esimated_motifs/${name}/", mode: 'copy' + publishDir "${params.out}/final_esimated_motifs/${name}/", mode: 'copy' input: file (fasta) from motif_clustered_fasta_flat @@ -372,13 +388,24 @@ process clustered_glam2 { set name, file ('*.meme') into clustered_meme_for_filter file('*') + when: + params.cluster_motif == 1 + script: - name = fasta.getBaseName() + name = fasta.getBaseName() """ glam2 n ${fasta} -O . -a ${params.motif_min_key} -b ${params.motif_max_key} -z 5 -n ${params.iteration} """ } +if(params.cluster_motif == 1){ + for_tomtom = clustered_meme_for_tomtom + for_filter = clustered_meme_for_filter +} else { + for_tomtom = meme_for_tomtom + for_filter = meme_for_filter +} + /* Running Tomtom on meme-files generated by GLAM2. @@ -387,10 +414,10 @@ Tomtom searches motifs in databases. 
process tomtom { tag{name} - publishDir "${out}/esimated_motifs/tomtom/", mode: 'copy' + publishDir "${params.out}/final_esimated_motifs/tomtom/", mode: 'copy' input: - set name, file (meme) from clustered_meme_for_tomtom + set name, file (meme) from for_tomtom output: set name, file ('*.tsv') into tsv_for_filter @@ -404,8 +431,8 @@ process tomtom { //Joining channels with meme and tsv files. Filter joined channel on line count. //Only meme-files which corresponding tsv files have linecount <= 1 are writen to next channel. -for_filter = clustered_meme_for_filter.join( tsv_for_filter ) -for_filter +for_filter2 = for_filter.join( tsv_for_filter ) +for_filter2 .filter { name, meme, tsv -> long count = tsv.readLines().size() count <= 1 @@ -434,7 +461,7 @@ process check_for_unknown_motifs { process get_best_motif { conda "${path_env}" - publishDir "${out}/esimated_motifs/unknown_motifs/", mode: 'copy' + publishDir "${params.out}/esimated_motifs/unknown_motifs/", mode: 'copy' input: set name, file(meme), file(tsv) from meme_for_scan From 0805573964ba2a9c44185acba4ed45817b7d7eb1 Mon Sep 17 00:00:00 2001 From: JannikHamp Date: Fri, 14 Dec 2018 13:33:56 +0100 Subject: [PATCH 124/158] standalone version --- compareBed.sh | 270 ++++++++++++++++++++++++++++++++++++++++++++++++++ maxScore.R | 10 ++ merge.R | 60 +++++++++++ 3 files changed, 340 insertions(+) create mode 100644 compareBed.sh create mode 100644 maxScore.R create mode 100644 merge.R diff --git a/compareBed.sh b/compareBed.sh new file mode 100644 index 0000000..abd4c74 --- /dev/null +++ b/compareBed.sh @@ -0,0 +1,270 @@ +#!/bin/bash +#data +#motifs +#workdir +#fasta +#min +#max +#output +wrong=() +da=false +mo=false +wo=false +fa=false +mi=false +ma=false +ou=false +he=false + +script_dir=`echo $0 | sed 's/\/[^\/]*$/\//g'` + +if [ $# -eq 0 ] +then + he=true +fi + +while [[ $# -gt 0 ]] +do +key="$1" + +case $key in + -d|--data) + data="$2" + da=true + shift + shift + ;; + -m|--motifs) + motifs="$2" + mo=true + shift + shift + ;; + -w|--workdir) + workdir="$2" + wo=true + shift + shift + ;; + -f|--fasta) + fasta="$2" + fa=true + shift + shift + ;; + -min|--min) + min=$2 + mi=true + shift + shift + ;; + -max|--max) + max=$2 + ma=true + shift + shift + ;; + -o|--output) + output="$2" + ou=true + shift + shift + ;; + -h|--help) + help=true + he=true + shift + ;; + *) + wrong+=("$1") + shift + ;; +esac +done + +count=${#wrong[@]} +if [ $count -gt 0 ] +then + for i in ${wrong[@]} + do + echo wrong parameter $i + echo call script without parameters for help or call --help + echo exit + done +exit 1 +fi + +if [ $he == true ] +then + echo "This script utilies bedtools to select new footprints from data." + echo "Therefore the data is compared with known footprint motifs." + echo "The output is a new .bed file with added sequence information." + echo "Required arguments are data and motifs, both in bed-format, and the fasta genome sequences." + echo "If a parameter is chosen, a value must be provided or an error will occur." + echo "--------------------" + echo "usage: compareBed.sh --data --motifs --fasta [optional_parameter ] ..." 
+ echo "--------------------" + echo " required parameter:" + echo " -d --data the path to the .bed file of the footprints" + echo " -m --motifs Either the path to the directory where the .bed files of the scanned motifs are stored" + echo " Or a comma seperated list of files, e.g.: path1/file1,path2/file2,path3/file3" + echo " -f --fasta the path to the .fasta file of genome" + echo " " + echo " optional parameter:" + echo " -w --workdir the path to directory where temporary files will be created" + echo " default is a new directory that is created in the current directory" + echo " -min --min minimum size of footprints; default is 10" + echo " -max --max maximum size of footprints; default is 80" + echo " -o --output output path/file ; default dir is workdir and filename is newMotifs.bed and newMotifs.bed.fasta" + echo " -h --help shows this help message" +exit 0 +fi + +echo selected parameters +echo ------------------- +echo data: $da \(required\) +echo motifs: $mo \(required\) +echo workdir: $wo +echo fasta: $fa \(required\) +echo min: $mi +echo max: $ma +echo output: $ou +echo help: $he + +if [ $da == false ] || [ $mo == false ] || [ $fa == false ] +then + echo required parameters not given. + echo required are: --data \ --motifs \ --fasta \ + exit 1 +fi + +if [ $wo == false ] +then + #if [ ! -d workdir1337 ] + #then + # mkdir workdir1337 + #fi + wo=true + workdir="." +fi + +if [ $ou == false ] +then + output=${workdir}/"newMotifs.bed" + ou=true +fi + +if [ $mi == false ] +then + min=10 + mi=true +fi + +if [ $ma == false ] +then + max=80 + ma=true +fi + +if [ ! -d $workdir ] +then + mkdir $workdir +fi + +#1. first filter. no overlap vs. overlap +echo get sequences with no overlap +cat $data > "$workdir"/pass1Tr.bed +help=true + +if [ -d "$motifs" ] +then +for i in "$motifs"/*.bed +do + if [ $help == true ] + then + help=false + bedtools intersect -v -a "$workdir"/pass1Tr.bed -b $i > "$workdir"/pass1TrHelp.bed + else + help=true + bedtools intersect -v -a "$workdir"/pass1TrHelp.bed -b $i > "$workdir"/pass1Tr.bed + fi + echo $i +done +else +declare -a motiffiles=(`echo $motifs | sed 's/,/ /g'`) +for i in ${motiffiles[@]} +do + if [ $help == true ] + then + help=false + bedtools intersect -v -a "$workdir"/pass1Tr.bed -b $i > "$workdir"/pass1TrHelp.bed + else + help=true + bedtools intersect -v -a "$workdir"/pass1TrHelp.bed -b $i > "$workdir"/pass1Tr.bed + fi + echo $i +done +fi + +if [ $help == false ] +then + cat "$workdir"/pass1TrHelp.bed > "$workdir"/pass1Tr.bed +fi +echo get overlapping sequences +bedtools intersect -v -a $data -b "$workdir"/pass1Tr.bed > "$workdir"/pass1Fa.bed +cat "$workdir"/pass1Fa.bed | wc -l + +echo calc maxScore +#2. 
compute absolut maxscore position +Rscript --vanilla ${script_dir}maxScore.R "$workdir"/pass1Fa.bed + +#3 subtract overlapping regions for additional motifs +echo "also get subsequences with no overlap of overlapping sequences" +help=true + +if [ -d $"motifs" ] +then +for i in "$motifs"/*.bed +do + if [ $help == true ] + then + help=false + bedtools subtract -a "$workdir"/pass1Fa.bed -b $i > "$workdir"/pass1FaHelp.bed + else + help=true + bedtools subtract -a "$workdir"/pass1FaHelp.bed -b $i > "$workdir"/pass1Fa.bed + fi + echo $i +done +else +for i in ${motiffiles[@]} +do + if [ $help == true ] + then + help=false + bedtools subtract -a "$workdir"/pass1Fa.bed -b $i > "$workdir"/pass1FaHelp.bed + else + help=true + bedtools subtract -a "$workdir"/pass1FaHelp.bed -b $i > "$workdir"/pass1Fa.bed + fi + echo $i +done +fi + + +if [ $help == false ] +then + cat "$workdir"/pass1FaHelp.bed > "$workdir"/pass2Tr.bed +else + cat "$workdir"/pass1Fa.bed > "$workdir"/pass2Tr.bed +fi + +#4. remove short/long motivs, make unique ids (relevant for some splitted tfbs from subtract) and handle maxScorePosition +Rscript --vanilla ${script_dir}merge.R $min $max "$workdir" $data + +#5. add fasta sequences to bed and create fasta file +bedtools getfasta -fi $fasta -bed "$workdir"/merged.bed -bedOut > "$output" +bedtools getfasta -name -fi $fasta -bed "$output" -fo "$output".fasta + +#6 clean up +rm "$workdir"/pass1Fa.bed "$workdir"/pass1Tr.bed "$workdir"/pass2Tr.bed "$workdir"/merged.bed "$workdir"/pass1FaHelp.bed "$workdir"/pass1TrHelp.bed \ No newline at end of file diff --git a/maxScore.R b/maxScore.R new file mode 100644 index 0000000..7970d82 --- /dev/null +++ b/maxScore.R @@ -0,0 +1,10 @@ +#!/home/jhamp/.conda/envs/tfbs/bin/Rscript +library(data.table) +args = commandArgs(TRUE) +file = args[1] + +tab = fread(file) +colnames(tab) = c("chromosome", "start", "stop", "id", "score", "length", "maxpos", "bonus_info") +tab$maxpos = tab$start + tab$maxpos + +fwrite(tab, file, row.names=FALSE, col.names=FALSE, quote=FALSE, sep='\t') diff --git a/merge.R b/merge.R new file mode 100644 index 0000000..cfa5f03 --- /dev/null +++ b/merge.R @@ -0,0 +1,60 @@ +#!/home/jhamp/.conda/envs/tfbs/bin/Rscript +#library(data.table) +#args=commandArgs(TRUE) +#min=as.numeric(args[1]) +#max=as.numeric(args[2]) +#folder=args[3] +#splitted = fread(paste(folder, "/pass2Tr.bed", sep=''), header=FALSE) +#colnames(splitted) = c("chromosome", "start", "stop", "id", "score", "length", "maxpos", "bonus_info") +#p1 = fread(paste(folder, "/pass1Tr.bed", sep=''), header=TRUE) +#colnames(p1) = c("chromosome", "start", "stop", "id", "score", "length", "maxpos", "bonus_info") +#p1$maxpos = p1$start + p1$maxpos + +#splitted=rbind(splitted, p1) + +#splitted=splitted[which(splitted$stop - splitted$start >= min),] +#splitted=splitted[which(splitted$stop - splitted$start <= max),] +#splitted$id=make.unique(as.character(splitted$id)) +#splitted$length=splitted$stop - splitted$start + +#splitted=cbind(splitted, containsMaxpos=0) +#splitted$containsMaxpos[intersect(which(splitted$start <= splitted$maxpos), which(splitted$stop > splitted$maxpos))] = 1 +#splitted$maxpos = splitted$maxpos - splitted$start +#fwrite(splitted, paste(folder, "/merged.bed", sep=''), row.names=FALSE, col.names=FALSE, quote=FALSE, sep='\t') + +#!/bin/Rscript +library(data.table) +args=commandArgs(TRUE) +min=as.numeric(args[1]) +max=as.numeric(args[2]) +folder=args[3] + +splitted = fread(paste(folder, "/pass2Tr.bed", sep='')) +colnames(splitted) = c("chromosome", "start", "stop", 
"id", "score", "length", "maxpos", "info") +p1 = fread(paste(folder, "/pass1Tr.bed", sep='')) +colnames(p1) = c("chromosome", "start", "stop", "id", "score", "length", "maxpos", "info") + +p1$maxpos = p1$start + p1$maxpos + +splitted=rbind(splitted, p1) + +splitted=splitted[which(splitted$stop - splitted$start >= min),] +splitted=splitted[which(splitted$stop - splitted$start <= max),] +splitted$id=make.unique(as.character(splitted$id)) +splitted$length=splitted$stop - splitted$start + +splitted=cbind(splitted, containsMaxpos=0) +splitted$containsMaxpos[intersect(which(splitted$start <= splitted$maxpos), which(splitted$stop > splitted$maxpos))] = 1 +splitted$maxpos = splitted$maxpos - splitted$start +data.table::fwrite(splitted, paste(folder, "/merged.bed", sep=''), row.names=FALSE, col.names=FALSE, quote=FALSE, sep='\t') + +before = fread(args[4], header=FALSE) + +sumb=sum(before$V3-before$V2) +suma=sum(splitted$length) +difference = formatC(sumb/suma, digits=4) +loss = formatC(1 - suma/sumb, digits=2) +lengthb = formatC(mean(before$V3-before$V2), digits=4) +lengtha = formatC(mean(splitted$length), digits=4) +stats=data.frame(sum_nt_input=sumb, sum_nt_filtered=suma, factor=difference, loss=loss, mean_length_input=lengthb, mean_length_filtered=lengtha) +write.table(stats, "./FilterMotifs.stats", row.names=FALSE, quote=FALSE, sep='\t') From 6ef5dcbaf40bd4ffe482d4384d036cc19707195b Mon Sep 17 00:00:00 2001 From: JannikHamp Date: Fri, 14 Dec 2018 13:37:07 +0100 Subject: [PATCH 125/158] Delete compareBed.sh --- compareBed.sh | 270 -------------------------------------------------- 1 file changed, 270 deletions(-) delete mode 100644 compareBed.sh diff --git a/compareBed.sh b/compareBed.sh deleted file mode 100644 index abd4c74..0000000 --- a/compareBed.sh +++ /dev/null @@ -1,270 +0,0 @@ -#!/bin/bash -#data -#motifs -#workdir -#fasta -#min -#max -#output -wrong=() -da=false -mo=false -wo=false -fa=false -mi=false -ma=false -ou=false -he=false - -script_dir=`echo $0 | sed 's/\/[^\/]*$/\//g'` - -if [ $# -eq 0 ] -then - he=true -fi - -while [[ $# -gt 0 ]] -do -key="$1" - -case $key in - -d|--data) - data="$2" - da=true - shift - shift - ;; - -m|--motifs) - motifs="$2" - mo=true - shift - shift - ;; - -w|--workdir) - workdir="$2" - wo=true - shift - shift - ;; - -f|--fasta) - fasta="$2" - fa=true - shift - shift - ;; - -min|--min) - min=$2 - mi=true - shift - shift - ;; - -max|--max) - max=$2 - ma=true - shift - shift - ;; - -o|--output) - output="$2" - ou=true - shift - shift - ;; - -h|--help) - help=true - he=true - shift - ;; - *) - wrong+=("$1") - shift - ;; -esac -done - -count=${#wrong[@]} -if [ $count -gt 0 ] -then - for i in ${wrong[@]} - do - echo wrong parameter $i - echo call script without parameters for help or call --help - echo exit - done -exit 1 -fi - -if [ $he == true ] -then - echo "This script utilies bedtools to select new footprints from data." - echo "Therefore the data is compared with known footprint motifs." - echo "The output is a new .bed file with added sequence information." - echo "Required arguments are data and motifs, both in bed-format, and the fasta genome sequences." - echo "If a parameter is chosen, a value must be provided or an error will occur." - echo "--------------------" - echo "usage: compareBed.sh --data --motifs --fasta [optional_parameter ] ..." 
- echo "--------------------" - echo " required parameter:" - echo " -d --data the path to the .bed file of the footprints" - echo " -m --motifs Either the path to the directory where the .bed files of the scanned motifs are stored" - echo " Or a comma seperated list of files, e.g.: path1/file1,path2/file2,path3/file3" - echo " -f --fasta the path to the .fasta file of genome" - echo " " - echo " optional parameter:" - echo " -w --workdir the path to directory where temporary files will be created" - echo " default is a new directory that is created in the current directory" - echo " -min --min minimum size of footprints; default is 10" - echo " -max --max maximum size of footprints; default is 80" - echo " -o --output output path/file ; default dir is workdir and filename is newMotifs.bed and newMotifs.bed.fasta" - echo " -h --help shows this help message" -exit 0 -fi - -echo selected parameters -echo ------------------- -echo data: $da \(required\) -echo motifs: $mo \(required\) -echo workdir: $wo -echo fasta: $fa \(required\) -echo min: $mi -echo max: $ma -echo output: $ou -echo help: $he - -if [ $da == false ] || [ $mo == false ] || [ $fa == false ] -then - echo required parameters not given. - echo required are: --data \ --motifs \ --fasta \ - exit 1 -fi - -if [ $wo == false ] -then - #if [ ! -d workdir1337 ] - #then - # mkdir workdir1337 - #fi - wo=true - workdir="." -fi - -if [ $ou == false ] -then - output=${workdir}/"newMotifs.bed" - ou=true -fi - -if [ $mi == false ] -then - min=10 - mi=true -fi - -if [ $ma == false ] -then - max=80 - ma=true -fi - -if [ ! -d $workdir ] -then - mkdir $workdir -fi - -#1. first filter. no overlap vs. overlap -echo get sequences with no overlap -cat $data > "$workdir"/pass1Tr.bed -help=true - -if [ -d "$motifs" ] -then -for i in "$motifs"/*.bed -do - if [ $help == true ] - then - help=false - bedtools intersect -v -a "$workdir"/pass1Tr.bed -b $i > "$workdir"/pass1TrHelp.bed - else - help=true - bedtools intersect -v -a "$workdir"/pass1TrHelp.bed -b $i > "$workdir"/pass1Tr.bed - fi - echo $i -done -else -declare -a motiffiles=(`echo $motifs | sed 's/,/ /g'`) -for i in ${motiffiles[@]} -do - if [ $help == true ] - then - help=false - bedtools intersect -v -a "$workdir"/pass1Tr.bed -b $i > "$workdir"/pass1TrHelp.bed - else - help=true - bedtools intersect -v -a "$workdir"/pass1TrHelp.bed -b $i > "$workdir"/pass1Tr.bed - fi - echo $i -done -fi - -if [ $help == false ] -then - cat "$workdir"/pass1TrHelp.bed > "$workdir"/pass1Tr.bed -fi -echo get overlapping sequences -bedtools intersect -v -a $data -b "$workdir"/pass1Tr.bed > "$workdir"/pass1Fa.bed -cat "$workdir"/pass1Fa.bed | wc -l - -echo calc maxScore -#2. 
compute absolut maxscore position -Rscript --vanilla ${script_dir}maxScore.R "$workdir"/pass1Fa.bed - -#3 subtract overlapping regions for additional motifs -echo "also get subsequences with no overlap of overlapping sequences" -help=true - -if [ -d $"motifs" ] -then -for i in "$motifs"/*.bed -do - if [ $help == true ] - then - help=false - bedtools subtract -a "$workdir"/pass1Fa.bed -b $i > "$workdir"/pass1FaHelp.bed - else - help=true - bedtools subtract -a "$workdir"/pass1FaHelp.bed -b $i > "$workdir"/pass1Fa.bed - fi - echo $i -done -else -for i in ${motiffiles[@]} -do - if [ $help == true ] - then - help=false - bedtools subtract -a "$workdir"/pass1Fa.bed -b $i > "$workdir"/pass1FaHelp.bed - else - help=true - bedtools subtract -a "$workdir"/pass1FaHelp.bed -b $i > "$workdir"/pass1Fa.bed - fi - echo $i -done -fi - - -if [ $help == false ] -then - cat "$workdir"/pass1FaHelp.bed > "$workdir"/pass2Tr.bed -else - cat "$workdir"/pass1Fa.bed > "$workdir"/pass2Tr.bed -fi - -#4. remove short/long motivs, make unique ids (relevant for some splitted tfbs from subtract) and handle maxScorePosition -Rscript --vanilla ${script_dir}merge.R $min $max "$workdir" $data - -#5. add fasta sequences to bed and create fasta file -bedtools getfasta -fi $fasta -bed "$workdir"/merged.bed -bedOut > "$output" -bedtools getfasta -name -fi $fasta -bed "$output" -fo "$output".fasta - -#6 clean up -rm "$workdir"/pass1Fa.bed "$workdir"/pass1Tr.bed "$workdir"/pass2Tr.bed "$workdir"/merged.bed "$workdir"/pass1FaHelp.bed "$workdir"/pass1TrHelp.bed \ No newline at end of file From 91e1dcd9aefe98f2bd727d156b46d0b2ae26b4a7 Mon Sep 17 00:00:00 2001 From: JannikHamp Date: Fri, 14 Dec 2018 13:37:15 +0100 Subject: [PATCH 126/158] Delete maxScore.R --- maxScore.R | 10 ---------- 1 file changed, 10 deletions(-) delete mode 100644 maxScore.R diff --git a/maxScore.R b/maxScore.R deleted file mode 100644 index 7970d82..0000000 --- a/maxScore.R +++ /dev/null @@ -1,10 +0,0 @@ -#!/home/jhamp/.conda/envs/tfbs/bin/Rscript -library(data.table) -args = commandArgs(TRUE) -file = args[1] - -tab = fread(file) -colnames(tab) = c("chromosome", "start", "stop", "id", "score", "length", "maxpos", "bonus_info") -tab$maxpos = tab$start + tab$maxpos - -fwrite(tab, file, row.names=FALSE, col.names=FALSE, quote=FALSE, sep='\t') From 6588cdddb08f4fe72a8501188120a20e4ee96c76 Mon Sep 17 00:00:00 2001 From: JannikHamp Date: Fri, 14 Dec 2018 13:37:21 +0100 Subject: [PATCH 127/158] Delete merge.R --- merge.R | 60 --------------------------------------------------------- 1 file changed, 60 deletions(-) delete mode 100644 merge.R diff --git a/merge.R b/merge.R deleted file mode 100644 index cfa5f03..0000000 --- a/merge.R +++ /dev/null @@ -1,60 +0,0 @@ -#!/home/jhamp/.conda/envs/tfbs/bin/Rscript -#library(data.table) -#args=commandArgs(TRUE) -#min=as.numeric(args[1]) -#max=as.numeric(args[2]) -#folder=args[3] -#splitted = fread(paste(folder, "/pass2Tr.bed", sep=''), header=FALSE) -#colnames(splitted) = c("chromosome", "start", "stop", "id", "score", "length", "maxpos", "bonus_info") -#p1 = fread(paste(folder, "/pass1Tr.bed", sep=''), header=TRUE) -#colnames(p1) = c("chromosome", "start", "stop", "id", "score", "length", "maxpos", "bonus_info") -#p1$maxpos = p1$start + p1$maxpos - -#splitted=rbind(splitted, p1) - -#splitted=splitted[which(splitted$stop - splitted$start >= min),] -#splitted=splitted[which(splitted$stop - splitted$start <= max),] -#splitted$id=make.unique(as.character(splitted$id)) -#splitted$length=splitted$stop - splitted$start - 
-#splitted=cbind(splitted, containsMaxpos=0) -#splitted$containsMaxpos[intersect(which(splitted$start <= splitted$maxpos), which(splitted$stop > splitted$maxpos))] = 1 -#splitted$maxpos = splitted$maxpos - splitted$start -#fwrite(splitted, paste(folder, "/merged.bed", sep=''), row.names=FALSE, col.names=FALSE, quote=FALSE, sep='\t') - -#!/bin/Rscript -library(data.table) -args=commandArgs(TRUE) -min=as.numeric(args[1]) -max=as.numeric(args[2]) -folder=args[3] - -splitted = fread(paste(folder, "/pass2Tr.bed", sep='')) -colnames(splitted) = c("chromosome", "start", "stop", "id", "score", "length", "maxpos", "info") -p1 = fread(paste(folder, "/pass1Tr.bed", sep='')) -colnames(p1) = c("chromosome", "start", "stop", "id", "score", "length", "maxpos", "info") - -p1$maxpos = p1$start + p1$maxpos - -splitted=rbind(splitted, p1) - -splitted=splitted[which(splitted$stop - splitted$start >= min),] -splitted=splitted[which(splitted$stop - splitted$start <= max),] -splitted$id=make.unique(as.character(splitted$id)) -splitted$length=splitted$stop - splitted$start - -splitted=cbind(splitted, containsMaxpos=0) -splitted$containsMaxpos[intersect(which(splitted$start <= splitted$maxpos), which(splitted$stop > splitted$maxpos))] = 1 -splitted$maxpos = splitted$maxpos - splitted$start -data.table::fwrite(splitted, paste(folder, "/merged.bed", sep=''), row.names=FALSE, col.names=FALSE, quote=FALSE, sep='\t') - -before = fread(args[4], header=FALSE) - -sumb=sum(before$V3-before$V2) -suma=sum(splitted$length) -difference = formatC(sumb/suma, digits=4) -loss = formatC(1 - suma/sumb, digits=2) -lengthb = formatC(mean(before$V3-before$V2), digits=4) -lengtha = formatC(mean(splitted$length), digits=4) -stats=data.frame(sum_nt_input=sumb, sum_nt_filtered=suma, factor=difference, loss=loss, mean_length_input=lengthb, mean_length_filtered=lengtha) -write.table(stats, "./FilterMotifs.stats", row.names=FALSE, quote=FALSE, sep='\t') From 63548bd80dd310cc6264f23287df48ec36fd353e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ren=C3=A9=20Wiegandt?= Date: Fri, 14 Dec 2018 08:05:14 -0500 Subject: [PATCH 128/158] fix --- bin/compareBed.sh | 3 --- config/motif_estimation.config | 2 +- nextflow.config | 2 +- pipeline.nf | 4 ++-- 4 files changed, 4 insertions(+), 7 deletions(-) diff --git a/bin/compareBed.sh b/bin/compareBed.sh index 426f3b4..97e4a82 100644 --- a/bin/compareBed.sh +++ b/bin/compareBed.sh @@ -267,9 +267,6 @@ fi Rscript --vanilla $path/merge.R $min $max $workdir $data -echo "------------------------------------------------------------" -echo $workdir -head -100 $workdir/merged.bed #5. add fasta sequences to bed and create fasta file bedtools getfasta -fi $fasta -bed $workdir/merged.bed -bedOut > $output bedtools getfasta -name -fi $fasta -bed "$output" -fo "$output".fasta diff --git a/config/motif_estimation.config b/config/motif_estimation.config index 6d9712a..2445d12 100644 --- a/config/motif_estimation.config +++ b/config/motif_estimation.config @@ -1,6 +1,6 @@ params { //bed_to_clustered_fasta - min_seq = 50 + min_seq = 10 //glam2 motif_min_len = 8 diff --git a/nextflow.config b/nextflow.config index e35ca57..969a9be 100644 --- a/nextflow.config +++ b/nextflow.config @@ -1,5 +1,5 @@ wd = "/mnt/agnerds/Rene.Wiegandt/10_Master/masterJLU2018" -createTimeout = 60 +createTimeout = 40 params.threads=60 //Parameter for for scripts! Not for nextflow processes. 
params.config="${wd}/config/uropa.config" diff --git a/pipeline.nf b/pipeline.nf index a947549..b0608be 100644 --- a/pipeline.nf +++ b/pipeline.nf @@ -247,12 +247,12 @@ process overlap_with_known_TFBS { set name, file (bed_footprints), val (bed_motifs), file (fasta) from for_overlap output: - set name, file ("${name}.bed") into bed_for_reducing + set name, file ("${name}_unknown.bed") into bed_for_reducing script: motif_list = bed_motifs.toString().replaceAll(/\s|\[|\]/,"") """ - ${path_bin}/compareBed.sh --data ${bed_footprints} --motifs ${motif_path} --fasta ${fasta} -o ${name}_un.bed -min ${params.min_size_fp} -max ${params.max_size_fp} -p ${path_bin} + ${path_bin}/compareBed.sh --data ${bed_footprints} --motifs ${motif_path} --fasta ${fasta} -o ${name}_unknown.bed -min ${params.min_size_fp} -max ${params.max_size_fp} -p ${path_bin} """ } From 0d9892a8c98d11d211377e67ced9df9d078f9977 Mon Sep 17 00:00:00 2001 From: renewiegandt Date: Fri, 14 Dec 2018 14:09:36 +0100 Subject: [PATCH 129/158] removed echo true from process --- pipeline.nf | 1 - 1 file changed, 1 deletion(-) diff --git a/pipeline.nf b/pipeline.nf index b0608be..a078df6 100644 --- a/pipeline.nf +++ b/pipeline.nf @@ -239,7 +239,6 @@ if(params.tfbs_path == "") { */ process overlap_with_known_TFBS { conda "${path_env}" - echo true publishDir "${params.out}/unknown_overlap/" From 651363865bc2040e35f53c2d311c67e679c7b67f Mon Sep 17 00:00:00 2001 From: SebastianBeyvers Date: Fri, 14 Dec 2018 14:29:17 +0100 Subject: [PATCH 130/158] Readme formatted --- README.md | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index 131b443..a884632 100644 --- a/README.md +++ b/README.md @@ -89,13 +89,13 @@ Optional arguments: --tomtom_treshold float Threshold for similarity score. (Default: 0.01) Motif clustering: - --cluster_motif BOOLEAN IF its 1 motifs will be clustered (Default: 0) + --cluster_motif BOOLEAN IF its 1 motifs will be clustered (Default: 0) --edge_weight INT Minimum weight of edges in motif-cluster-graph (Default: 5) - --motif_similarity_thresh FLOAT threshold for motif similarity score (Default: 0.00001) + --motif_similarity_thresh FLOAT threshold for motif similarity score (Default: 0.00001) - Creating GTF: - --tissue STRING Filter for one or more tissue/category activity, categories as in JSON config (Default: None) - --organism STRING Source organism: [ hg19 | hg38 or mm9 | mm10 ] (Default: hg38) + Creating GTF: + --tissue STRING Filter for one or more tissue/category activity, categories as in JSON config (Default: None) + --organism STRING Source organism: [ hg19 | hg38 or mm9 | mm10 ] (Default: hg38) All arguments can be set in the configuration files. 
``` From 8a306b1567104f055f38c714467fc86565efeb08 Mon Sep 17 00:00:00 2001 From: JannikHamp Date: Fri, 14 Dec 2018 14:44:51 +0100 Subject: [PATCH 131/158] $"motifs" falsch -> "$motifs" richtig :) --- bin/compareBed.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/compareBed.sh b/bin/compareBed.sh index 97e4a82..db58a2c 100644 --- a/bin/compareBed.sh +++ b/bin/compareBed.sh @@ -226,7 +226,7 @@ Rscript --vanilla $path/maxScore.R "$workdir"/pass1Fa.bed #echo "also get subsequences with no overlap of overlapping sequences" help=true -if [ -d $"motifs" ] +if [ -d "$motifs" ] then for i in "$motifs"/*.bed do From fa5c83888efff98d305eec4d20982035db55bf37 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ren=C3=A9=20Wiegandt?= Date: Fri, 14 Dec 2018 08:59:37 -0500 Subject: [PATCH 132/158] minor fix --- pipeline.nf | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/pipeline.nf b/pipeline.nf index b0608be..ef2b0f5 100644 --- a/pipeline.nf +++ b/pipeline.nf @@ -239,7 +239,6 @@ if(params.tfbs_path == "") { */ process overlap_with_known_TFBS { conda "${path_env}" - echo true publishDir "${params.out}/unknown_overlap/" @@ -302,7 +301,7 @@ Converting BED-File to one FASTA-File per cluster */ process bed_to_clustered_fasta { conda "${path_env}" - + publishDir "${params.out}/esimated_motifs/clustered_motifs/clustered_fasta/", mode: 'copy' tag{name} input: @@ -400,6 +399,10 @@ process find_similar_motifs { files_for_merge_fasta = motif_similarity.combine(fasta_for_motif_cluster) + +/* +Merging FASTA-files of similar clusters +*/ process merge_fasta { conda "${path_env}" publishDir "${params.out}/esimated_motifs/merged_fasta/", mode: 'copy' @@ -474,7 +477,7 @@ process tomtom { script: """ - tomtom ${meme} ${jaspar_db} -thresh ${params.thresh} -mi 1 -text | sed '/^#/ d' | sed '/^\$/d' > ${name}_known_motif.tsv + tomtom ${meme} ${params.motif_db} -thresh ${params.thresh} -mi 1 -text | sed '/^#/ d' | sed '/^\$/d' > ${name}_known_motif.tsv """ } From 8669b3e802d95e4eb879689f26d076392f4c2a0c Mon Sep 17 00:00:00 2001 From: JannikHamp Date: Fri, 14 Dec 2018 15:14:44 +0100 Subject: [PATCH 133/158] added info for FilterMotifs.stats --- bin/merge.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/merge.R b/bin/merge.R index 0d349b5..129e41d 100644 --- a/bin/merge.R +++ b/bin/merge.R @@ -32,5 +32,5 @@ difference = formatC(sumb/suma, digits=4) loss = formatC(1 - suma/sumb, digits=2) lengthb = formatC(mean(before$V3-before$V2), digits=4) lengtha = formatC(mean(splitted$length), digits=4) -stats=data.frame(sum_nt_input=sumb, sum_nt_filtered=suma, factor=difference, loss=loss, mean_length_input=lengthb, mean_length_filtered=lengtha) +stats=data.frame(sum_nt_input=sumb, sum_nt_filtered=suma, factor=difference, loss=loss, mean_length_input=lengthb, mean_length_filtered=lengtha, flag_1_ratio=length(which(splitted$containsMaxpos == 1))/dim(splitted)[1]) write.table(stats, "../FilterMotifs.stats", row.names=FALSE, quote=FALSE, sep='\t') From e54d5359c767ef457107e60bb8b2e90c28e4b970 Mon Sep 17 00:00:00 2001 From: renewiegandt Date: Fri, 14 Dec 2018 16:52:52 +0100 Subject: [PATCH 134/158] Update pipeline.nf --- pipeline.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pipeline.nf b/pipeline.nf index ef2b0f5..c2f3511 100644 --- a/pipeline.nf +++ b/pipeline.nf @@ -55,7 +55,7 @@ params.tfbs_path="" params.organism="hg38" params.tissue="" - params.tmpdir = "/mnt/workspace1/rene.wiegandt/tmp/" + params.tmpdir = "./" if (params.bigwig == "" || params.bed == 
"" || params.genome_fasta == "" || params.motif_db == "" || params.config == ""){ log.info """ From 906942e1bc02c9faa421fb7e48d01932fb8aa7df Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ren=C3=A9=20Wiegandt?= Date: Fri, 14 Dec 2018 18:21:58 -0500 Subject: [PATCH 135/158] major bugfix --- bin/bed_to_fasta.R | 8 ++++---- pipeline.nf | 7 ++++--- 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/bin/bed_to_fasta.R b/bin/bed_to_fasta.R index 13bd026..6cefe43 100644 --- a/bin/bed_to_fasta.R +++ b/bin/bed_to_fasta.R @@ -9,17 +9,17 @@ args = commandArgs(trailingOnly = TRUE) bedInput <- args[1] -prefix <- args[2] -min_seq <- args[3] +prefix <- args[2] +min_seq <- args[3] bed <- data.table::fread(bedInput, header = FALSE, sep = "\t") -clusters <- split(bed, bed$V8, sorted = TRUE, flatten = FALSE) # <---- Spalte mit Cluster +clusters <- split(bed, bed$V11, sorted = TRUE, flatten = FALSE) # <---- Spalte mit Cluster discard <- lapply(1:length(clusters), function(i){ clust <- as.data.frame(clusters[i]) print(nrow(clust)) if (nrow(clust) >= as.numeric(min_seq) ) { - sequences <- as.list(clust[[7]]) # <---- Splate mit Sequenz + sequences <- as.list(clust[[10]]) # <---- Splate mit Sequenz outfile <- paste0(prefix,"_cluster_",i,".FASTA") seqinr::write.fasta(sequences = sequences, names = clust[[4]], file.out = outfile, as.string = TRUE) # <---- Spalte mit Name } else { diff --git a/pipeline.nf b/pipeline.nf index ef2b0f5..c20c277 100644 --- a/pipeline.nf +++ b/pipeline.nf @@ -55,7 +55,7 @@ params.tfbs_path="" params.organism="hg38" params.tissue="" - params.tmpdir = "/mnt/workspace1/rene.wiegandt/tmp/" + params.tmpdir = "./" if (params.bigwig == "" || params.bed == "" || params.genome_fasta == "" || params.motif_db == "" || params.config == ""){ log.info """ @@ -240,7 +240,7 @@ if(params.tfbs_path == "") { process overlap_with_known_TFBS { conda "${path_env}" - publishDir "${params.out}/unknown_overlap/" + publishDir "${params.out}/unknown_overlap/", mode :'copy' input: set name, file (bed_footprints), val (bed_motifs), file (fasta) from for_overlap @@ -339,6 +339,7 @@ process glam2 { file("${name}/*.meme") into meme_to_merge set name, file("${name}/*.meme") into meme_for_tomtom set name, file("${name}/*.meme") into meme_for_filter + file ('*') script: """ @@ -477,7 +478,7 @@ process tomtom { script: """ - tomtom ${meme} ${params.motif_db} -thresh ${params.thresh} -mi 1 -text | sed '/^#/ d' | sed '/^\$/d' > ${name}_known_motif.tsv + tomtom ${meme} ${params.motif_db} -thresh ${params.tomtom_treshold} -mi 1 -text | sed '/^#/ d' | sed '/^\$/d' > ${name}_known_motif.tsv """ } From 1e3b771493d084fd651025d16c845316a8deac2a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ren=C3=A9=20Wiegandt?= Date: Fri, 14 Dec 2018 18:38:57 -0500 Subject: [PATCH 136/158] minor changes --- pipeline.nf | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/pipeline.nf b/pipeline.nf index c20c277..5cd3c8c 100644 --- a/pipeline.nf +++ b/pipeline.nf @@ -8,6 +8,8 @@ params.motif_db="" params.config="" params.tfbs_path="" +params.create_known_tfbs_path = "./" + //peak_calling params.window_length = 200 params.step = 100 @@ -55,8 +57,6 @@ params.tfbs_path="" params.organism="hg38" params.tissue="" - params.tmpdir = "./" - if (params.bigwig == "" || params.bed == "" || params.genome_fasta == "" || params.motif_db == "" || params.config == ""){ log.info """ Usage: nextflow run pipeline.nf --bigwig [BigWig-file] --bed [BED-file] --genome_fasta [FASTA-file] --motif_db [MEME-file] @@ -65,7 +65,10 @@ Required 
arguments: --bigwig Path to BigWig-file --bed Path to BED-file --genome_fasta Path to genome in FASTA-format - --jaspar_db Path to motif-database in MEME-format + --motif_db Path to motif-database in MEME-format + --config Path to UROPA configuration file + --create_known_tfbs_path Path to directory where output from tfbsscan (known motifs) are stored. + Path can be set as tfbs_path in next run. (Default: './') Optional arguments: @@ -101,13 +104,14 @@ Optional arguments: --motif_max_len INT Maximum length of Motif (Default: 20) --iteration INT Number of iterations done by glam2. More Iterations: better results, higher runtime. (Default: 10000) --tomtom_treshold float Threshold for similarity score. (Default: 0.01) + --best_motif INT Get the best X motifs per cluster. (Default: 3) Moitf clustering: --edge_weight INT Minimum weight of edges in motif-cluster-graph (Default: 5) --motif_similarity_thresh FLOAT Threshold for motif similarity score (Default: 0.00001) Creating GTF: - --organism [homo_sapiens | mus_musculus] + --organism [hg38 | hg19 | mm9 | mm10] --tissues All arguments can be set in the configuration files. @@ -217,7 +221,7 @@ process extract_known_TFBS { script: """ - python ${path_bin}/tfbsscan.py --use moods --core ${params.threads} -m ${db} -g ${fasta} -o ${params.tmpdir} + python ${path_bin}/tfbsscan.py --use moods --core ${params.threads} -m ${db} -g ${fasta} -o ${params.create_known_tfbs_path} """ } @@ -226,7 +230,7 @@ Sets path to known tfbs BED-files. If tfbs_path is set the process extract_known */ if(params.tfbs_path == "") { bed_for_overlap_with_TFBS.combine(known_TFBS_for_overlap.toList()).combine(fa_overlap_2).set {for_overlap} - motif_path = params.tmpdir + motif_path = params.create_known_tfbs_path } else { Channel.from('skipped').set {workaround} bed_for_overlap_with_TFBS.combine(workaround).combine(fa_overlap_2).set {for_overlap} From fafe237bca92cc32b20beabd6253c74f15aa81d7 Mon Sep 17 00:00:00 2001 From: renewiegandt Date: Sat, 15 Dec 2018 00:41:07 +0100 Subject: [PATCH 137/158] Update README.md --- README.md | 77 ++++++++++++++++++++++++------------------------------- 1 file changed, 34 insertions(+), 43 deletions(-) diff --git a/README.md b/README.md index a884632..38b2d10 100644 --- a/README.md +++ b/README.md @@ -47,57 +47,48 @@ nextflow run pipeline.nf --bigwig [BigWig-file] --bed [BED-file] --genome_fasta For a detailed overview for all parameters follow this [link](https://github.molgen.mpg.de/loosolab/masterJLU2018/wiki/Configuration). ``` Required arguments: - --bigwig Path to BigWig-file with scores on the peaks of interest - --bed Path to BED-file with peaks of interest corresponding to the BigWig file - --genome_fasta Path to genome in FASTA-format - --motif_db Path to motif-database in MEME-format - - + --bigwig Path to BigWig-file + --bed Path to BED-file + --genome_fasta Path to genome in FASTA-format + --motif_db Path to motif-database in MEME-format + --config Path to UROPA configuration file + --create_known_tfbs_path Path to directory where output from tfbsscan (known motifs) are stored. + Path can be set as tfbs_path in next run. (Default: './') Optional arguments: - - --tfbs_path Path to directory with output BED-files from tfbsscan. If given tfbsscan will not be run. - + --tfbs_path Path to directory with output from tfbsscan. If given tfbsscan will not be run. 
Footprint extraction: - --window_length INT (Default: 200) a length of a window - --step INT (Default: 100) an interval to slide the window - --percentage INT(Default: 0) a percentage to be added to background while searching for footprints - + --window_length INT (Default: 200) + --step INT (Default: 100) + --percentage INT (Default: 0) Filter unknown motifs: - --min_size_fp INT (Default: 10) - --max_size_fp INT (Default: 100) - - Cluster: + --min_size_fp INT (Default: 10) + --max_size_fp INT (Default: 100) + Clustering: Sequence preparation/ reduction: - --kmer INT Kmer length (Default: 10) - --aprox_motif_len INT Motif length (Default: 10) - --motif_occurence FLOAT Percentage of motifs over all sequences. Use 1 (Default) to assume every sequence contains a motif. - --min_seq_length INT Remove all sequences below this value. (Default: 10) - + --kmer INT Kmer length (Default: 10) + --aprox_motif_len INT Motif length (Default: 10) + --motif_occurence FLOAT Percentage of motifs over all sequences. Use 1 (Default) to assume every sequence contains a motif. + --min_seq_length Interations Remove all sequences below this value. (Default: 10) Clustering: - --global INT Global (=1) or local (=0) alignment. (Default: 0) - --identity FLOAT Identity threshold. (Default: 0.8) - --sequence_coverage INT Minimum aligned nucleotides on both sequences. (Default: 8) - --memory INT Memory limit in MB. 0 for unlimited. (Default: 800) - --throw_away_seq INT Remove all sequences equal or below this length before clustering. (Default: 9) - --strand INT Align +/+ & +/- (= 1). Or align only +/+ (= 0). (Default: 0) - + --global INT Global (=1) or local (=0) alignment. (Default: 0) + --identity FLOAT Identity threshold. (Default: 0.8) + --sequence_coverage INT Minimum aligned nucleotides on both sequences. (Default: 8) + --memory INT Memory limit in MB. 0 for unlimited. (Default: 800) + --throw_away_seq INT Remove all sequences equal or below this length before clustering. (Default: 9) + --strand INT Align +/+ & +/- (= 1). Or align only +/+ (= 0). (Default: 0) Motif estimation: - --min_seq INT Minimum number of sequences required in the FASTA-files for GLAM2 (Default: 100) - --motif_min_key INT Maximum number of key positions (aligned columns) (Default: 8) - --motif_max_key INT Maximum number of key positions (aligned columns) (Default: 20) + --motif_min_len INT Minimum length of Motif (Default: 8) + --motif_max_len INT Maximum length of Motif (Default: 20) --iteration INT Number of iterations done by glam2. More Iterations: better results, higher runtime. (Default: 10000) --tomtom_treshold float Threshold for similarity score. (Default: 0.01) - - Motif clustering: - --cluster_motif BOOLEAN IF its 1 motifs will be clustered (Default: 0) - --edge_weight INT Minimum weight of edges in motif-cluster-graph (Default: 5) - --motif_similarity_thresh FLOAT threshold for motif similarity score (Default: 0.00001) - - Creating GTF: - --tissue STRING Filter for one or more tissue/category activity, categories as in JSON config (Default: None) - --organism STRING Source organism: [ hg19 | hg38 or mm9 | mm10 ] (Default: hg38) - -All arguments can be set in the configuration files. + --best_motif INT Get the best X motifs per cluster. 
(Default: 3) + Moitf clustering: + --edge_weight INT Minimum weight of edges in motif-cluster-graph (Default: 5) + --motif_similarity_thresh FLOAT Threshold for motif similarity score (Default: 0.00001) + Creating GTF: + --organism [hg38 | hg19 | mm9 | mm10] + --tissues +All arguments can be set in the configuration files ``` From aed0e60532a1e35b0d7f61f11b40e6c70a62d4b6 Mon Sep 17 00:00:00 2001 From: renewiegandt Date: Sat, 15 Dec 2018 00:43:37 +0100 Subject: [PATCH 138/158] Update README.md --- README.md | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 38b2d10..0a64a9f 100644 --- a/README.md +++ b/README.md @@ -53,22 +53,27 @@ Required arguments: --motif_db Path to motif-database in MEME-format --config Path to UROPA configuration file --create_known_tfbs_path Path to directory where output from tfbsscan (known motifs) are stored. - Path can be set as tfbs_path in next run. (Default: './') + Path can be set as tfbs_path in next run. (Default: './') Optional arguments: + --tfbs_path Path to directory with output from tfbsscan. If given tfbsscan will not be run. + Footprint extraction: --window_length INT (Default: 200) --step INT (Default: 100) --percentage INT (Default: 0) + Filter unknown motifs: --min_size_fp INT (Default: 10) --max_size_fp INT (Default: 100) + Clustering: Sequence preparation/ reduction: --kmer INT Kmer length (Default: 10) --aprox_motif_len INT Motif length (Default: 10) --motif_occurence FLOAT Percentage of motifs over all sequences. Use 1 (Default) to assume every sequence contains a motif. --min_seq_length Interations Remove all sequences below this value. (Default: 10) + Clustering: --global INT Global (=1) or local (=0) alignment. (Default: 0) --identity FLOAT Identity threshold. (Default: 0.8) @@ -76,15 +81,20 @@ Optional arguments: --memory INT Memory limit in MB. 0 for unlimited. (Default: 800) --throw_away_seq INT Remove all sequences equal or below this length before clustering. (Default: 9) --strand INT Align +/+ & +/- (= 1). Or align only +/+ (= 0). (Default: 0) + Motif estimation: + --min_seq INT Sets the minimum number of sequences required for the FASTA-files given to GLAM2. (Default: 100) --motif_min_len INT Minimum length of Motif (Default: 8) --motif_max_len INT Maximum length of Motif (Default: 20) - --iteration INT Number of iterations done by glam2. More Iterations: better results, higher runtime. (Default: 10000) + --iteration INT Number of iterations done by glam2. More Iterations: better results, higher runtime. (Default: 10000) --tomtom_treshold float Threshold for similarity score. (Default: 0.01) --best_motif INT Get the best X motifs per cluster. (Default: 3) + Moitf clustering: + --cluster_motif Boolean If 1 pipeline clusters motifs. If its 0 it does not. 
(Defaul: 0) --edge_weight INT Minimum weight of edges in motif-cluster-graph (Default: 5) --motif_similarity_thresh FLOAT Threshold for motif similarity score (Default: 0.00001) + Creating GTF: --organism [hg38 | hg19 | mm9 | mm10] --tissues From 3ec60362665d6219a36e0bc7c81b05c31f0853b3 Mon Sep 17 00:00:00 2001 From: renewiegandt Date: Sat, 15 Dec 2018 00:45:49 +0100 Subject: [PATCH 139/158] Update README.md --- README.md | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/README.md b/README.md index 0a64a9f..fa7ed4b 100644 --- a/README.md +++ b/README.md @@ -47,11 +47,11 @@ nextflow run pipeline.nf --bigwig [BigWig-file] --bed [BED-file] --genome_fasta For a detailed overview for all parameters follow this [link](https://github.molgen.mpg.de/loosolab/masterJLU2018/wiki/Configuration). ``` Required arguments: - --bigwig Path to BigWig-file - --bed Path to BED-file - --genome_fasta Path to genome in FASTA-format - --motif_db Path to motif-database in MEME-format - --config Path to UROPA configuration file + --bigwig Path to BigWig-file + --bed Path to BED-file + --genome_fasta Path to genome in FASTA-format + --motif_db Path to motif-database in MEME-format + --config Path to UROPA configuration file --create_known_tfbs_path Path to directory where output from tfbsscan (known motifs) are stored. Path can be set as tfbs_path in next run. (Default: './') Optional arguments: @@ -60,7 +60,7 @@ Optional arguments: Footprint extraction: --window_length INT (Default: 200) - --step INT (Default: 100) + --step INT (Default: 100) --percentage INT (Default: 0) Filter unknown motifs: @@ -69,23 +69,23 @@ Optional arguments: Clustering: Sequence preparation/ reduction: - --kmer INT Kmer length (Default: 10) + --kmer INT Kmer length (Default: 10) --aprox_motif_len INT Motif length (Default: 10) --motif_occurence FLOAT Percentage of motifs over all sequences. Use 1 (Default) to assume every sequence contains a motif. --min_seq_length Interations Remove all sequences below this value. (Default: 10) Clustering: - --global INT Global (=1) or local (=0) alignment. (Default: 0) + --global INT Global (=1) or local (=0) alignment. (Default: 0) --identity FLOAT Identity threshold. (Default: 0.8) --sequence_coverage INT Minimum aligned nucleotides on both sequences. (Default: 8) - --memory INT Memory limit in MB. 0 for unlimited. (Default: 800) + --memory INT Memory limit in MB. 0 for unlimited. (Default: 800) --throw_away_seq INT Remove all sequences equal or below this length before clustering. (Default: 9) - --strand INT Align +/+ & +/- (= 1). Or align only +/+ (= 0). (Default: 0) + --strand INT Align +/+ & +/- (= 1). Or align only +/+ (= 0). (Default: 0) Motif estimation: --min_seq INT Sets the minimum number of sequences required for the FASTA-files given to GLAM2. (Default: 100) - --motif_min_len INT Minimum length of Motif (Default: 8) - --motif_max_len INT Maximum length of Motif (Default: 20) + --motif_min_key INT Minimum number of key positions (aligned columns) in the alignment done by GLAM2. (Default: 8) + --motif_max_key INT Maximum number of key positions (aligned columns) in the alignment done by GLAM2.f (Default: 20) --iteration INT Number of iterations done by glam2. More Iterations: better results, higher runtime. (Default: 10000) --tomtom_treshold float Threshold for similarity score. (Default: 0.01) --best_motif INT Get the best X motifs per cluster. 
(Default: 3) From 6f263b5bfba9569e6fc518d2656eef568fef4d72 Mon Sep 17 00:00:00 2001 From: renewiegandt Date: Sat, 15 Dec 2018 00:46:16 +0100 Subject: [PATCH 140/158] Update pipeline.nf --- pipeline.nf | 52 ++++++++++++++++++++++++++-------------------------- 1 file changed, 26 insertions(+), 26 deletions(-) diff --git a/pipeline.nf b/pipeline.nf index 5cd3c8c..c208d4a 100644 --- a/pipeline.nf +++ b/pipeline.nf @@ -62,59 +62,59 @@ log.info """ Usage: nextflow run pipeline.nf --bigwig [BigWig-file] --bed [BED-file] --genome_fasta [FASTA-file] --motif_db [MEME-file] Required arguments: - --bigwig Path to BigWig-file - --bed Path to BED-file - --genome_fasta Path to genome in FASTA-format - --motif_db Path to motif-database in MEME-format - --config Path to UROPA configuration file + --bigwig Path to BigWig-file + --bed Path to BED-file + --genome_fasta Path to genome in FASTA-format + --motif_db Path to motif-database in MEME-format + --config Path to UROPA configuration file --create_known_tfbs_path Path to directory where output from tfbsscan (known motifs) are stored. - Path can be set as tfbs_path in next run. (Default: './') - - + Path can be set as tfbs_path in next run. (Default: './') Optional arguments: - --tfbs_path Path to directory with output from tfbsscan. If given tfbsscan will not be run. - + --tfbs_path Path to directory with output from tfbsscan. If given tfbsscan will not be run. + Footprint extraction: --window_length INT (Default: 200) - --step INT (Default: 100) + --step INT (Default: 100) --percentage INT (Default: 0) - + Filter unknown motifs: --min_size_fp INT (Default: 10) --max_size_fp INT (Default: 100) - + Clustering: Sequence preparation/ reduction: - --kmer INT Kmer length (Default: 10) + --kmer INT Kmer length (Default: 10) --aprox_motif_len INT Motif length (Default: 10) --motif_occurence FLOAT Percentage of motifs over all sequences. Use 1 (Default) to assume every sequence contains a motif. --min_seq_length Interations Remove all sequences below this value. (Default: 10) - + Clustering: - --global INT Global (=1) or local (=0) alignment. (Default: 0) + --global INT Global (=1) or local (=0) alignment. (Default: 0) --identity FLOAT Identity threshold. (Default: 0.8) --sequence_coverage INT Minimum aligned nucleotides on both sequences. (Default: 8) - --memory INT Memory limit in MB. 0 for unlimited. (Default: 800) + --memory INT Memory limit in MB. 0 for unlimited. (Default: 800) --throw_away_seq INT Remove all sequences equal or below this length before clustering. (Default: 9) - --strand INT Align +/+ & +/- (= 1). Or align only +/+ (= 0). (Default: 0) - + --strand INT Align +/+ & +/- (= 1). Or align only +/+ (= 0). (Default: 0) + Motif estimation: - --motif_min_len INT Minimum length of Motif (Default: 8) - --motif_max_len INT Maximum length of Motif (Default: 20) - --iteration INT Number of iterations done by glam2. More Iterations: better results, higher runtime. (Default: 10000) + --min_seq INT Sets the minimum number of sequences required for the FASTA-files given to GLAM2. (Default: 100) + --motif_min_key INT Minimum number of key positions (aligned columns) in the alignment done by GLAM2. (Default: 8) + --motif_max_key INT Maximum number of key positions (aligned columns) in the alignment done by GLAM2.f (Default: 20) + --iteration INT Number of iterations done by glam2. More Iterations: better results, higher runtime. (Default: 10000) --tomtom_treshold float Threshold for similarity score. 
(Default: 0.01) --best_motif INT Get the best X motifs per cluster. (Default: 3) - + Moitf clustering: + --cluster_motif Boolean If 1 pipeline clusters motifs. If its 0 it does not. (Defaul: 0) --edge_weight INT Minimum weight of edges in motif-cluster-graph (Default: 5) --motif_similarity_thresh FLOAT Threshold for motif similarity score (Default: 0.00001) - + Creating GTF: --organism [hg38 | hg19 | mm9 | mm10] --tissues - -All arguments can be set in the configuration files. +All arguments can be set in the configuration files + ``` """ } else { Channel.fromPath(params.bigwig).map {it -> [it.simpleName, it]}.set {bigwig_input} From 13d4447189c87a532d8f56b401c051bd16573016 Mon Sep 17 00:00:00 2001 From: renewiegandt Date: Sat, 15 Dec 2018 00:47:16 +0100 Subject: [PATCH 141/158] Update pipeline.nf --- pipeline.nf | 41 ++++++++++++++++++++--------------------- 1 file changed, 20 insertions(+), 21 deletions(-) diff --git a/pipeline.nf b/pipeline.nf index c208d4a..c7b64e1 100644 --- a/pipeline.nf +++ b/pipeline.nf @@ -1,14 +1,13 @@ #!/usr/bin/env nextflow //setting default values -params.bigwig="" -params.bed="" -params.genome_fasta="" -params.motif_db="" -params.config="" -params.tfbs_path="" - -params.create_known_tfbs_path = "./" + params.bigwig="" + params.bed="" + params.genome_fasta="" + params.motif_db="" + params.config="" + params.tfbs_path="" + params.create_known_tfbs_path = "./" //peak_calling params.window_length = 200 @@ -20,19 +19,19 @@ params.create_known_tfbs_path = "./" params.max_size_fp=100 //clustering - //reduce_bed - params.kmer=10 - params.aprox_motif_len=10 - params.motif_occurence=1 - params.min_seq_length=10 - - //cdhit_wrapper - params.global=0 - params.identity=0.8 - params.sequence_coverage=8 - params.memory=800 - params.throw_away_seq=9 - params.strand=0 + //reduce_bed + params.kmer=10 + params.aprox_motif_len=10 + params.motif_occurence=1 + params.min_seq_length=10 + + //cdhit_wrapper + params.global=0 + params.identity=0.8 + params.sequence_coverage=8 + params.memory=800 + params.throw_away_seq=9 + params.strand=0 //motif_estimation //bed_to_clustered_fasta From 565eeaa55639c0b8582caf441eb892e80cf9254c Mon Sep 17 00:00:00 2001 From: renewiegandt Date: Sat, 15 Dec 2018 00:53:00 +0100 Subject: [PATCH 142/158] Update README.md --- README.md | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index fa7ed4b..3bf19c3 100644 --- a/README.md +++ b/README.md @@ -96,8 +96,9 @@ Optional arguments: --motif_similarity_thresh FLOAT Threshold for motif similarity score (Default: 0.00001) Creating GTF: - --organism [hg38 | hg19 | mm9 | mm10] - --tissues + --organism [hg38 | hg19 | mm9 | mm10] Input organism + --tissues List/String List of one or more keywords for tissue-/category-activity, categories must be specified as in JSON + config All arguments can be set in the configuration files ``` From 43646c6fd84d5bc3f8920c1f399a53df94416d02 Mon Sep 17 00:00:00 2001 From: renewiegandt Date: Sat, 15 Dec 2018 00:53:21 +0100 Subject: [PATCH 143/158] Update pipeline.nf --- pipeline.nf | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/pipeline.nf b/pipeline.nf index c7b64e1..0855e37 100644 --- a/pipeline.nf +++ b/pipeline.nf @@ -58,8 +58,6 @@ if (params.bigwig == "" || params.bed == "" || params.genome_fasta == "" || params.motif_db == "" || params.config == ""){ log.info """ -Usage: nextflow run pipeline.nf --bigwig [BigWig-file] --bed [BED-file] --genome_fasta [FASTA-file] --motif_db [MEME-file] - Required arguments: 
--bigwig Path to BigWig-file --bed Path to BED-file @@ -70,7 +68,7 @@ Required arguments: Path can be set as tfbs_path in next run. (Default: './') Optional arguments: - --tfbs_path Path to directory with output from tfbsscan. If given tfbsscan will not be run. + --tfbs_path Path to directory with output from tfbsscan. If given tfbsscan will not be run. Footprint extraction: --window_length INT (Default: 200) @@ -110,8 +108,9 @@ Optional arguments: --motif_similarity_thresh FLOAT Threshold for motif similarity score (Default: 0.00001) Creating GTF: - --organism [hg38 | hg19 | mm9 | mm10] - --tissues + --organism [hg38 | hg19 | mm9 | mm10] Input organism + --tissues List/String List of one or more keywords for tissue-/category-activity, categories must be specified as in JSON + config All arguments can be set in the configuration files ``` """ From a390459b6bcf1fb13fe66cdca836502503f73ca4 Mon Sep 17 00:00:00 2001 From: renewiegandt Date: Sat, 15 Dec 2018 00:54:40 +0100 Subject: [PATCH 144/158] Update pipeline.nf --- pipeline.nf | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pipeline.nf b/pipeline.nf index 0855e37..d4a1a11 100644 --- a/pipeline.nf +++ b/pipeline.nf @@ -58,6 +58,8 @@ if (params.bigwig == "" || params.bed == "" || params.genome_fasta == "" || params.motif_db == "" || params.config == ""){ log.info """ +Usage: nextflow run pipeline.nf --bigwig [BigWig-file] --bed [BED-file] --genome_fasta [FASTA-file] --motif_db [MEME-file] + Required arguments: --bigwig Path to BigWig-file --bed Path to BED-file From 4b2150c223de07c049a95e67c5078dcb976ca2a2 Mon Sep 17 00:00:00 2001 From: anastasiia Date: Sat, 15 Dec 2018 19:19:08 +0100 Subject: [PATCH 145/158] rename call_peaks to footrptins_extraction --- bin/{call_peaks.py => footprints_extraction.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename bin/{call_peaks.py => footprints_extraction.py} (100%) diff --git a/bin/call_peaks.py b/bin/footprints_extraction.py similarity index 100% rename from bin/call_peaks.py rename to bin/footprints_extraction.py From 62add26c4748185b88b349bce428732ccec3375f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ren=C3=A9=20Wiegandt?= Date: Sat, 15 Dec 2018 13:44:40 -0500 Subject: [PATCH 146/158] minor fixes --- bin/get_best_motif.py | 2 +- pipeline.nf | 26 +++++++++++++------------- 2 files changed, 14 insertions(+), 14 deletions(-) diff --git a/bin/get_best_motif.py b/bin/get_best_motif.py index 0e7334e..cc24949 100644 --- a/bin/get_best_motif.py +++ b/bin/get_best_motif.py @@ -8,7 +8,7 @@ def parse_arguments(): args = parser.parse_args() return args - +# write lines of file till certain line (MOTIF + [num]) def main(): args = parse_arguments() out = open(args.output, "w+") diff --git a/pipeline.nf b/pipeline.nf index d8d5ad5..e1977d2 100644 --- a/pipeline.nf +++ b/pipeline.nf @@ -59,7 +59,7 @@ if (params.bigwig == "" || params.bed == "" || params.genome_fasta == "" || params.motif_db == "" || params.config == ""){ log.info """ Usage: nextflow run pipeline.nf --bigwig [BigWig-file] --bed [BED-file] --genome_fasta [FASTA-file] --motif_db [MEME-file] - + Required arguments: --bigwig Path to BigWig-file --bed Path to BED-file @@ -71,23 +71,23 @@ Required arguments: Optional arguments: --tfbs_path Path to directory with output from tfbsscan. If given tfbsscan will not be run. 
- + Footprint extraction: --window_length INT (Default: 200) --step INT (Default: 100) --percentage INT (Default: 0) - + Filter unknown motifs: --min_size_fp INT (Default: 10) --max_size_fp INT (Default: 100) - + Clustering: Sequence preparation/ reduction: --kmer INT Kmer length (Default: 10) --aprox_motif_len INT Motif length (Default: 10) --motif_occurence FLOAT Percentage of motifs over all sequences. Use 1 (Default) to assume every sequence contains a motif. --min_seq_length Interations Remove all sequences below this value. (Default: 10) - + Clustering: --global INT Global (=1) or local (=0) alignment. (Default: 0) --identity FLOAT Identity threshold. (Default: 0.8) @@ -95,7 +95,7 @@ Optional arguments: --memory INT Memory limit in MB. 0 for unlimited. (Default: 800) --throw_away_seq INT Remove all sequences equal or below this length before clustering. (Default: 9) --strand INT Align +/+ & +/- (= 1). Or align only +/+ (= 0). (Default: 0) - + Motif estimation: --min_seq INT Sets the minimum number of sequences required for the FASTA-files given to GLAM2. (Default: 100) --motif_min_key INT Minimum number of key positions (aligned columns) in the alignment done by GLAM2. (Default: 8) @@ -103,12 +103,12 @@ Optional arguments: --iteration INT Number of iterations done by glam2. More Iterations: better results, higher runtime. (Default: 10000) --tomtom_treshold float Threshold for similarity score. (Default: 0.01) --best_motif INT Get the best X motifs per cluster. (Default: 3) - + Moitf clustering: --cluster_motif Boolean If 1 pipeline clusters motifs. If its 0 it does not. (Defaul: 0) --edge_weight INT Minimum weight of edges in motif-cluster-graph (Default: 5) --motif_similarity_thresh FLOAT Threshold for motif similarity score (Default: 0.00001) - + Creating GTF: --organism [hg38 | hg19 | mm9 | mm10] Input organism --tissues List/String List of one or more keywords for tissue-/category-activity, categories must be specified as in JSON @@ -195,7 +195,7 @@ process footprint_extraction { script: """ - python ${path_bin}/call_peaks.py --bigwig ${bigWig} --bed ${bed} --output_file ${name}_called_peaks.bed --window_length ${params.window_length} --step ${params.step} --percentage ${params.percentage} + python ${path_bin}/footprints_extraction.py --bigwig ${bigWig} --bed ${bed} --output_file ${name}_called_peaks.bed --window_length ${params.window_length} --step ${params.step} --percentage ${params.percentage} """ } @@ -265,6 +265,7 @@ process overlap_with_known_TFBS { process reduce_bed { conda "${path_env}" echo true + publishDir "${params.out}/cluster/reduced_bed/", mode: 'copy' input: set name, file (bed) from bed_for_reducing @@ -404,7 +405,6 @@ process find_similar_motifs { files_for_merge_fasta = motif_similarity.combine(fasta_for_motif_cluster) - /* Merging FASTA-files of similar clusters */ @@ -443,8 +443,8 @@ process clustered_glam2 { output: set name, file ('*.meme') into clustered_meme_for_tomtom - set name, file ('*.meme') into clustered_meme_for_filter - file('*') + set name, file ('*.meme') into clustered_meme_for_filter + file('*') when: params.cluster_motif == 1 @@ -472,7 +472,7 @@ Tomtom searches motifs in databases. 
process tomtom { tag{name} - publishDir "${params.out}/final_esimated_motifs/tomtom/", mode: 'copy' + publishDir "${params.out}/esimated_motifs/tomtom/", mode: 'copy' input: set name, file (meme) from for_tomtom From 6954ecd11635c1ffb84fae041944cd4da5eec91e Mon Sep 17 00:00:00 2001 From: renewiegandt Date: Sat, 15 Dec 2018 19:47:19 +0100 Subject: [PATCH 147/158] Update README.md --- README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 3bf19c3..87be038 100644 --- a/README.md +++ b/README.md @@ -59,9 +59,9 @@ Optional arguments: --tfbs_path Path to directory with output from tfbsscan. If given tfbsscan will not be run. Footprint extraction: - --window_length INT (Default: 200) - --step INT (Default: 100) - --percentage INT (Default: 0) + --window_length INT This parameter sets the length of a sliding window. (Default: 200) + --step INT This parameter sets the number of positions to slide the window forward. (Default: 100) + --percentage INT Threshold in percent (Default: 0) Filter unknown motifs: --min_size_fp INT (Default: 10) From 6c42389c831626713dcfc1eb0c351c052bd44ca3 Mon Sep 17 00:00:00 2001 From: renewiegandt Date: Sat, 15 Dec 2018 19:49:05 +0100 Subject: [PATCH 148/158] Update README.md --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 87be038..a3c4947 100644 --- a/README.md +++ b/README.md @@ -64,8 +64,8 @@ Optional arguments: --percentage INT Threshold in percent (Default: 0) Filter unknown motifs: - --min_size_fp INT (Default: 10) - --max_size_fp INT (Default: 100) + --min_size_fp INT Minimum sequence length threshold. Smaller sequences are discarded. (Default: 10) + --max_size_fp INT Maximum sequence length threshold. Discards all sequences longer than this value. (Default: 100) Clustering: Sequence preparation/ reduction: From a159e4137773e7e2e0ebbafb0a4c93101136801c Mon Sep 17 00:00:00 2001 From: renewiegandt Date: Sat, 15 Dec 2018 19:49:39 +0100 Subject: [PATCH 149/158] Update pipeline.nf --- pipeline.nf | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/pipeline.nf b/pipeline.nf index e1977d2..3148d11 100644 --- a/pipeline.nf +++ b/pipeline.nf @@ -71,23 +71,23 @@ Required arguments: Optional arguments: --tfbs_path Path to directory with output from tfbsscan. If given tfbsscan will not be run. - + Footprint extraction: - --window_length INT (Default: 200) - --step INT (Default: 100) - --percentage INT (Default: 0) - + --window_length INT This parameter sets the length of a sliding window. (Default: 200) + --step INT This parameter sets the number of positions to slide the window forward. (Default: 100) + --percentage INT Threshold in percent (Default: 0) + Filter unknown motifs: - --min_size_fp INT (Default: 10) - --max_size_fp INT (Default: 100) - + --min_size_fp INT Minimum sequence length threshold. Smaller sequences are discarded. (Default: 10) + --max_size_fp INT Maximum sequence length threshold. Discards all sequences longer than this value. (Default: 100) + Clustering: Sequence preparation/ reduction: --kmer INT Kmer length (Default: 10) --aprox_motif_len INT Motif length (Default: 10) --motif_occurence FLOAT Percentage of motifs over all sequences. Use 1 (Default) to assume every sequence contains a motif. --min_seq_length Interations Remove all sequences below this value. (Default: 10) - + Clustering: --global INT Global (=1) or local (=0) alignment. (Default: 0) --identity FLOAT Identity threshold. 
(Default: 0.8) @@ -95,7 +95,7 @@ Optional arguments: --memory INT Memory limit in MB. 0 for unlimited. (Default: 800) --throw_away_seq INT Remove all sequences equal or below this length before clustering. (Default: 9) --strand INT Align +/+ & +/- (= 1). Or align only +/+ (= 0). (Default: 0) - + Motif estimation: --min_seq INT Sets the minimum number of sequences required for the FASTA-files given to GLAM2. (Default: 100) --motif_min_key INT Minimum number of key positions (aligned columns) in the alignment done by GLAM2. (Default: 8) @@ -103,12 +103,12 @@ Optional arguments: --iteration INT Number of iterations done by glam2. More Iterations: better results, higher runtime. (Default: 10000) --tomtom_treshold float Threshold for similarity score. (Default: 0.01) --best_motif INT Get the best X motifs per cluster. (Default: 3) - + Moitf clustering: --cluster_motif Boolean If 1 pipeline clusters motifs. If its 0 it does not. (Defaul: 0) --edge_weight INT Minimum weight of edges in motif-cluster-graph (Default: 5) --motif_similarity_thresh FLOAT Threshold for motif similarity score (Default: 0.00001) - + Creating GTF: --organism [hg38 | hg19 | mm9 | mm10] Input organism --tissues List/String List of one or more keywords for tissue-/category-activity, categories must be specified as in JSON From 550ad78e628049645499fe7ae0c15ad7eb60f07e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ren=C3=A9=20Wiegandt?= Date: Sat, 15 Dec 2018 16:59:30 -0500 Subject: [PATCH 150/158] added parameter help --- pipeline.nf | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/pipeline.nf b/pipeline.nf index 3148d11..ad4b199 100644 --- a/pipeline.nf +++ b/pipeline.nf @@ -8,6 +8,7 @@ params.config="" params.tfbs_path="" params.create_known_tfbs_path = "./" + params.help = 0 //peak_calling params.window_length = 200 @@ -56,7 +57,7 @@ params.organism="hg38" params.tissue="" -if (params.bigwig == "" || params.bed == "" || params.genome_fasta == "" || params.motif_db == "" || params.config == ""){ +if (params.bigwig == "" || params.bed == "" || params.genome_fasta == "" || params.motif_db == "" || params.config == "" || "${params.help}" == "1"){ log.info """ Usage: nextflow run pipeline.nf --bigwig [BigWig-file] --bed [BED-file] --genome_fasta [FASTA-file] --motif_db [MEME-file] @@ -71,23 +72,23 @@ Required arguments: Optional arguments: --tfbs_path Path to directory with output from tfbsscan. If given tfbsscan will not be run. - + Footprint extraction: --window_length INT This parameter sets the length of a sliding window. (Default: 200) --step INT This parameter sets the number of positions to slide the window forward. (Default: 100) --percentage INT Threshold in percent (Default: 0) - + Filter unknown motifs: --min_size_fp INT Minimum sequence length threshold. Smaller sequences are discarded. (Default: 10) --max_size_fp INT Maximum sequence length threshold. Discards all sequences longer than this value. (Default: 100) - + Clustering: Sequence preparation/ reduction: --kmer INT Kmer length (Default: 10) --aprox_motif_len INT Motif length (Default: 10) --motif_occurence FLOAT Percentage of motifs over all sequences. Use 1 (Default) to assume every sequence contains a motif. --min_seq_length Interations Remove all sequences below this value. (Default: 10) - + Clustering: --global INT Global (=1) or local (=0) alignment. (Default: 0) --identity FLOAT Identity threshold. (Default: 0.8) @@ -95,7 +96,7 @@ Optional arguments: --memory INT Memory limit in MB. 0 for unlimited. 
(Default: 800)
 --throw_away_seq INT Remove all sequences equal or below this length before clustering. (Default: 9)
 --strand INT Align +/+ & +/- (= 1). Or align only +/+ (= 0). (Default: 0)
-
+
 Motif estimation:
 --min_seq INT Sets the minimum number of sequences required for the FASTA-files given to GLAM2. (Default: 100)
 --motif_min_key INT Minimum number of key positions (aligned columns) in the alignment done by GLAM2. (Default: 8)
 --motif_max_key INT Maximum number of key positions (aligned columns) in the alignment done by GLAM2. (Default: 20)
 --iteration INT Number of iterations done by glam2. More iterations: better results, higher runtime. (Default: 10000)
 --tomtom_treshold FLOAT Threshold for similarity score. (Default: 0.01)
 --best_motif INT Get the best X motifs per cluster. (Default: 3)
-
+
 Motif clustering:
 --cluster_motif Boolean If 1, the pipeline clusters motifs; if 0, it does not. (Default: 0)
 --edge_weight INT Minimum weight of edges in motif-cluster-graph (Default: 5)
 --motif_similarity_thresh FLOAT Threshold for motif similarity score (Default: 0.00001)
-
+
 Creating GTF:
 --organism [hg38 | hg19 | mm9 | mm10] Input organism
 --tissues List/String List of one or more keywords for tissue-/category-activity, categories must be specified as in JSON
 config
 All arguments can be set in the configuration files
 ```
 """
+System.exit(2)
 } else {
 Channel.fromPath(params.bigwig).map {it -> [it.simpleName, it]}.set {bigwig_input}
 Channel.fromPath(params.bed).set {bed_input}
From e0ec81bee9d9358d30bda5107eabe65ce75653a5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ren=C3=A9=20Wiegandt?=
Date: Sat, 15 Dec 2018 17:00:22 -0500
Subject: [PATCH 151/158] added separate motif_estimation script

---
 bin/motif_estimation.nf | 283 ++++++++++++++++++++++++++++++++++++++++
 1 file changed, 283 insertions(+)
 create mode 100644 bin/motif_estimation.nf

diff --git a/bin/motif_estimation.nf b/bin/motif_estimation.nf
new file mode 100644
index 0000000..f5b96e3
--- /dev/null
+++ b/bin/motif_estimation.nf
@@ -0,0 +1,283 @@
+
params.bed = ""
params.out = ""
params.motif_db = ""
params.path_env = ""
params.help = 0
params.path_bin = "."

//bed_to_clustered_fasta
params.min_seq = 100 // Minimum number of sequences in the fasta-files for glam2

//glam2
params.motif_min_key = 8 // Minimum number of key positions (aligned columns)
params.motif_max_key = 20 // Maximum number of key positions (aligned columns)
params.iteration = 10000 // Number of iterations done by glam2. A high iteration number gives a more accurate result, but with a higher runtime.

//tomtom
params.tomtom_treshold = 0.01 // threshold for similarity score.

//cluster motifs
params.cluster_motif = 0 // Boolean; if 1, motifs are clustered, else they are not
params.edge_weight = 50 // Minimum weight of edges in motif-cluster-graph
params.motif_similarity_thresh = 0.00001 // threshold for motif similarity score

if (params.bed == "" || params.out == "" || params.motif_db == "" || params.path_env == "" || "${params.help}" == "1") {
log.info """
Usage: nextflow run pipeline.nf --bed [PATH] --out [PATH] --motif_db [PATH] --path_env YAML-file
--bed Path Input BED-file (Column with cluster: 11; column with sequence: 10)!!
--out Path Output Path
--motif_db Path Path to MEME-Database
--path_env Path Path to YAML file for the conda environment.
--path_bin Path Path to directory with subscripts
--help 0|1 Print help message if help == 1 (Default: 0)

Motif estimation:
--min_seq INT Sets the minimum number of sequences required for the FASTA-files given to GLAM2. (Default: 100)
--motif_min_key INT Minimum number of key positions (aligned columns) in the alignment done by GLAM2. (Default: 8)
--motif_max_key INT Maximum number of key positions (aligned columns) in the alignment done by GLAM2. (Default: 20)
--iteration INT Number of iterations done by glam2. More iterations: better results, higher runtime. (Default: 10000)
--tomtom_treshold FLOAT Threshold for similarity score. (Default: 0.01)
--best_motif INT Get the best X motifs per cluster. (Default: 3)

Motif clustering:
--cluster_motif Boolean If 1, the pipeline clusters motifs; if 0, it does not. (Default: 0)
--edge_weight INT Minimum weight of edges in motif-cluster-graph (Default: 5)
--motif_similarity_thresh FLOAT Threshold for motif similarity score (Default: 0.00001)
"""
System.exit(2)
} else {
    Channel.fromPath(params.bed).map {it -> [it.simpleName, it]}.set {bed_for_motif_esitmation}
}

/*
Converting BED-File to one FASTA-File per cluster
*/
process bed_to_clustered_fasta {
    conda "${params.path_env}"
    publishDir "${params.out}/esimated_motifs/clustered_motifs/clustered_fasta/", mode: 'copy'
    tag{name}

    input:
    set name, file (bed) from bed_for_motif_esitmation

    output:
    file ('*.FASTA') into fasta_for_glam2
    file ('*.FASTA') into fasta_for_motif_cluster

    script:
    """
    Rscript ${params.path_bin}/bed_to_fasta.R ${bed} ${name} ${params.min_seq}
    """
}


//flatten the list and add each file's name to the channel value
fasta_for_glam2 = fasta_for_glam2.flatten().map {it -> [it.simpleName, it]}
//combine all FASTA files into one list
fasta_for_motif_cluster = fasta_for_motif_cluster.toList()
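For orientation, the per-cluster split that bed_to_fasta.R performs can be expressed as a minimal stand-alone Python sketch. Everything below is illustrative only: the function name `bed_to_clustered_fasta`, the example file names, and the assumption of a tab-separated table with at least eleven columns are made up for this sketch; the pipeline itself only ever calls the R script through the process above.

```python
# Illustrative sketch (not part of the pipeline): split a BED-like table on
# its cluster column and write one FASTA per sufficiently large cluster.
# Column positions mirror the convention named above (1-based):
# 4 = id, 10 = sequence, 11 = cluster.
import csv
from collections import defaultdict

def bed_to_clustered_fasta(bed_path, prefix, min_seq=100):
    clusters = defaultdict(list)  # cluster id -> list of (name, sequence)
    with open(bed_path) as bed:
        for row in csv.reader(bed, delimiter="\t"):
            clusters[row[10]].append((row[3], row[9]))
    for cluster_id, members in sorted(clusters.items()):
        if len(members) < min_seq:  # too few sequences for a stable GLAM2 run
            continue
        with open(f"{prefix}_cluster_{cluster_id}.FASTA", "w") as fasta:
            for name, seq in members:
                fasta.write(f">{name}\n{seq}\n")

# e.g. bed_to_clustered_fasta("example.bed", "example", min_seq=100)
```

Dropping clusters below min_seq matches the config comment above: GLAM2 needs a minimum number of sequences per FASTA-file to estimate a motif.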
/*
Running GLAM2 on FASTA-files.
Generating Motifs through alignment and scoring best local matches.
*/
process glam2 {

    tag{name}
    publishDir "${params.out}/esimated_motifs/clustered_motifs/${name}/", mode: 'copy'

    input:
    set name, file (fasta) from fasta_for_glam2

    output:
    file("${name}/*.meme") into meme_to_merge
    set name, file("${name}/*.meme") into meme_for_tomtom
    set name, file("${name}/*.meme") into meme_for_filter
    file ('*')

    script:
    """
    glam2 n ${fasta} -O ./${name}/ -a ${params.motif_min_key} -b ${params.motif_max_key} -z 5 -n ${params.iteration}
    """
}

/*
Combining all MEME-files to one big MEME-file.
The paths are sorted numerically depending on the cluster number.
*/
process merge_meme {

    publishDir "${params.out}/esimated_motifs/merged_meme/", mode: 'copy'

    input:
    val (memelist) from meme_to_merge.toList()

    output:
    file ('merged_meme.meme') into merged_meme

    when:
    params.cluster_motif == 1

    script:
    memes = memelist.collect{it.toString().replaceAll(/\/glam2.meme/,"") }
    meme_sorted = memes.sort(false) { it.toString().tokenize('_')[-1] as Integer }
    meme_sorted_full = meme_sorted.collect {it.toString() + "/glam2.meme"}
    meme_list = meme_sorted_full.toString().replaceAll(/\,|\[|\]/,"")
    """
    meme2meme ${meme_list} > merged_meme.meme
    """
}

/*
Running Tomtom on merged meme-files.
The output table shows which clusters are similar to each other.
*/
process find_similar_motifs {

    publishDir "${params.out}/esimated_motifs/cluster_similarity/", mode: 'copy'
    input:
    file (merged_meme) from merged_meme

    output:
    file ('all_to_all.tsv') into motif_similarity

    when:
    params.cluster_motif == 1

    script:
    """
    tomtom ${merged_meme} ${merged_meme} -thresh ${params.motif_similarity_thresh} -text --norc | sed '/^#/ d' | sed '/^\$/d' > all_to_all.tsv
    """
}


files_for_merge_fasta = motif_similarity.combine(fasta_for_motif_cluster)


/*
Merging FASTA-files of similar clusters
*/
process merge_fasta {
    conda "${params.path_env}"
    publishDir "${params.out}/esimated_motifs/merged_fasta/", mode: 'copy'

    input:
    set file (motiv_sim), val (fasta_list) from files_for_merge_fasta

    output:
    file ('*.fasta') into motif_clustered_fasta_list
    file('*.png')

    when:
    params.cluster_motif == 1

    script:
    fa_sorted = fasta_list.sort(false) { it.getBaseName().tokenize('_')[-1] as Integer }
    fastalist = fa_sorted.toString().replaceAll(/\s|\[|\]/,"")
    """
    Rscript ${params.path_bin}/merge_similar_clusters.R ${motiv_sim} ${fastalist} ${params.edge_weight}
    """
}


motif_clustered_fasta_flat = motif_clustered_fasta_list.flatten()


process clustered_glam2 {

    publishDir "${params.out}/final_esimated_motifs/${name}/", mode: 'copy'

    input:
    file (fasta) from motif_clustered_fasta_flat

    output:
    set name, file ('*.meme') into clustered_meme_for_tomtom
    set name, file ('*.meme') into clustered_meme_for_filter
    file('*')

    when:
    params.cluster_motif == 1

    script:
    name = fasta.getBaseName()
    """
    glam2 n ${fasta} -O . -a ${params.motif_min_key} -b ${params.motif_max_key} -z 5 -n ${params.iteration}
    """
}

if(params.cluster_motif == 1){
    for_tomtom = clustered_meme_for_tomtom
    for_filter = clustered_meme_for_filter
} else {
    for_tomtom = meme_for_tomtom
    for_filter = meme_for_filter
}


/*
Running Tomtom on meme-files generated by GLAM2.
Tomtom searches for the motifs in the given motif database.
*/
process tomtom {

    tag{name}
    publishDir "${params.out}/esimated_motifs/tomtom/", mode: 'copy'

    input:
    set name, file (meme) from for_tomtom

    output:
    set name, file ('*.tsv') into tsv_for_filter

    script:
    """
    tomtom ${meme} ${params.motif_db} -thresh ${params.tomtom_treshold} -mi 1 -text | sed '/^#/ d' | sed '/^\$/d' > ${name}_known_motif.tsv
    """
}


//Join the channels with meme and tsv files, then filter the joined channel on line count.
//Only meme-files whose corresponding tsv files have a line count <= 1 are written to the next channel.
for_filter2 = for_filter.join( tsv_for_filter )
for_filter2
    .filter { name, meme, tsv ->
        long count = tsv.readLines().size()
        count <= 1
    }
    .into { meme_for_scan; check }


//If channel 'check' is empty, print an error message
process check_for_unknown_motifs {
    echo true

    input:
    val x from check.ifEmpty('EMPTY')

    when:
    x == 'EMPTY'

    """
    echo '>>> STOPPED: No unknown Motifs were found.'
    """

}
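The line-count filter a few lines up is where "unknown" is decided: tomtom runs in -text mode, sed strips the comment and empty lines, so a motif without any database hit leaves a TSV of at most one line. Sketched outside Nextflow, with invented file names and a hypothetical helper `unknown_motifs`, the same selection looks like this:

```python
# Rough Python sketch of the line-count filter above (illustrative only).
# A GLAM2 motif counts as unknown when its Tomtom table, after the sed
# clean-up, contains at most one line, i.e. no reported database match.
from pathlib import Path

def unknown_motifs(pairs):
    """pairs: iterable of (meme_path, tomtom_tsv_path) tuples."""
    for meme, tsv in pairs:
        if len(Path(tsv).read_text().splitlines()) <= 1:
            yield meme  # keep: no known motif matched this one

# e.g. list(unknown_motifs([("cluster_1.meme", "cluster_1_known_motif.tsv")]))
```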
+ """ + +} + + +//Get the best(first) Motif from each MEME-file +process get_best_motif { + conda "${params.path_env}" + + publishDir "${params.out}/esimated_motifs/unknown_motifs/", mode: 'copy' + + input: + set name, file(meme), file(tsv) from meme_for_scan + + output: + set name, file('*_best.meme') into best_motif + + script: + """ + python ${params.path_bin}/get_best_motif.py ${meme} ${name}_best.meme ${params.best_motif} + """ +} From 76b07f929fe19375166e7288c325a24a4dafa97f Mon Sep 17 00:00:00 2001 From: renewiegandt Date: Sat, 15 Dec 2018 23:09:35 +0100 Subject: [PATCH 152/158] Update pipeline.nf --- pipeline.nf | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/pipeline.nf b/pipeline.nf index ad4b199..71709f9 100644 --- a/pipeline.nf +++ b/pipeline.nf @@ -9,6 +9,7 @@ params.tfbs_path="" params.create_known_tfbs_path = "./" params.help = 0 + params.out = "./out/" //peak_calling params.window_length = 200 @@ -57,9 +58,9 @@ params.organism="hg38" params.tissue="" -if (params.bigwig == "" || params.bed == "" || params.genome_fasta == "" || params.motif_db == "" || params.config == "" || "${params.help}" == "1"){ +if (params.bigwig == "" || params.bed == "" || params.genome_fasta == "" || params.motif_db == "" || params.config == "" || "${params.help}" != "0"){ log.info """ -Usage: nextflow run pipeline.nf --bigwig [BigWig-file] --bed [BED-file] --genome_fasta [FASTA-file] --motif_db [MEME-file] +Usage: nextflow run pipeline.nf --bigwig [BigWig-file] --bed [BED-file] --genome_fasta [FASTA-file] --motif_db [MEME-file] --config [UROPA-config-file] Required arguments: --bigwig Path to BigWig-file @@ -69,9 +70,12 @@ Required arguments: --config Path to UROPA configuration file --create_known_tfbs_path Path to directory where output from tfbsscan (known motifs) are stored. Path can be set as tfbs_path in next run. (Default: './') + --out Output Directory (Default: './out/') + Optional arguments: - - --tfbs_path Path to directory with output from tfbsscan. If given tfbsscan will not be run. + + --help [0|1] 1 to show this help message. (Default: 0) + --tfbs_path Path to directory with output from tfbsscan. If given tfbsscan will not be run. Footprint extraction: --window_length INT This parameter sets the length of a sliding window. (Default: 200) From e0ec81bee9d9358d30bda5107eabe65ce75653a5 Mon Sep 17 00:00:00 2001 From: renewiegandt Date: Sat, 15 Dec 2018 23:10:12 +0100 Subject: [PATCH 153/158] Update README.md --- README.md | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/README.md b/README.md index a3c4947..1aa01e6 100644 --- a/README.md +++ b/README.md @@ -41,7 +41,7 @@ When the enviroment is created, set the variable 'path_env' in the configuration ## Quick Start ```console -nextflow run pipeline.nf --bigwig [BigWig-file] --bed [BED-file] --genome_fasta [FASTA-file] --motif_db [MEME-file] +nextflow run pipeline.nf --bigwig [BigWig-file] --bed [BED-file] --genome_fasta [FASTA-file] --motif_db [MEME-file] --config [UROPA-config-file] ``` ## Parameters For a detailed overview for all parameters follow this [link](https://github.molgen.mpg.de/loosolab/masterJLU2018/wiki/Configuration). @@ -54,26 +54,29 @@ Required arguments: --config Path to UROPA configuration file --create_known_tfbs_path Path to directory where output from tfbsscan (known motifs) are stored. Path can be set as tfbs_path in next run. 
From 76b07f929fe19375166e7288c325a24a4dafa97f Mon Sep 17 00:00:00 2001
From: renewiegandt
Date: Sat, 15 Dec 2018 23:09:35 +0100
Subject: [PATCH 152/158] Update pipeline.nf

---
 pipeline.nf | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/pipeline.nf b/pipeline.nf
index ad4b199..71709f9 100644
--- a/pipeline.nf
+++ b/pipeline.nf
@@ -9,6 +9,7 @@
 params.tfbs_path=""
 params.create_known_tfbs_path = "./"
 params.help = 0
+ params.out = "./out/"
 
 //peak_calling
 params.window_length = 200
@@ -57,9 +58,9 @@
 params.organism="hg38"
 params.tissue=""
 
 
-if (params.bigwig == "" || params.bed == "" || params.genome_fasta == "" || params.motif_db == "" || params.config == "" || "${params.help}" == "1"){
+if (params.bigwig == "" || params.bed == "" || params.genome_fasta == "" || params.motif_db == "" || params.config == "" || "${params.help}" != "0"){
 log.info """
-Usage: nextflow run pipeline.nf --bigwig [BigWig-file] --bed [BED-file] --genome_fasta [FASTA-file] --motif_db [MEME-file]
+Usage: nextflow run pipeline.nf --bigwig [BigWig-file] --bed [BED-file] --genome_fasta [FASTA-file] --motif_db [MEME-file] --config [UROPA-config-file]
 
 Required arguments:
     --bigwig        Path to BigWig-file
@@ -69,9 +70,12 @@ Required arguments:
     --config                 Path to UROPA configuration file
     --create_known_tfbs_path Path to directory where output from tfbsscan (known motifs) are stored.
                              Path can be set as tfbs_path in next run. (Default: './')
+    --out                    Output Directory (Default: './out/')
+
 Optional arguments:
-
-    --tfbs_path              Path to directory with output from tfbsscan. If given tfbsscan will not be run.
+
+    --help [0|1]             1 to show this help message. (Default: 0)
+    --tfbs_path              Path to directory with output from tfbsscan. If given, tfbsscan will not be run.
 
 Footprint extraction:
     --window_length INT      This parameter sets the length of a sliding window. (Default: 200)
From e0ec81bee9d9358d30bda5107eabe65ce75653a5 Mon Sep 17 00:00:00 2001
From: renewiegandt
Date: Sat, 15 Dec 2018 23:10:12 +0100
Subject: [PATCH 153/158] Update README.md

---
 README.md | 21 ++++++++++++---------
 1 file changed, 12 insertions(+), 9 deletions(-)

diff --git a/README.md b/README.md
index a3c4947..1aa01e6 100644
--- a/README.md
+++ b/README.md
@@ -41,7 +41,7 @@ When the environment is created, set the variable 'path_env' in the configuration
 
 ## Quick Start
 ```console
-nextflow run pipeline.nf --bigwig [BigWig-file] --bed [BED-file] --genome_fasta [FASTA-file] --motif_db [MEME-file]
+nextflow run pipeline.nf --bigwig [BigWig-file] --bed [BED-file] --genome_fasta [FASTA-file] --motif_db [MEME-file] --config [UROPA-config-file]
 ```
 ## Parameters
 For a detailed overview of all parameters follow this [link](https://github.molgen.mpg.de/loosolab/masterJLU2018/wiki/Configuration).
@@ -54,26 +54,29 @@ Required arguments:
     --config                 Path to UROPA configuration file
     --create_known_tfbs_path Path to directory where output from tfbsscan (known motifs) are stored. Path can be set as tfbs_path in next run.
                              (Default: './')
+    --out                    Output Directory (Default: './out/')
+
 Optional arguments:
-
-    --tfbs_path              Path to directory with output from tfbsscan. If given tfbsscan will not be run.
+    --help [0|1]             1 to show this help message. (Default: 0)
+    --tfbs_path              Path to directory with output from tfbsscan. If given, tfbsscan will not be run.
+
 Footprint extraction:
     --window_length INT      This parameter sets the length of a sliding window. (Default: 200)
     --step INT               This parameter sets the number of positions to slide the window forward. (Default: 100)
     --percentage INT         Threshold in percent (Default: 0)
-    
+
 Filter unknown motifs:
     --min_size_fp INT        Minimum sequence length threshold. Smaller sequences are discarded. (Default: 10)
     --max_size_fp INT        Maximum sequence length threshold. Discards all sequences longer than this value. (Default: 100)
-    
+
 Clustering:
   Sequence preparation/ reduction:
     --kmer INT               Kmer length (Default: 10)
     --aprox_motif_len INT    Motif length (Default: 10)
     --motif_occurence FLOAT  Percentage of motifs over all sequences. Use 1 (Default) to assume every sequence contains a motif.
     --min_seq_length INT     Remove all sequences below this value. (Default: 10)
-    
+
   Clustering:
     --global INT             Global (=1) or local (=0) alignment. (Default: 0)
     --identity FLOAT         Identity threshold. (Default: 0.8)
    --sequence_coverage INT  Minimum aligned nucleotides on both sequences. (Default: 8)
    --memory INT             Memory limit in MB. 0 for unlimited. (Default: 800)
    --throw_away_seq INT     Remove all sequences equal or below this length before clustering. (Default: 9)
    --strand INT             Align +/+ & +/- (= 1). Or align only +/+ (= 0). (Default: 0)
-    
+
 Motif estimation:
    --min_seq INT            Sets the minimum number of sequences required for the FASTA-files given to GLAM2. (Default: 100)
    --motif_min_key INT      Minimum number of key positions (aligned columns) in the alignment done by GLAM2. (Default: 8)
    --motif_max_key INT      Maximum number of key positions (aligned columns) in the alignment done by GLAM2. (Default: 24)
    --iteration INT          Number of iterations done by glam2. More iterations: better results, higher runtime. (Default: 10000)
    --tomtom_treshold float  Threshold for similarity score. (Default: 0.01)
    --best_motif INT         Get the best X motifs per cluster. (Default: 3)
-    
+
   Motif clustering:
    --cluster_motif Boolean  If 1, the pipeline clusters motifs; if 0, it does not. (Default: 0)
    --edge_weight INT        Minimum weight of edges in motif-cluster-graph (Default: 5)
    --motif_similarity_thresh FLOAT  Threshold for motif similarity score (Default: 0.00001)
-    
+
 Creating GTF:
    --organism [hg38 | hg19 | mm9 | mm10]  Input organism
    --tissues List/String    List of one or more keywords for tissue-/category-activity, categories must be specified as in JSON
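+
+Example call (file names are placeholders for illustration; any inputs of the
+matching types will do):
+
+    nextflow run pipeline.nf --bigwig signal.bw --bed peaks.bed --genome_fasta hg38.fa --motif_db motifs.meme --config uropa_config.json --out ./results/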
From 50fb433424cc63f14539cca2d46fd37f1f7fee8e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ren=C3=A9=20Wiegandt?=
Date: Sat, 15 Dec 2018 17:14:31 -0500
Subject: [PATCH 154/158] documentation

---
 bin/bed_to_fasta.R      | 6 +++---
 bin/motif_estimation.nf | 2 +-
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/bin/bed_to_fasta.R b/bin/bed_to_fasta.R
index 6cefe43..84910f3 100644
--- a/bin/bed_to_fasta.R
+++ b/bin/bed_to_fasta.R
@@ -14,14 +14,14 @@ min_seq <- args[3]
 
 bed <- data.table::fread(bedInput, header = FALSE, sep = "\t")
 
-clusters <- split(bed, bed$V11, sorted = TRUE, flatten = FALSE) # <---- Spalte mit Cluster
+clusters <- split(bed, bed$V11, sorted = TRUE, flatten = FALSE) # <---- Cluster column
 discard <- lapply(1:length(clusters), function(i){
   clust <- as.data.frame(clusters[i])
   print(nrow(clust))
   if (nrow(clust) >= as.numeric(min_seq) ) {
-    sequences <- as.list(clust[[10]]) # <---- Splate mit Sequenz
+    sequences <- as.list(clust[[10]]) # <---- Sequence column
     outfile <- paste0(prefix,"_cluster_",i,".FASTA")
-    seqinr::write.fasta(sequences = sequences, names = clust[[4]], file.out = outfile, as.string = TRUE) # <---- Spalte mit Name
+    seqinr::write.fasta(sequences = sequences, names = clust[[4]], file.out = outfile, as.string = TRUE) # <---- Name column
   } else {
     print(paste0("Cluster: ",i," is too small"))
   }
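+
+# Usage sketch (hypothetical arguments, for illustration): the script expects the
+# clustered BED file, an output prefix and the minimum cluster size, e.g.
+#   Rscript bed_to_fasta.R clustered.bed sample1 100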
diff --git a/bin/motif_estimation.nf b/bin/motif_estimation.nf
index f5b96e3..430040e 100644
--- a/bin/motif_estimation.nf
+++ b/bin/motif_estimation.nf
@@ -24,7 +24,7 @@ motif_similarity_thresh = 0.00001	// threshold for motif similarity score
 
 if (params.bed == "" || params.out == "" || params.motif_db == "" || params.path_env == "" || "${params.help}" == "1") {
 log.info """
-Usage: nextflow run pipeline.nf --bed [PATH] --out [PATH] --motif_db [PATH] --path_env YAML-file
+Usage: nextflow run motif_estimation.nf --bed [PATH] --out [PATH] --motif_db [PATH] --path_env YAML-file
 --bed		Path	Input BED-file (column with cluster: 11; column with sequence: 10)
 --out		Path	Output path
 --motif_db	Path	Path to MEME database
From c998452f74999ee34d8295dd289b9e09c0b415af Mon Sep 17 00:00:00 2001
From: renewiegandt
Date: Sat, 15 Dec 2018 23:44:59 +0100
Subject: [PATCH 155/158] Changes output path for create_gtf

---
 pipeline.nf | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pipeline.nf b/pipeline.nf
index 71709f9..fe031af 100644
--- a/pipeline.nf
+++ b/pipeline.nf
@@ -575,7 +575,7 @@ process cluster_quality {
 process create_GTF {
     conda "${path_env}"
 
-    publishDir 'Path', mode:'copy'
+    publishDir "${params.out}/gtf/", mode: 'copy'
 
     output:
     file ('*.gtf') into gtf_for_uropa
From a2f15b396f660934cf08a1b75bf8474d70389c32 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ren=C3=A9=20Wiegandt?=
Date: Sat, 15 Dec 2018 18:29:03 -0500
Subject: [PATCH 156/158] added complete info text

---
 pipeline.nf | 19 ++++++++++++++++---
 1 file changed, 16 insertions(+), 3 deletions(-)

diff --git a/pipeline.nf b/pipeline.nf
index fe031af..f5358a6 100644
--- a/pipeline.nf
+++ b/pipeline.nf
@@ -70,10 +70,10 @@ Required arguments:
     --config                 Path to UROPA configuration file
     --create_known_tfbs_path Path to directory where output from tfbsscan (known motifs) are stored.
                              Path can be set as tfbs_path in next run. (Default: './')
-    --out                    Output Directory (Default: './out/')
-    
+    --out                    Output Directory (Default: './out/')
+
 Optional arguments:
-    
+
     --help [0|1]             1 to show this help message. (Default: 0)
     --tfbs_path              Path to directory with output from tfbsscan. If given, tfbsscan will not be run.
@@ -634,3 +634,16 @@ process filter {
     """
     """
 }
 */
+
+workflow.onComplete {
+log.info """
+Pipeline execution summary
+---------------------------
+Completed at: ${workflow.complete}
+Duration    : ${workflow.duration}
+Success     : ${workflow.success}
+workDir     : ${workflow.workDir}
+exit status : ${workflow.exitStatus}
+Error report: ${workflow.errorReport ?: '-'}
+"""
+}
From e330274ec3230ba22a912d347eebfd12c67d7846 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ren=C3=A9=20Wiegandt?=
Date: Sun, 16 Dec 2018 08:32:17 -0500
Subject: [PATCH 157/158] bug fix

---
 bin/merge_similar_clusters.R | 12 ++++++++----
 pipeline.nf                  | 10 +++++-----
 2 files changed, 13 insertions(+), 9 deletions(-)

diff --git a/bin/merge_similar_clusters.R b/bin/merge_similar_clusters.R
index e72fa03..03b8cf1 100644
--- a/bin/merge_similar_clusters.R
+++ b/bin/merge_similar_clusters.R
@@ -1,7 +1,7 @@
 #!/usr/bin/env Rscript
 
 # Merging FASTA-files whose motifs are similar.
-# 
+#
 # @parameter tsv_in Path to TSV file generated by Tomtom.
 # The input for Tomtom is a meme-file merged from all clusters.
 # @parameter file_list Numerically sorted, whitespace-separated list of absolute fasta-file paths
@@ -10,7 +10,7 @@
 
 args = commandArgs(trailingOnly = TRUE)
 
-tsv_in <- args[1] 
+tsv_in <- args[1]
 file_list <- args[2]
 min_weight <- args[3]
 
@@ -38,7 +38,7 @@ edgelist <- sim_not_unique[query_cluster != target_cluster]
 g <- igraph::graph_from_edgelist(as.matrix(edgelist))
 # converting graph to adjacency matrix
 adj_matrix <- igraph::get.adjacency(g, names = T)
-# generating weighted graph from adjacency matrix 
+# generating weighted graph from adjacency matrix
 g_adj <- igraph::graph_from_adjacency_matrix(adj_matrix, weighted = T)
 
 # get subgraphs from graph with edges of weight > min_weight
@@ -47,7 +47,11 @@ png('motif_clusters.png')
 plot(s1)
 dev.off()
 clust <- igraph::clusters(s1)
-
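+# Fallback: no subgraph reached the minimum edge weight, so no clusters were
+# built; keep every FASTA file unmerged by copying it into the working directory.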
+if (clust$no < 1){
+  b <- lapply(files, function(f){
+    system(paste("cat",f,">",basename(f)))
+  })
+}
 # merge FASTA-files depending on the clustered graphs
 a <- lapply(seq(from = 1, to = clust$no, by = 1), function(i){
   cl <- as.vector(which(clust$membership %in% c(i)))
diff --git a/pipeline.nf b/pipeline.nf
index f5358a6..a39616a 100644
--- a/pipeline.nf
+++ b/pipeline.nf
@@ -124,7 +124,7 @@ All arguments can be set in the configuration files
 System.exit(2)
 } else {
     Channel.fromPath(params.bigwig).map {it -> [it.simpleName, it]}.set {bigwig_input}
-    Channel.fromPath(params.bed).set {bed_input}
+    Channel.fromPath(params.bed).into {bed_input; bed_for_tfbsscan}
     Channel.fromPath(params.genome_fasta).into {fa_overlap; fa_scan; fa_overlap_2}
     Channel.fromPath(params.motif_db).into {db_for_motivscan; db_for_tomtom}
     Channel.fromPath(params.config).set {config}
@@ -205,6 +205,7 @@ process footprint_extraction {
     """
 }
 
+for_tfbs = fa_overlap.combine(db_for_motivscan).combine(bed_for_tfbsscan)
 
 
 /*
@@ -216,8 +217,7 @@ process extract_known_TFBS {
     publishDir "${params.out}/known_TFBS/", mode: 'copy', pattern: '*.bed'
 
     input:
-    file (fasta) from fa_overlap
-    file (db) from db_for_motivscan
+    set file (fasta), file (db), file (bed) from for_tfbs
 
     output:
     val ('done') into known_TFBS_for_overlap
@@ -227,7 +227,7 @@ process extract_known_TFBS {
 
     script:
     """
-    python ${path_bin}/tfbsscan.py --use moods --core ${params.threads} -m ${db} -g ${fasta} -o ${params.create_known_tfbs_path}
+    python ${path_bin}/tfbsscan.py --use moods --core ${params.threads} -m ${db} -g ${fasta} -o ${params.create_known_tfbs_path} -b ${bed}
     """
 }
 
@@ -417,7 +417,7 @@ Merging FASTA-files of similar clusters
 process merge_fasta {
     conda "${path_env}"
     publishDir "${params.out}/esimated_motifs/merged_fasta/", mode: 'copy'
-
+    echo true
     input:
     set file (motiv_sim), val (fasta_list) from files_for_merge_fasta
 
From e4256a921958bf01fbd1eff63e3be6c2737bab6b Mon Sep 17 00:00:00 2001
From: JannikHamp
Date: Mon, 17 Dec 2018 23:04:42 +0100
Subject: [PATCH 158/158] $workdir/merged.bed -> "$workdir"/merged.bed

---
 bin/compareBed.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/bin/compareBed.sh b/bin/compareBed.sh
index db58a2c..aa2c37e 100644
--- a/bin/compareBed.sh
+++ b/bin/compareBed.sh
@@ -268,7 +268,7 @@ fi
 Rscript --vanilla $path/merge.R $min $max $workdir $data
 
 #5. add fasta sequences to bed and create fasta file
-bedtools getfasta -fi $fasta -bed $workdir/merged.bed -bedOut > $output
+bedtools getfasta -fi $fasta -bed "$workdir"/merged.bed -bedOut > $output
 bedtools getfasta -name -fi $fasta -bed "$output" -fo "$output".fasta
 
 #6 clean up
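+
+# Note on the quoting fix above: an unquoted $workdir is subject to word
+# splitting, so a working directory whose path contains spaces would break the
+# bedtools call; "$workdir"/merged.bed keeps the path intact.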