-
Notifications
You must be signed in to change notification settings - Fork 0
updated script. Now the path of subscripts is automatically aquired. … #33
Changes from 8 commits
694291c
3ebb1b8
7d834a4
7de9a1b
1a49821
9b5b061
9ecba8a
58eff98
b1cab10
cca8ac4
f983d22
1134d74
37c69e6
6802d72
b787d14
becfeae
c55e8ff
4acc20f
90c8c05
1a35ce5
530627c
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,11 +1,11 @@ | ||
#!/bin/bash | ||
#data | ||
#motifs | ||
#workdir | ||
#fasta | ||
#min | ||
#max | ||
#output | ||
|
||
# This script utilizes bedtools to gain non-overlapping sequence parts between bed-files | ||
# merge.R and maxScore.R are needed to be saved in the same directory than this to make it work | ||
# For more information read the wiki or run ./compareBed.sh without parameters | ||
# author Jannik Hamp | ||
# jannik.hamp@googlemail.com | ||
|
||
wrong=() | ||
da=false | ||
mo=false | ||
|
@@ -14,13 +14,16 @@ fa=false | |
mi=false | ||
ma=false | ||
ou=false | ||
pa=false | ||
he=false | ||
|
||
#no parameters chosen means help is going to be displayed | ||
if [ $# -eq 0 ] | ||
then | ||
he=true | ||
fi | ||
|
||
#parsing the parameters, if a parameter is chosen, a value must be provided aswell. No standalone parameters | ||
while [[ $# -gt 0 ]] | ||
HendrikSchultheis marked this conversation as resolved.
Show resolved
Hide resolved
|
||
do | ||
key="$1" | ||
|
@@ -70,6 +73,7 @@ case $key in | |
;; | ||
-p|--path) | ||
path="$2" | ||
pa=true | ||
shift | ||
shift | ||
;; | ||
|
@@ -85,6 +89,7 @@ case $key in | |
esac | ||
done | ||
|
||
# stores unknown selected parameters for error report | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It would be nicer to write something like "check for unknown arguments". |
||
count=${#wrong[@]} | ||
if [ $count -gt 0 ] | ||
then | ||
|
@@ -97,6 +102,7 @@ then | |
exit 1 | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. missing indentation |
||
fi | ||
|
||
# the help message | ||
if [ $he == true ] | ||
then | ||
echo "This script utilies bedtools to select new footprints from data." | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. utilizes There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. both in bed-format and the (no comma) |
||
|
@@ -117,12 +123,14 @@ then | |
echo " -w --workdir the path to directory where temporary files will be created" | ||
echo " default is the current directory" | ||
echo " -min --min minimum size of footprints; default is 10" | ||
echo " -max --max maximum size of footprints; default is 80" | ||
echo " -max --max maximum size of footprints; default is 200" | ||
echo " -o --output output path/file ; default dir is workdir and filename is newMotifs.bed and newMotifs.bed.fasta" | ||
echo " -h --help shows this help message" | ||
echo " -p --path the path where the required scripts merge.R and maxScore.R are stored. Default: same path as this scripts path" | ||
exit 0 | ||
fi | ||
|
||
# prints which parameters have been selected | ||
echo selected parameters | ||
echo ------------------- | ||
echo data: $da \(required\) | ||
|
@@ -133,24 +141,24 @@ echo min: $mi | |
echo max: $ma | ||
echo output: $ou | ||
echo help: $he | ||
echo path of scripts: $pa | ||
|
||
#checks if the 3 required parameters have been selected | ||
if [ $da == false ] || [ $mo == false ] || [ $fa == false ] | ||
then | ||
echo required parameters not given. | ||
echo required are: --data \<path/data.bed\> --motifs \<path/motifs.bed\> --fasta \<path/file.fasta\> | ||
exit 1 | ||
fi | ||
|
||
#workdir is set to the current directory if it was not specified with a parameter | ||
if [ $wo == false ] | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. You should set $workdir in the beginning to './' and overwrite it if necessary. This would save you this whole if statement + an additional $wo variable. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Looking a bit further I think that most if not all of your if statements in this part are overcomplicated. You should set default values at the beginning of your script. This would enable you to discard most of the if statements and leave you only with the required parameters. These should be checked for their existence or in other words whether they were set (https://stackoverflow.com/questions/3601515/how-to-check-if-a-variable-is-set-in-bash). |
||
then | ||
#if [ ! -d workdir1337 ] | ||
#then | ||
# mkdir workdir1337 | ||
#fi | ||
wo=true | ||
workdir=$path | ||
workdir=$PWD | ||
fi | ||
|
||
#if output was not specified it will be at "$workdir"/newMotifs.bed | ||
if [ $ou == false ] | ||
then | ||
output=${workdir}/"newMotifs.bed" | ||
|
@@ -165,18 +173,25 @@ fi | |
|
||
if [ $ma == false ] | ||
then | ||
max=80 | ||
max=200 | ||
ma=true | ||
fi | ||
|
||
if [ ! -d $workdir ] | ||
#default path of the scripts merge.R and maxScore.R is the same path as compareBed.sh has | ||
# the `echo $0 | sed 's/\/[^\/]*$/\//g'` command extracts the path from $0, where the command itself is stored | ||
if [ $pa == false ] | ||
then | ||
mkdir $workdir | ||
path=`echo $0 | sed 's/\/[^\/]*$/\//g'` | ||
pa=true | ||
fi | ||
|
||
#1. first filter. no overlap vs. overlap | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. "Filter for sequences without overlap" would be a nicer comment. |
||
# This is done with the data of new footprints and each motif file subsequently. | ||
# The output of one iteration is the input data for the new footprints of the next iteration, in which a new motif is checked. | ||
echo get sequences with no overlap | ||
cat $data > "$workdir"/pass1Tr.bed | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can you please rename 'pass1Tr.bed' ('pass1Trhelp.bed' aswell) to something more descriptive. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Rename pass1Fa.bed, pass1FaHelp.bed, etc. |
||
# help variable is needed, because in bash I cannot write to the same file from which I am also reading the data. | ||
# Thus, there are 2 files which are alternating in each iteration | ||
help=true | ||
|
||
if [ -d "$motifs" ] | ||
|
@@ -193,6 +208,7 @@ do | |
fi | ||
echo $i | ||
done | ||
# if the -m parameter is not a directory, it is expected to be a comma separated list of files | ||
else | ||
declare -a motiffiles=(`echo $motifs | sed 's/,/ /g'`) | ||
for i in ${motiffiles[@]} | ||
|
@@ -209,6 +225,7 @@ do | |
done | ||
fi | ||
|
||
# After the final iteration, the last output is written to pass1Tr.bed if it is not already there | ||
if [ $help == false ] | ||
then | ||
cat "$workdir"/pass1TrHelp.bed > "$workdir"/pass1Tr.bed | ||
|
@@ -255,6 +272,7 @@ do | |
done | ||
fi | ||
|
||
#The output of the last iteration of the subtract loop is written to pass2Tr.bed | ||
if [ $help == false ] | ||
then | ||
cat "$workdir"/pass1FaHelp.bed > "$workdir"/pass2Tr.bed | ||
|
@@ -268,8 +286,5 @@ fi | |
Rscript --vanilla $path/merge.R $min $max $workdir $data | ||
|
||
#5. add fasta sequences to bed and create fasta file | ||
bedtools getfasta -fi $fasta -bed "$workdir"/merged.bed -bedOut > $output | ||
bedtools getfasta -fi $fasta -bed $workdir/merged.bed -bedOut > $output | ||
bedtools getfasta -name -fi $fasta -bed "$output" -fo "$output".fasta | ||
|
||
#6 clean up | ||
#rm "$workdir"/pass1Fa.bed "$workdir"/pass1Tr.bed "$workdir"/pass2Tr.bed "$workdir"/merged.bed "$workdir"/pass1FaHelp.bed "$workdir"/pass1TrHelp.bed |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,10 +1,16 @@ | ||
#!/bin/Rscript | ||
|
||
# The script is used with the script: compareBed.sh | ||
# This calculates the absolute maxpos values of a bed-file and overwrites them | ||
# author: Jannik Hamp | ||
# email: jannik.hamp@googlemail.com | ||
|
||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Please write something about your input parameter. |
||
library(data.table) | ||
args = commandArgs(TRUE) | ||
file = args[1] | ||
|
||
tab = fread(file) | ||
colnames(tab) = c("chromosome", "start", "stop", "id", "score", "length", "maxpos", "info") | ||
colnames(tab) = c("chromosome", "start", "stop", "id", "score", "strand", "length", "maxpos", "info") | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This script fails if your table has not exactly 9 columns! Please change it to be more robust. |
||
|
||
tab$maxpos = tab$start + tab$maxpos | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is the data.frame way of combining columns. As you already use a data.table please try using the := function. |
||
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,29 +1,50 @@ | ||
#!/bin/Rscript | ||
|
||
# The script is used in the script: compareBed.sh | ||
# This script merges the non-overlapping regions and the non-overlapping parts of overlapping regions | ||
# It also removes sequences that are smaller than the --min paramter or bigger than the --max parameter | ||
# Additionally, information of the comparison is written into an output file | ||
# author Jannik Hamp | ||
# email jannik.hamp@googlemail.com | ||
|
||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Describe input parameters and outputs. |
||
library(data.table) | ||
args=commandArgs(TRUE) | ||
min=as.numeric(args[1]) | ||
max=as.numeric(args[2]) | ||
folder=args[3] | ||
|
||
# reading the first dataframe: called splitted (contains splitted regions because of partial overlap) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It's a data.table. |
||
splitted = fread(paste(folder, "/pass2Tr.bed", sep='')) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Use file.path instead of paste. |
||
colnames(splitted) = c("chromosome", "start", "stop", "id", "score", "length", "maxpos", "info") | ||
colnames(splitted) = c("chromosome", "start", "stop", "id", "score", "strand", "length", "maxpos", "info") | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is not robust see comment above. |
||
|
||
# reading the second dataframe: called p1 (all sequences with zero overlap) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Still a data.table. |
||
p1 = fread(paste(folder, "/pass1Tr.bed", sep='')) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. paste -> file.path |
||
colnames(p1) = c("chromosome", "start", "stop", "id", "score", "length", "maxpos", "info") | ||
colnames(p1) = c("chromosome", "start", "stop", "id", "score", "strand", "length", "maxpos", "info") | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Not robust. |
||
|
||
# calculates the absolute position of maximum signal of each footprint | ||
p1$maxpos = p1$start + p1$maxpos | ||
|
||
# the new combined dataframe is now called splitted | ||
splitted=rbind(splitted, p1) | ||
|
||
# only keep entries with a larger sequence than min and a smaller sequence than max | ||
splitted=splitted[which(splitted$stop - splitted$start >= min),] | ||
splitted=splitted[which(splitted$stop - splitted$start <= max),] | ||
|
||
# make the ids unique (because of duplicated ids of some footprints that got spliited in 2) | ||
splitted$id=make.unique(as.character(splitted$id)) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. make.unique only appends numbers to duplicates leaving the first (of the duplicates) as is but we want every duplicate to get a number. (I think that already was discussed at some point.) |
||
# calculate new length values (because of the splitted footprints) | ||
splitted$length=splitted$stop - splitted$start | ||
|
||
# add column containsMaxpos (0 means that maxpos has any overlap and 1 means that maxpos has no overlap with any motif) | ||
splitted=cbind(splitted, containsMaxpos=0) | ||
splitted$containsMaxpos[intersect(which(splitted$start <= splitted$maxpos), which(splitted$stop > splitted$maxpos))] = 1 | ||
|
||
#calculate relative maxpos values | ||
splitted$maxpos = splitted$maxpos - splitted$start | ||
data.table::fwrite(splitted, paste(folder, "/merged.bed", sep=''), row.names=FALSE, col.names=FALSE, quote=FALSE, sep='\t') | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. If you call |
||
|
||
#aditional information of the comparison of the unknown footprints and the known motifs are computed and written to FilterMotifs.stats | ||
before = fread(args[4], header=FALSE) | ||
|
||
sumb=sum(before$V3-before$V2) | ||
|
@@ -33,4 +54,4 @@ loss = formatC(1 - suma/sumb, digits=2) | |
lengthb = formatC(mean(before$V3-before$V2), digits=4) | ||
lengtha = formatC(mean(splitted$length), digits=4) | ||
stats=data.frame(sum_nt_input=sumb, sum_nt_filtered=suma, factor=difference, loss=loss, mean_length_input=lengthb, mean_length_filtered=lengtha, flag_1_ratio=length(which(splitted$containsMaxpos == 1))/dim(splitted)[1]) | ||
write.table(stats, "../FilterMotifs.stats", row.names=FALSE, quote=FALSE, sep='\t') | ||
write.table(stats, "./FilterMotifs.stats", row.names=FALSE, quote=FALSE, sep='\t') |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
...same directory as ...