extend uropa config

loosolab · Dec 21, 2018 · 34a37a1 · 34a37a1
1 parent 2ebf8ad
commit 34a37a1
Showing 1 changed file with 32 additions and 35 deletions.
diff --git a/tobias/utils/peak_annotation.sh b/tobias/utils/peak_annotation.sh
@@ -14,21 +14,21 @@ usage() {
 	echo -e ""
 	echo -e "Usage: $0 [options]"
 	echo -e "Mandatory"
-	echo -e "	-i <string> []			input bed file for annotation"
-	echo -e "	-o <string> []			output bed file with path"
-	echo -e "	-g <string> []			gtf file for annotation, make sure filter attribute suits"
-
+	echo -e "	-i|--bed <string> []			input bed file for annotation"
+	echo -e "	-o|--annotation <string> []		peak annotation file"
+	echo -e "	-g|--gtf <string> []			gtf file for annotation, check filter attributes"
 	echo -e "Optional"
-	echo -e "	-p <string> [basename bed _header.txt]	header file"
-	echo -e "	-l <string> [basename bed .log]	log file"
-	echo -e "	-c <int> [1]			number of cores that should be used"
-	echo -e "	-t <string> [gene]		feature for annotation"
-	echo -e "	-r <string> [start]		feature anchor for distance calculation"
-	echo -e "	-e <string> [[10000,1000]]	distance to feature, either one distance or distance interval"
-	echo -e "	-a <string> [gene_biotype]	attribute for filtering, attention 'gene_biotype' in ensemble, but 'gene_type' in gencode"
-	echo -e "	-w <string> [protein_coding]	attribute value for filtering"
-	echo -e "	-n <string> [gene_name,gene_id, ]	show attributes for uropa output"
-	echo -e "	-v				print stdout only to log file"
+	echo -e "	-p|--header <string> [bed_header.txt]	peak annotation header file"
+	echo -e "	-l|--log <string> [input.log]		log file"
+	echo -e "	-c|--cores <int> [1]			number of cores"
+	#echo -e "	-j|--universe <string> [] gene universe, can be used for enrichment"
+	echo -e "For UROPA params check: https://uropa-manual.readthedocs.io/config.html"
+	echo -e "	-t|--feature <string> [gene]		UROPA param"
+	echo -e "	-r|--feature_anchor <string> [start]	UROPA param"
+	echo -e "	-e|--distance <string> [[10000,1000]]	UROPA param"
+	echo -e "	-a|--filter_attribute <string> []	UROPA param"
+	echo -e "	-w|--attribute_value <string> []	UROPA param"
+	echo -e "	-n|--show_attribute <string> [gene_name,gene_id,gene_biotype]	UROPA param"
 	exit 1
 }
 export LC_ALL=C
@@ -39,7 +39,7 @@ feature="gene"
 feature_anchor="start"
 distance=[10000,1000]
 #filter_attribute="gene_biotype"
-#value_attribute="protein_coding"
+#attribute_value="protein_coding"
 show_attribute='"gene_biotype", "gene_name", "gene_id"'
 
 # no option given
@@ -50,8 +50,8 @@ fi
 
 # Process command line options
 progname=${0##*/}
-shortopts="i:o:j:p:l:c:g:t:r:e:a:w:n:vh"
-longopts="bed:,annotation:,universe:,header:,log:,cores:,gtf:,feature:,feature_anchor:,distance:,filter_attribute:,attribute_value:,show_attribute:,verbose"
+shortopts="i:o:j:p:l:c:g:t:r:e:a:w:n:h"
+longopts="bed:,annotation:,universe:,header:,log:,cores:,gtf:,feature:,feature_anchor:,distance:,filter_attribute:,attribute_value:,show_attribute:"
 opts=$(getopt -s bash --options "$shortopts" --longoptions "$longopts" --name "$progname" -- "$@")
 
 eval set -- ${opts}
@@ -72,13 +72,10 @@ while true; do
         ( -a|--filter_attribute	) filter_attribute=$2; shift ;;
         ( -w|--attribute_value  ) attribute_value=$2; shift ;;
         ( -n|--show_attribute   ) show_attribute=$2; shift ;;
-        ( -v|--verbose          ) verbose=yes ;;
     esac
     shift
 done
-
-echo -e "feature: $feature \ndistance: $distance \nshow_attribute: $show_attribute"
-
+# process params
 if [[ -z $bed ]] || [[ -z $annotation ]] || [[ -z $gtf ]]; then
 	echo -e "bed file or gtf for annotation or output missing: mandatory!" 
 	usage
@@ -99,14 +96,17 @@ fi
 prefix="$(dirname $annotation)/$(basename $annotation .bed)"
 json="${prefix}.json"
 
-echo -e "Started peak annotation:" 2>&1 | tee -a $log
 
-if [[ ! -z $filter_attribute ]] && [[ ! -z $value_attribute ]]; then
+echo -e "Started peak annotation:" 2>&1 | tee -a $log
+# create json config for uropa annotation
+if [[ ! -z $filter_attribute ]] && [[ ! -z $attribute_value ]]; then
 	cat > $json << EOL
 {
 	"queries":[
-	{"feature":"$feature", "feature.anchor":"$feature_anchor", "distance":$distance, "filter.attribute":"$filter_attribute", "attribute.value":"$value_attribute", "show.attributes":[$show_attribute]}
+	{"feature":"$feature", "feature.anchor":"$feature_anchor", "distance":$distance, "filter.attribute":"$filter_attribute", "attribute.value":"$attribute_value", "show.attributes":[$show_attribute]},
+	{"feature":"$feature", "feature.anchor":"$feature_anchor", "distance":$distance, "show.attributes":[$show_attribute]}
 	],
+	"priority": "True",
 	"gtf": "$gtf",
 	"bed": "$bed"
 }
@@ -123,29 +123,26 @@ else
 EOL
 fi
 
-
 # uropa annotation
 echo -e "uropa -i $json -l $log -t $cores -p $prefix -d -s" 2>&1 | tee -a $log
 uropa -i $json -l $log -t $cores -p $prefix -d -s 2>&1 | tee -a $log
 
 
 # build peaks with annotation
-# tr '**' '\t' |
-echo -e "header"
-#echo -e "tf_chr\ntf_start\ntf_end\ntf_name\ntf_score\ntf_strand" > $header
+# create final header of all merged annotated
 attributes=$(($(head -1 ${prefix}_finalhits.txt | awk '{print NF}')-1))
-`head -1 ${prefix}_finalhits.txt | awk 'BEGIN { OFS = "\n" } ; { print $2,$3,$5,$1,$6,$7,$8,$9,$10,$11 }' - >> $header`
+echo -e "attributes: $attributes"
+`head -1 ${prefix}_finalhits.txt | awk 'BEGIN { OFS = "\n" } ; { print $2,$3,$5,$1,$6,$7,$8,$9,$10,$11 }' - > $header`
 `head -1 ${prefix}_finalhits.txt | cut -f15-${attributes} - | sed -e 's/\t/\n/g' >> $header`
-#echo -e "fp_score" >> $header
+# extract columns of interest
 `awk 'BEGIN { OFS = "\t" } ; { print $2,$3,$5,$1,$6,$7,$8,$9,$10,$11 }' ${prefix}_finalhits.txt | sed -e 1d |  sed -e 's/\t\t/\t/g' > $annotation'.tmp'` 	
 cut -f15-${attributes} ${prefix}_finalhits.txt | sed -e 1d | paste -d"\t" $annotation'.tmp' - > $annotation
 sort -k1,1 -k2,2n $annotation > $annotation'.tmp'
 mv $annotation'.tmp' $annotation
-rm $annotation'.tmp'
-
 
-echo -e "universe"
-col=$(head -1 ${prefix}_allhits.txt | tr -s '\t' '\n' | nl -nln |  grep "gene_id" | cut -f1 | tail -1)
-cut -f${col} ${prefix}_allhits.txt | grep -v "NA" | uniq > $universe
+# add in case of go enrichment analysis
+#echo -e "universe"
+#col=$(head -1 ${prefix}_allhits.txt | tr -s '\t' '\n' | nl -nln |  grep "gene_id" | cut -f1 | tail -1)
+#cut -f${col} ${prefix}_allhits.txt | grep -v "NA" | uniq > $universe
 
 echo -e "Finished peak annotation:" 2>&1 | tee -a $log