diff --git a/tobias/utils/peak_annotation.sh b/tobias/utils/peak_annotation.sh index 2f528c5..8d07a0c 100644 --- a/tobias/utils/peak_annotation.sh +++ b/tobias/utils/peak_annotation.sh @@ -14,21 +14,21 @@ usage() { echo -e "" echo -e "Usage: $0 [options]" echo -e "Mandatory" - echo -e " -i [] input bed file for annotation" - echo -e " -o [] output bed file with path" - echo -e " -g [] gtf file for annotation, make sure filter attribute suits" - + echo -e " -i|--bed [] input bed file for annotation" + echo -e " -o|--annotation [] peak annotation file" + echo -e " -g|--gtf [] gtf file for annotation, check filter attributes" echo -e "Optional" - echo -e " -p [basename bed _header.txt] header file" - echo -e " -l [basename bed .log] log file" - echo -e " -c [1] number of cores that should be used" - echo -e " -t [gene] feature for annotation" - echo -e " -r [start] feature anchor for distance calculation" - echo -e " -e [[10000,1000]] distance to feature, either one distance or distance interval" - echo -e " -a [gene_biotype] attribute for filtering, attention 'gene_biotype' in ensemble, but 'gene_type' in gencode" - echo -e " -w [protein_coding] attribute value for filtering" - echo -e " -n [gene_name,gene_id, ] show attributes for uropa output" - echo -e " -v print stdout only to log file" + echo -e " -p|--header [bed_header.txt] peak annotation header file" + echo -e " -l|--log [input.log] log file" + echo -e " -c|--cores [1] number of cores" + #echo -e " -j|--universe [] gene universe, can be used for enrichment" + echo -e "For UROPA params check: https://uropa-manual.readthedocs.io/config.html" + echo -e " -t|--feature [gene] UROPA param" + echo -e " -r|--feature_anchor [start] UROPA param" + echo -e " -e|--distance [[10000,1000]] UROPA param" + echo -e " -a|--filter_attribute [] UROPA param" + echo -e " -w|--attribute_value [] UROPA param" + echo -e " -n|--show_attribute [gene_name,gene_id,gene_biotype] UROPA param" exit 1 } export LC_ALL=C @@ -39,7 +39,7 @@ feature="gene" feature_anchor="start" distance=[10000,1000] #filter_attribute="gene_biotype" -#value_attribute="protein_coding" +#attribute_value="protein_coding" show_attribute='"gene_biotype", "gene_name", "gene_id"' # no option given @@ -50,8 +50,8 @@ fi # Process command line options progname=${0##*/} -shortopts="i:o:j:p:l:c:g:t:r:e:a:w:n:vh" -longopts="bed:,annotation:,universe:,header:,log:,cores:,gtf:,feature:,feature_anchor:,distance:,filter_attribute:,attribute_value:,show_attribute:,verbose" +shortopts="i:o:j:p:l:c:g:t:r:e:a:w:n:h" +longopts="bed:,annotation:,universe:,header:,log:,cores:,gtf:,feature:,feature_anchor:,distance:,filter_attribute:,attribute_value:,show_attribute:" opts=$(getopt -s bash --options "$shortopts" --longoptions "$longopts" --name "$progname" -- "$@") eval set -- ${opts} @@ -72,13 +72,10 @@ while true; do ( -a|--filter_attribute ) filter_attribute=$2; shift ;; ( -w|--attribute_value ) attribute_value=$2; shift ;; ( -n|--show_attribute ) show_attribute=$2; shift ;; - ( -v|--verbose ) verbose=yes ;; esac shift done - -echo -e "feature: $feature \ndistance: $distance \nshow_attribute: $show_attribute" - +# process params if [[ -z $bed ]] || [[ -z $annotation ]] || [[ -z $gtf ]]; then echo -e "bed file or gtf for annotation or output missing: mandatory!" usage @@ -99,14 +96,17 @@ fi prefix="$(dirname $annotation)/$(basename $annotation .bed)" json="${prefix}.json" -echo -e "Started peak annotation:" 2>&1 | tee -a $log -if [[ ! -z $filter_attribute ]] && [[ ! -z $value_attribute ]]; then +echo -e "Started peak annotation:" 2>&1 | tee -a $log +# create json config for uropa annotation +if [[ ! -z $filter_attribute ]] && [[ ! -z $attribute_value ]]; then cat > $json << EOL { "queries":[ - {"feature":"$feature", "feature.anchor":"$feature_anchor", "distance":$distance, "filter.attribute":"$filter_attribute", "attribute.value":"$value_attribute", "show.attributes":[$show_attribute]} + {"feature":"$feature", "feature.anchor":"$feature_anchor", "distance":$distance, "filter.attribute":"$filter_attribute", "attribute.value":"$attribute_value", "show.attributes":[$show_attribute]}, + {"feature":"$feature", "feature.anchor":"$feature_anchor", "distance":$distance, "show.attributes":[$show_attribute]} ], + "priority": "True", "gtf": "$gtf", "bed": "$bed" } @@ -123,29 +123,26 @@ else EOL fi - # uropa annotation echo -e "uropa -i $json -l $log -t $cores -p $prefix -d -s" 2>&1 | tee -a $log uropa -i $json -l $log -t $cores -p $prefix -d -s 2>&1 | tee -a $log # build peaks with annotation -# tr '**' '\t' | -echo -e "header" -#echo -e "tf_chr\ntf_start\ntf_end\ntf_name\ntf_score\ntf_strand" > $header +# create final header of all merged annotated attributes=$(($(head -1 ${prefix}_finalhits.txt | awk '{print NF}')-1)) -`head -1 ${prefix}_finalhits.txt | awk 'BEGIN { OFS = "\n" } ; { print $2,$3,$5,$1,$6,$7,$8,$9,$10,$11 }' - >> $header` +echo -e "attributes: $attributes" +`head -1 ${prefix}_finalhits.txt | awk 'BEGIN { OFS = "\n" } ; { print $2,$3,$5,$1,$6,$7,$8,$9,$10,$11 }' - > $header` `head -1 ${prefix}_finalhits.txt | cut -f15-${attributes} - | sed -e 's/\t/\n/g' >> $header` -#echo -e "fp_score" >> $header +# extract columns of interest `awk 'BEGIN { OFS = "\t" } ; { print $2,$3,$5,$1,$6,$7,$8,$9,$10,$11 }' ${prefix}_finalhits.txt | sed -e 1d | sed -e 's/\t\t/\t/g' > $annotation'.tmp'` cut -f15-${attributes} ${prefix}_finalhits.txt | sed -e 1d | paste -d"\t" $annotation'.tmp' - > $annotation sort -k1,1 -k2,2n $annotation > $annotation'.tmp' mv $annotation'.tmp' $annotation -rm $annotation'.tmp' - -echo -e "universe" -col=$(head -1 ${prefix}_allhits.txt | tr -s '\t' '\n' | nl -nln | grep "gene_id" | cut -f1 | tail -1) -cut -f${col} ${prefix}_allhits.txt | grep -v "NA" | uniq > $universe +# add in case of go enrichment analysis +#echo -e "universe" +#col=$(head -1 ${prefix}_allhits.txt | tr -s '\t' '\n' | nl -nln | grep "gene_id" | cut -f1 | tail -1) +#cut -f${col} ${prefix}_allhits.txt | grep -v "NA" | uniq > $universe echo -e "Finished peak annotation:" 2>&1 | tee -a $log