Skip to content
This repository has been archived by the owner. It is now read-only.

Commit

Permalink
extend uropa config
Browse files Browse the repository at this point in the history
  • Loading branch information
afust committed Dec 21, 2018
1 parent 2ebf8ad commit 34a37a1
Showing 1 changed file with 32 additions and 35 deletions.
67 changes: 32 additions & 35 deletions tobias/utils/peak_annotation.sh
Original file line number Diff line number Diff line change
Expand Up @@ -14,21 +14,21 @@ usage() {
echo -e ""
echo -e "Usage: $0 [options]"
echo -e "Mandatory"
echo -e " -i <string> [] input bed file for annotation"
echo -e " -o <string> [] output bed file with path"
echo -e " -g <string> [] gtf file for annotation, make sure filter attribute suits"

echo -e " -i|--bed <string> [] input bed file for annotation"
echo -e " -o|--annotation <string> [] peak annotation file"
echo -e " -g|--gtf <string> [] gtf file for annotation, check filter attributes"
echo -e "Optional"
echo -e " -p <string> [basename bed _header.txt] header file"
echo -e " -l <string> [basename bed .log] log file"
echo -e " -c <int> [1] number of cores that should be used"
echo -e " -t <string> [gene] feature for annotation"
echo -e " -r <string> [start] feature anchor for distance calculation"
echo -e " -e <string> [[10000,1000]] distance to feature, either one distance or distance interval"
echo -e " -a <string> [gene_biotype] attribute for filtering, attention 'gene_biotype' in ensemble, but 'gene_type' in gencode"
echo -e " -w <string> [protein_coding] attribute value for filtering"
echo -e " -n <string> [gene_name,gene_id, ] show attributes for uropa output"
echo -e " -v print stdout only to log file"
echo -e " -p|--header <string> [bed_header.txt] peak annotation header file"
echo -e " -l|--log <string> [input.log] log file"
echo -e " -c|--cores <int> [1] number of cores"
#echo -e " -j|--universe <string> [] gene universe, can be used for enrichment"
echo -e "For UROPA params check: https://uropa-manual.readthedocs.io/config.html"
echo -e " -t|--feature <string> [gene] UROPA param"
echo -e " -r|--feature_anchor <string> [start] UROPA param"
echo -e " -e|--distance <string> [[10000,1000]] UROPA param"
echo -e " -a|--filter_attribute <string> [] UROPA param"
echo -e " -w|--attribute_value <string> [] UROPA param"
echo -e " -n|--show_attribute <string> [gene_name,gene_id,gene_biotype] UROPA param"
exit 1
}
export LC_ALL=C
Expand All @@ -39,7 +39,7 @@ feature="gene"
feature_anchor="start"
distance=[10000,1000]
#filter_attribute="gene_biotype"
#value_attribute="protein_coding"
#attribute_value="protein_coding"
show_attribute='"gene_biotype", "gene_name", "gene_id"'

# no option given
Expand All @@ -50,8 +50,8 @@ fi

# Process command line options
progname=${0##*/}
shortopts="i:o:j:p:l:c:g:t:r:e:a:w:n:vh"
longopts="bed:,annotation:,universe:,header:,log:,cores:,gtf:,feature:,feature_anchor:,distance:,filter_attribute:,attribute_value:,show_attribute:,verbose"
shortopts="i:o:j:p:l:c:g:t:r:e:a:w:n:h"
longopts="bed:,annotation:,universe:,header:,log:,cores:,gtf:,feature:,feature_anchor:,distance:,filter_attribute:,attribute_value:,show_attribute:"
opts=$(getopt -s bash --options "$shortopts" --longoptions "$longopts" --name "$progname" -- "$@")

eval set -- ${opts}
Expand All @@ -72,13 +72,10 @@ while true; do
( -a|--filter_attribute ) filter_attribute=$2; shift ;;
( -w|--attribute_value ) attribute_value=$2; shift ;;
( -n|--show_attribute ) show_attribute=$2; shift ;;
( -v|--verbose ) verbose=yes ;;
esac
shift
done

echo -e "feature: $feature \ndistance: $distance \nshow_attribute: $show_attribute"

# process params
if [[ -z $bed ]] || [[ -z $annotation ]] || [[ -z $gtf ]]; then
echo -e "bed file or gtf for annotation or output missing: mandatory!"
usage
Expand All @@ -99,14 +96,17 @@ fi
prefix="$(dirname $annotation)/$(basename $annotation .bed)"
json="${prefix}.json"

echo -e "Started peak annotation:" 2>&1 | tee -a $log

if [[ ! -z $filter_attribute ]] && [[ ! -z $value_attribute ]]; then
echo -e "Started peak annotation:" 2>&1 | tee -a $log
# create json config for uropa annotation
if [[ ! -z $filter_attribute ]] && [[ ! -z $attribute_value ]]; then
cat > $json << EOL
{
"queries":[
{"feature":"$feature", "feature.anchor":"$feature_anchor", "distance":$distance, "filter.attribute":"$filter_attribute", "attribute.value":"$value_attribute", "show.attributes":[$show_attribute]}
{"feature":"$feature", "feature.anchor":"$feature_anchor", "distance":$distance, "filter.attribute":"$filter_attribute", "attribute.value":"$attribute_value", "show.attributes":[$show_attribute]},
{"feature":"$feature", "feature.anchor":"$feature_anchor", "distance":$distance, "show.attributes":[$show_attribute]}
],
"priority": "True",
"gtf": "$gtf",
"bed": "$bed"
}
Expand All @@ -123,29 +123,26 @@ else
EOL
fi


# uropa annotation
echo -e "uropa -i $json -l $log -t $cores -p $prefix -d -s" 2>&1 | tee -a $log
uropa -i $json -l $log -t $cores -p $prefix -d -s 2>&1 | tee -a $log


# build peaks with annotation
# tr '**' '\t' |
echo -e "header"
#echo -e "tf_chr\ntf_start\ntf_end\ntf_name\ntf_score\ntf_strand" > $header
# create final header of all merged annotated
attributes=$(($(head -1 ${prefix}_finalhits.txt | awk '{print NF}')-1))
`head -1 ${prefix}_finalhits.txt | awk 'BEGIN { OFS = "\n" } ; { print $2,$3,$5,$1,$6,$7,$8,$9,$10,$11 }' - >> $header`
echo -e "attributes: $attributes"
`head -1 ${prefix}_finalhits.txt | awk 'BEGIN { OFS = "\n" } ; { print $2,$3,$5,$1,$6,$7,$8,$9,$10,$11 }' - > $header`
`head -1 ${prefix}_finalhits.txt | cut -f15-${attributes} - | sed -e 's/\t/\n/g' >> $header`
#echo -e "fp_score" >> $header
# extract columns of interest
`awk 'BEGIN { OFS = "\t" } ; { print $2,$3,$5,$1,$6,$7,$8,$9,$10,$11 }' ${prefix}_finalhits.txt | sed -e 1d | sed -e 's/\t\t/\t/g' > $annotation'.tmp'`
cut -f15-${attributes} ${prefix}_finalhits.txt | sed -e 1d | paste -d"\t" $annotation'.tmp' - > $annotation
sort -k1,1 -k2,2n $annotation > $annotation'.tmp'
mv $annotation'.tmp' $annotation
rm $annotation'.tmp'


echo -e "universe"
col=$(head -1 ${prefix}_allhits.txt | tr -s '\t' '\n' | nl -nln | grep "gene_id" | cut -f1 | tail -1)
cut -f${col} ${prefix}_allhits.txt | grep -v "NA" | uniq > $universe
# add in case of go enrichment analysis
#echo -e "universe"
#col=$(head -1 ${prefix}_allhits.txt | tr -s '\t' '\n' | nl -nln | grep "gene_id" | cut -f1 | tail -1)
#cut -f${col} ${prefix}_allhits.txt | grep -v "NA" | uniq > $universe

echo -e "Finished peak annotation:" 2>&1 | tee -a $log

0 comments on commit 34a37a1

Please sign in to comment.