Skip to content
Permalink
main
Switch branches/tags

Name already in use

A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
Go to file
 
 
Cannot retrieve contributors at this time
#!/bin/bash
# This script assumes that trf (tandem repeat finder) is installed and available in PATH
MAIN_DIR="$1"
MINSCORE=${2-"150"}
MIN_REGION_FRACTION=${3-"0.2"}
REGION_SAVE_PREFIX=${4-"TRF_results"}
# more TRF parameters
MATCH=2
MISMATCH=5
DELTA=7
PM=80
PI=10
MAXPERIOD=500
TMP_HG38_FILE="hg38_for_trf.fa"
TRF_OUTPUT="${TMP_HG38_FILE}.$MATCH.$MISMATCH.$DELTA.$PM.$PI.$MINSCORE.$MAXPERIOD.dat"
TR_CNT=0
WORKED_REGIONS=0
for region in "$MAIN_DIR"/chr*;
do
region_name=$(basename "$region")
start=$(cut -d_ -f2 <<< "$region_name")
end=$(cut -d_ -f3 <<< "$region_name")
region_len=$((end-start))
#echo "$start" "$end" "$region_len"
# isolate one reference sequence
sequence_file="$region/comparison_candidates.fa"
[[ ! -e $sequence_file ]] && continue
grep -A1 "hg38" "$sequence_file" > "$TMP_HG38_FILE"
# call trf, redirect all normal output to null
trf "$TMP_HG38_FILE" "$MATCH" "$MISMATCH" "$DELTA" "$PM" "$PI" "$MINSCORE" "$MAXPERIOD" -h &> /dev/null
# do analysis on found TRs
seq_len=$(grep -v "hg38" "$TMP_HG38_FILE" | wc -c)
conflict_start=$((seq_len / 2 - region_len / 2))
conflict_end=$((seq_len / 2 + region_len / 2))
N_MATCHING_TRS=$(
awk -v start=$conflict_start -v end=$conflict_end -v frac="$MIN_REGION_FRACTION" '{
if (NF == 15) {
if (! ($1 > end || $2 < start) && ($2-$1 > (end-start)*frac)) {
sum += 1
}
}
}
END { print sum }' "$TRF_OUTPUT"
)
if (( N_MATCHING_TRS > 0 ))
then
((TR_CNT++))
echo "$region_name" >> "${REGION_SAVE_PREFIX}_positives.txt"
else
echo "$region_name" >> "${REGION_SAVE_PREFIX}_negatives.txt"
fi
((WORKED_REGIONS++))
((WORKED_REGIONS % 100 == 0)) && echo "$WORKED_REGIONS regions done. $TR_CNT of them contain tandem repeats."
done
echo "Tandem repeats found in $TR_CNT of $WORKED_REGIONS analyzable conflicts."
rm "$TMP_HG38_FILE" "$TRF_OUTPUT"