# NOTE(review): removed GitHub web-page scrape residue ("Skip to content",
# branch-switcher text, line-count banner) — site chrome, not part of the script.
#!/bin/bash
#
# admire - 450k methylation array analysis pipeline.
# author: Jens Preußner<jens.preussner@mpi-bn.mpg.de>
# version: 1.0.0
#
VERSION="1.0.0"
# Remember the default field separator so it can be restored after the
# temporary switches to ',' and '\n' further below.
GLOBIFS=$IFS
# Directory this script lives in (used to locate the bundled R scripts and data).
DIR=$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )
WORKDIR=$(pwd)
# External tools checked at startup.
# NOTE(review): step 8 also invokes python, which is not listed here — confirm.
DEPENDENCIES="Rscript bedtools comb-p"
#----------------------------------------------------
#
# PART 1: READ PARAMETERS AND VALIDATE THEM
#
#----------------------------------------------------
# tab-separated sample definition file
S=design.txt
# comma-separated Illumina sample sheet (NOTE(review): original comment said
# "tab-separated", but -c is documented and parsed as comma-separated)
C=SampleSheet.csv
# Q-value FDR cutoff
Q=0.05
# Regions in bed format
R=()
# Detection p-value
P=0.01
# Exclude probes failing the detection p-value in more than this fraction of samples
T=0.5
# Fix outliers
F=TRUE
# REMOVE BAD SAMPLES
M=FALSE
# Median-signal cutoff below which a sample is labelled "bad" (quantile normalization)
L=10.5
# Normalization method to use
N=fn
# Normalization options
D=TRUE
B=TRUE
# Quality control output
E=FALSE
# Compressed input
Z=compressed.gz
# Compressed output
O=output.tar.gz
# NUMBER OF ADDITIONAL IMAGES
I=25
# GENE SETS for GSEA
G=()
# 1-based columns of the single-CpG BED tables holding the p-values comb-p combines
PVAL_COLS="7 8"
# comb-p region_p step size (bp)
STEPSIZE=100
# Set to true when -c is given: the compressed idat archive (-z) then becomes mandatory
CHECK_Z=false
# Parse command-line options (see -h for the meaning of each flag).
# NOTE(review): the original optstring declared "l" as a plain switch and
# "m:" as taking an argument, but the handlers (and the help text) do the
# opposite: -l reads $OPTARG and -m is a switch. Corrected to "ml:".
while getopts ":hvbdfml:q:s:c:r:p:t:n:z:o:e:i:g:" opt; do
case $opt in
s) S=$OPTARG
;;
c) C=$OPTARG
CHECK_Z=true
;;
q) Q=$OPTARG
;;
h) cat << EOL
admire - methylation analysis ($VERSION)
Usage: admire [options]
Available options:
-c Comma separated sample definition file (SampleSheet.csv)
-s Tab separated sample definition file (design.txt)
-z Compressed input of idat files (requires -c).
-e Create quality control report in PDF
-r Region file in bed format (regions.bed)
Use multiple -r parameters to calculate for multiple region files
-p Detection p-value to exclude probes prior to analysis (0.01)
-t Exclude probes where more than t% samples failed according to the detection p-value. (0.5)
-n Normalization method (fn|swan|noob|illumina|raw|quantile)
-b In case of functional normalization, skip noob background correction step
-d In case of noob or functional normalization, skip dye correction step
-f In case of quantile normalization, skip fixing outliers prior to analysis
-l In case of quantile normalization, label samples as bad if their median signals are below a given value (10.5)
-m In case of quantile normalization, remove bad samples
-q Q-value cutoff for multiple testing correction (0.05)
-i Render advanced plots for the best i regions (25)
-g Gene set file for enrichment analysis
Use multiple -g parameters to calculate enrichment across many gene sets
-o tar-gz compress output into file given
-h shows this help message.
-v shows version information.
Options -c and -s are mutually exclusive.
Dependencies: Rscript, bedtools, comb-p, python
EOL
exit 1
;;
r) R+=("$OPTARG")
;;
g) G+=("$OPTARG")
;;
n)
if [[ "$OPTARG" == "fn" || "$OPTARG" == "noob" || "$OPTARG" == "illumina" || "$OPTARG" == "raw" || "$OPTARG" == "swan" || "$OPTARG" == "quantile" ]]; then
N=$OPTARG
else
echo "Error: Unknown normalization method $OPTARG. Please choose from fn, swan, noob, illumina, raw or quantile."
exit 1
fi
;;
p) P=$OPTARG
;;
t) T=$OPTARG
;;
b) B=FALSE
;;
d) D=FALSE
;;
f) F=FALSE
;;
m) M=TRUE
;;
l) L=$OPTARG
;;
# NOTE(review): -v previously printed the version and then ran the whole
# pipeline anyway; exiting here is the conventional behavior.
v) echo "$VERSION"
exit 0
;;
z) Z=$OPTARG
;;
i) I=$OPTARG
;;
o) O=$OPTARG
;;
e) E=$OPTARG
;;
\?) echo "Invalid option -$OPTARG" >&2
;;
:) echo "Option -$OPTARG requires an argument." >&2
exit 1
;;
esac
done
shift $(( OPTIND - 1 ))
echo "Step 1 of 9: Evaluating provided files..."
# Abort early unless every required external tool is available on PATH.
for tool in $DEPENDENCIES; do
if ! command -v $tool >/dev/null 2>&1; then
echo "Dependency error: $tool required, but it's not installed. Use admire -h to show help."
exit 1
fi
done
# At least one of the two sample definition files must exist.
if [[ ! -f $S ]] && [[ ! -f $C ]]; then
echo "Sample definition file $S is not a valid file. Use admire -h to show help."
exit 1
fi
# When -c was given, the compressed idat archive is mandatory.
if [[ "$CHECK_Z" == true ]]; then
[[ -f $Z ]] || { echo "Mandatory file missing: -z $Z (as in use with -c $C). Use admire -h to show help."; exit 1; }
# Later steps expect the sheet under its canonical name.
[[ "$C" == "SampleSheet.csv" ]] || cp $C SampleSheet.csv
fi
# Every -r region file must exist.
for region in "${R[@]}"; do
[[ -f $region ]] || { echo "Region file $region is not a valid file. Use admire -h to show help."; exit 1; }
done
#----------------------------------------------------
#
# PART 2: PROVIDE NECESSARY ENVIRONMENT
#
#----------------------------------------------------
echo "Step 2 of 9: Providing neccessary environment..."
# Unpack the compressed idat archive (only relevant for -c/-z usage).
if [[ -f $Z && -f $C ]]; then
CSVUSE=true
case $Z in
*.zip) unzip $Z -d $WORKDIR >/dev/null 2>&1 ;;
*.tar.gz|*.tgz) tar -zxvf $Z -C $WORKDIR >/dev/null 2>&1 ;;
*.tar.bz2|*.tbz2|*.tar.bzip2) tar -jxvf $Z -C $WORKDIR >/dev/null 2>&1 ;;
esac
fi
# SANITY CHECK FOR sample definition FILE
experimental_groups=()
if [[ "$CSVUSE" == true ]]; then
IFS=,
while read name well plate grp pool id position _
do
# Check for a possible header - skip this line
[[ "$name" == "Sample_Name" ]] && continue
# idat files may live in a per-chip subdirectory or flat in the workdir.
if [[ ! -f ${id}/${id}_${position}_Grn.idat || ! -f ${id}/${id}_${position}_Red.idat ]]; then
if [[ ! -f ${id}_${position}_Grn.idat || ! -f ${id}_${position}_Red.idat ]]; then
echo "One or more files are missing: ${id}/${id}_${position}_Grn.idat or ${id}/${id}_${position}_Red.idat"
exit 1
fi
fi
experimental_groups+=("$grp")
done < $C
SKIP_DESIGN=true
IFS=$GLOBIFS
else
declare -A design
declare -A group
declare -A color
# NOTE(review): the loop previously read into scalars named "group" and
# "color", which silently wrote into slot [0] of the identically named
# associative arrays. The read variables are renamed grp/chan to avoid
# clobbering them; behavior for all real keys is unchanged.
while read id file chan grp
do
# Check for a possible header - skip this line
[[ "$file" == "file" ]] && continue
# If the specified file exists, take it into account
if [[ -f $file ]]; then
# Fill arrays keyed by "<sample id>-<channel>" for later lookup:
# filename, sample group, and channel color.
design+=( ["$id-$chan"]="$file" )
group+=( ["$id-$chan"]="$grp" )
color+=( ["$id-$chan"]="$chan" )
experimental_groups+=("$grp")
else
echo "File $file from $S does not exist."
exit 1
fi
done < $S
fi
# Build an Illumina-style SampleSheet.csv from the tab-separated design file
# (skipped when -c already supplied one).
if [[ ! $SKIP_DESIGN ]]; then
slidecount=1
samplecount=1
# Sort the sample keys so both channel files of a sample land on the same slide.
indices=( ${!design[@]} )
IFS=$'\n'
sorted_samples=( $(printf '%s\n' "${indices[@]}" | sort) )
IFS=$GLOBIFS
# Start CSV file
cat > SampleSheet.csv_ << EOL
Sample_Name,Sample_Well,Sample_Plate,Sample_Group,Pool_ID,Sentrix_ID,Sentrix_Position
EOL
# Each sample is linked to its original file, in the correct slide folder.
# NOTE(review): removed the dead assignment 'fn=$(basename $file})', which
# contained a stray '}' and was never used; also renamed the loop scalar
# 'color' to 'chan' so it no longer clobbers the associative array 'color'.
for sample in "${sorted_samples[@]}"; do
mkdir -p slide$slidecount
file=${design["$sample"]}
chan=${color["$sample"]}
if [[ "$file" = /* ]]; then
ln -f -s $file slide${slidecount}/slide${slidecount}_${sample%%-*}_$chan.idat
else
ln -f -s ../$file slide${slidecount}/slide${slidecount}_${sample%%-*}_$chan.idat
fi
cat >> SampleSheet.csv_ << EOL
${sample%%-*},,,${group["$sample"]},,slide${slidecount},${sample%%-*}
EOL
# Start a new slide folder after every 24 linked channel files.
if (( samplecount % 24 == 0 )); then
(( slidecount += 1 ))
fi
(( samplecount += 1 ))
done
# Both channels of a sample produce identical adjacent rows; keep one each.
uniq SampleSheet.csv_ > SampleSheet.csv
rm SampleSheet.csv_
fi
# Make a unique set of experimental groups
groups=$(tr ' ' '\n' <<< "${experimental_groups[@]}" | sort -u | tr '\n' ' ')
groups=($groups)
if [[ ${#groups[@]} -lt 2 ]]; then
echo "Less than two distinct sample groups found. Please provide a sample definition file with two or more sample groups."
exit 1
fi
# Write a proper config.R file for the pipeline to run
cat > config.R << EOL
source("$DIR/.Rprofile")
# GLOBAL DIRECTORIES
base = file.path("./")
results = file.path("results/")
# WORKING ENVIRONMENT BASICS
sampleInfo = read.450k.sheet(base = base, pattern="SampleSheet.csv")
# PARAMETERS
normalization="$N"
bg_noob=$B
dye_corr=$D
qc_file="$E"
det_p=$P
sample_threshold=$T
fixOutliers=$F
removeBadSamples=$M
badSampleCutoff=$L
# SAMPLE GROUPING
samples = list(
EOL
# One R list entry per group, comma-separated.
groupcount=0
for g in ${groups[@]}; do
if [[ "$groupcount" -gt 0 ]]; then
echo "," >> config.R
fi
echo "${g} = which(sampleInfo\$Sample_Group == \"${g}\")" >> config.R
(( groupcount += 1 ))
done
# Number of pairwise group comparisons: n*(n-1)/2.
# NOTE(review): computed with shell arithmetic; the original shelled out to
# bc, which was never listed in DEPENDENCIES.
pairwises=$(( groupcount * (groupcount - 1) / 2 ))
cat >> config.R << EOL
)
analysis = vector(mode = "list", length = $pairwises)
EOL
# Enumerate every unordered pair of groups.
# NOTE(review): the original built pairs by splitting two-character strings
# out of a brace expansion, which breaks with 10 or more groups; plain
# nested index loops produce the same pairs in the same order and are safe
# for any group count.
i=1
for (( x = 0; x < ${#groups[@]}; x++ )); do
for (( y = x + 1; y < ${#groups[@]}; y++ )); do
echo "analysis[[$i]] = c(\"${groups[$x]}\", \"${groups[$y]}\")" >> config.R
(( i += 1 ))
done
done
#----------------------------------------------------
#
# PART 3: NORMALIZE AND ANALYSE SINGLE PROBES
#
#----------------------------------------------------
echo "Step 3 of 9: Normalizing and analysing single probes..."
RESULTS=results
# Normalization is expensive; reuse existing results for this method when present.
[[ -d $RESULTS/$N ]] || {
cp $DIR/.Rprofile .Rprofile
Rscript $DIR/normalizeAndCreateTables.R >/dev/null 2>&1
mkdir -p $RESULTS
Rscript $DIR/singleCpGAnalysis.R >/dev/null 2>&1
}
#----------------------------------------------------
#
# PART 4: COMBINE INTO REGIONS
#
#----------------------------------------------------
echo "Step 4 of 9: Combining probes into regions..."
COMBP=comb-p
mkdir -p $RESULTS/$N/$COMBP
for reg in "${R[@]}"; do
r=$(basename $reg .bed)
# Results for this region file already exist from a previous run.
if [[ -d $RESULTS/$N/$r ]]; then
continue
fi
mkdir -p $RESULTS/$N/$r
for b in $RESULTS/$N/*.bed; do
p=$(basename $b .bed)
# Sort each per-probe table once (stripping CRs from Windows line endings).
if [[ ! -f $RESULTS/$N/$p.sorted.bed ]]; then
tr -d '\r' < $b | bedtools sort -i - > $RESULTS/$N/$p.sorted.bed
fi
# Keep only regions that contain at least one probe.
bedtools intersect -wa -a $reg -b $RESULTS/$N/$p.sorted.bed | bedtools sort -i - | uniq > $RESULTS/$N/$r/$r-limited.bed
# One comb-p run per p-value column to combine per-probe p-values per region.
for c in $PVAL_COLS; do
comb-p region_p -c $c -s $STEPSIZE -p $RESULTS/$N/$p.sorted.bed -r $RESULTS/$N/$r/$r-limited.bed 2> $RESULTS/$N/$COMBP/$r-$p-$c-pvals.error.bed | bedtools sort -i - > $RESULTS/$N/$COMBP/$r-$p-$c-pvals.bed
done
# Join the two comb-p outputs on the region name (column 4) and reorder to
# BED with both p/q-value pairs in columns 7-10.
join -t $'\t' -j 4 $RESULTS/$N/$COMBP/$r-$p-*-pvals.bed | awk 'BEGIN{OFS="\t"}{print $2,$3,$4,$1,".",$6,$7,$8,$14,$15}' > $RESULTS/$N/$r/$p.bed
# Keep regions passing the FDR cutoff for either q-value column.
awk -v fdr="$Q" 'BEGIN{OFS="\t"}{if($8 <= fdr || $10 <= fdr){print $0}}' $RESULTS/$N/$r/$p.bed > $RESULTS/$N/$r/$p.passed_fdr_$Q.bed
done
done
#----------------------------------------------------
#
# PART 5: CREATE VISUALIZATIONS
#
#----------------------------------------------------
echo "Step 5 of 9: Creating visualizations..."
VISUALIZATION=visualization/data-tracks
# Pairwise iteration trick: 'for a' snapshots the positional parameters (the
# group names) once, while the shift shrinks "$@" so the inner 'for b' only
# sees groups coming after a — each unordered pair (a,b) is visited exactly once.
set -- ${groups[@]}
for a; do
shift
for b; do
g=$a-$b
mkdir -p $VISUALIZATION/$g/
# Rebuild the per-comparison target list from scratch.
rm -rf $VISUALIZATION/$g/$g.prelim
touch $VISUALIZATION/$g/$g.prelim
for reg in "${R[@]}"; do
r=`basename $reg .bed`
# Collect manifest entries (probes) overlapping the significant regions.
bedtools intersect -wa -a $DIR/../data/ilmn12.hg19.bed -b $RESULTS/$N/$r/$g.pvals.passed_fdr_$Q.bed >> $VISUALIZATION/$g/$g.prelim
# Emit an IGV track of significant regions: columns 8/10 are the q-values
# of the two comparison directions; the smaller one decides which group
# the region is "higher in".
awk -v g="$g" -v n="$r" -v a="$a" -v b="$b" 'BEGIN{print "#track gffTags=on name=\"",n," (",g,")\" color=\"54,144,192\""}{if($8 < $10){higherin=a;pval=$7;qval=$8}else{higherin=b;pval=$9;qval=$10}; print $1,$2,$3,$4";higher-in="higherin";p-value="pval";q-value="qval";chromosome="$1";start="$2";end="$3,".",".";}' $RESULTS/$N/$r/$g.pvals.passed_fdr_$Q.bed > $VISUALIZATION/$g/significant.$r.igv.bed
done
# De-duplicate the collected probes into the final target list.
bedtools sort -i $VISUALIZATION/$g/$g.prelim | uniq > $VISUALIZATION/$g/$g.targets
rm $VISUALIZATION/$g/$g.prelim
done
done
# Presumably consumes the .targets files written above — confirm against the R script.
Rscript $DIR/assembleMvalues.R >/dev/null 2>&1
# Static annotation tracks: the 450k manifest plus one IGV track per -r file.
mkdir -p $VISUALIZATION/../annotation-tracks
cp $DIR/../data/ilmn12.hg19.sorted.igv.bed $VISUALIZATION/../annotation-tracks
for reg in "${R[@]}"; do
r=`basename $reg .bed`
bedtools sort -i $reg | awk -v r="$r" 'BEGIN{print "#track gffTags=on name=\"",r,"\" color=\"54,144,192\""}{print $0}' > $VISUALIZATION/../annotation-tracks/$r.igv.bed
done
#----------------------------------------------------
#
# PART 6: CREATE TABLES FOR EXCEL
#
#----------------------------------------------------
echo "Step 6 of 9: Providing tables for Excel..."
EXCEL=excel
mkdir -p $EXCEL
# Pairwise iteration trick: 'for a' snapshots the group names once; the shift
# makes the inner 'for b' see only later groups, so each pair is handled once.
set -- ${groups[@]}
# Manifest of per-region plot inputs, consumed by the image rendering in part 7.
cat > additional_images.txt << EOL
file group region chr
EOL
for a; do
shift
for b; do
g=$a-$b
for reg in "${R[@]}"; do
r=`basename $reg .bed`
# Annotate each significant region with min/median/max/count of column 5
# of the per-probe table (presumably a per-probe effect size — confirm).
bedtools map -c 5 -o min,median,max,count -a $RESULTS/$N/$r/$g.pvals.passed_fdr_$Q.bed -b $RESULTS/$N/$g.pvals.sorted.bed > $RESULTS/$N/$r/$g.pvals.passed_fdr_$Q.abs.bed
# Convert to CSV: parse the GFF-style attributes in column 4 (ID, gene_id,
# gene_type, gene_name), decide "higher in" by the smaller q-value
# (column 8 vs 10), and emit one spreadsheet row per region.
awk -v g="$g" 'BEGIN{FS="\t";OFS=",";split(g,c,"-");print "Feature","Gene","GeneID","Type","Chr","Start","Stop","Strand","P-val "c[1],"Q-val "c[1],"P-val "c[2],"Q-val "c[2],"Higher in","with Q-val","Coverage","Min.abs.diff","Median.abs.diff","Max.abs.diff";}
{n = split($4,a,";");for(i=1; i<=n; i++){
split(a[i],b,"=");
if(b[1] == "ID"){ id = b[2];}
if(b[1] == "gene_id"){ gid = b[2];}
if(b[1] == "gene_type"){ type = b[2];}
if(b[1] == "gene_name"){ name = b[2];}
};
split(g,c,"-");
if($8 < $10){higherin=c[1];pval=$7;qval=$8}else{higherin=c[2];pval=$9;qval=$10};
print id,name,gid,type,$1,$2,$3,$6,$7,$8,$9,$10,higherin,qval,$14,$11,$12,$13;}' $RESULTS/$N/$r/$g.pvals.passed_fdr_$Q.abs.bed > $EXCEL/$g-$r.csv
# Begin preparation of additional image rendering
# Beta values sorted by probe id, built once and reused across comparisons.
# NOTE(review): the filename hardcodes "fn" instead of using $N — looks
# wrong for other normalization methods; confirm.
if [[ ! -s $WORKDIR/normalized/betaValues_fn.sorted.txt ]]; then
(head -n 1 $WORKDIR/normalized/betaValues_fn.txt && tail -n +2 $WORKDIR/normalized/betaValues_fn.txt | sort) > $WORKDIR/normalized/betaValues_fn.sorted.txt
fi
# Pick the $I best regions by "with Q-val" (CSV column 14, ascending) as plot
# targets; cut keeps Chr,Start,Stop and tail drops the header row.
sort -t',' -k14,14n $EXCEL/$g-$r.csv | cut -d',' -f5,6,7 | sed -e 's/,/\t/g' | tail -n +2 | head -n $I > $VISUALIZATION/$g/$r-target.bed
# One file per target region.
split -l 1 $VISUALIZATION/$g/$r-target.bed $VISUALIZATION/$g/$r-target-
for f in $VISUALIZATION/$g/$r-target-*; do
# Probe ids (from the ID attribute) and positions inside this region.
bedtools intersect -wa -a $DIR/../data/ilmn12.hg19.bed -b $f | awk 'BEGIN{FS="\t";OFS="\t";}{split($4,a,";");split(a[1],b,"=");print b[2],$2}' | sort > $f.probes
# Beta values of those probes, pasted next to their positions.
cut -f1 $f.probes | grep -f - $WORKDIR/normalized/betaValues_fn.sorted.txt > $f.grep
cut -f2 $f.probes | paste - $f.grep > $f.txt
# Chromosome of this (single-line) target region.
chr=$(cut -f1 $f | sort | uniq | sed -e 's/\n//g')
cat >> additional_images.txt << EOL
$f $g $r $chr
EOL
rm -rf $f $f.grep $f.probes
done
rm -rf $VISUALIZATION/$g/$r-target.bed
done
done
done
# The sorted per-probe tables were only needed for parts 4-6.
rm -rf $RESULTS/$N/*.sorted.bed
#----------------------------------------------------
#
# PART 7: CREATE ADDITIONAL IMAGES
#
#----------------------------------------------------
echo "Step 7 of 9: Rendering additional images..."
# NOTE(review): -f added so a re-run does not fail when the link already
# exists (the pipeline supports resuming, see the results check in part 3).
ln -s -f $DIR/meth_plot.R meth_plot.R
Rscript $DIR/renderAdditionalImages.R >/dev/null 2>&1
#----------------------------------------------------
#
# PART 8: GENE SET ENRICHMENT ANALYSIS
#
#----------------------------------------------------
echo "Step 8 of 9: Calculating enrichment across gene sets..."
GSEA=geneset_enrichment
mkdir -p $GSEA
# Serialize all -g gene set files into one JSON-like dict:
# {"set name": ["gene1","gene2",...], ...}. Input is tab-separated with the
# set name in column 1; column 2 is skipped (presumably a description, as in
# GMT format — confirm) and genes start at column 3.
cat ${G[@]} | awk 'BEGIN{FS="\t";OFS="";print "{"}{name=$1;genes="";for(i=3;i<=NF;i++){if(i<NF){genes=genes"\""$(i)"\","}else{genes=genes"\""$(i)"\""}};if(NR!=1){print ",\""name"\" : ["genes"]"}else{print "\""name"\" : ["genes"]"}}END{print "}"}' | tr -d '\n' > $GSEA/genesets.dict
# Pairwise iteration trick: 'for a' snapshots the group names once; the shift
# makes the inner 'for b' see only later groups, so each pair is handled once.
set -- ${groups[@]}
for a; do
shift
for b; do
g=$a-$b
for reg in "${R[@]}"; do
r=`basename $reg .bed`
# build ranked list(s)
# Gene (CSV column 2) ranked by Median.abs.diff (column 17), one list per group.
# NOTE(review): grep "$a" matches the group name anywhere in the CSV row,
# not just the "Higher in" column — may over-select rows; confirm.
tail -n +2 $EXCEL/$g-$r.csv | grep "$a" | cut -f 2,17 -d "," | grep -v '^,' | tr ',' '\t' | sort -k2,2r | uniq > $GSEA/$g-$r-$a.ranks
tail -n +2 $EXCEL/$g-$r.csv | grep "$b" | cut -f 2,17 -d "," | grep -v '^,' | tr ',' '\t' | sort -k2,2r | uniq > $GSEA/$g-$r-$b.ranks
# Run the enrichment only when at least one ranked list is non-empty.
if [[ -s $GSEA/$g-$r-$a.ranks || -s $GSEA/$g-$r-$b.ranks ]]; then
python $DIR/gsea.py $GSEA/genesets.dict $GSEA/$g-$r-$a.ranks $GSEA/$g-$r-$a
python $DIR/gsea.py $GSEA/genesets.dict $GSEA/$g-$r-$b.ranks $GSEA/$g-$r-$b
fi
rm -rf $GSEA/$g-$r-$a.ranks $GSEA/$g-$r-$b.ranks
done
done
done
#----------------------------------------------------
#
# PART 9: WRAP-UP, PROVIDE RESULTS
#
#----------------------------------------------------
echo "Step 9 of 9: Wrapping up results..."
# Bundle the deliverables (tracks and spreadsheets) into the requested archive.
tar -z -c -v -f $O visualization excel >/dev/null 2>&1
echo "Done. Your results are compressed into $O."