# NOTE(review): removed GitHub web-page scrape residue ("Skip to content",
# branch-switcher text, line-count banner) — site chrome, not part of the script.
#!/bin/bash
#
# admire - 450k methylation array analysis pipeline.
# author: Jens Preußner<jens.preussner@mpi-bn.mpg.de>
# version: 1.0.0
#
VERSION="1.0.0"
# Remember the default field separator so it can be restored after the
# temporary switches to ',' and '\n' further below.
GLOBIFS=$IFS
# Directory this script lives in (used to locate the bundled R scripts and data).
DIR=$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )
WORKDIR=$(pwd)
# External tools checked at startup.
# NOTE(review): step 8 also invokes python, which is not listed here — confirm.
DEPENDENCIES="Rscript bedtools comb-p"
#----------------------------------------------------
#
# PART 1: READ PARAMETERS AND VALIDATE THEM
#
#----------------------------------------------------
# tab-separated sample definition file
S=design.txt
# comma-separated Illumina sample sheet (NOTE(review): original comment said
# "tab-separated", but -c is documented and parsed as comma-separated)
C=SampleSheet.csv
# Q-value FDR cutoff
Q=0.05
# Regions in bed format
R=()
# Detection p-value
P=0.01
# Exclude probes failing the detection p-value in more than this fraction of samples
T=0.5
# Fix outliers
F=TRUE
# REMOVE BAD SAMPLES
M=FALSE
# Median-signal cutoff below which a sample is labelled "bad" (quantile normalization)
L=10.5
# Normalization method to use
N=fn
# Normalization options
D=TRUE
B=TRUE
# Quality control output
E=FALSE
# Compressed input
Z=compressed.gz
# Compressed output
O=output.tar.gz
# NUMBER OF ADDITIONAL IMAGES
I=25
# GENE SETS for GSEA
G=()
# 1-based columns of the single-CpG BED tables holding the p-values comb-p combines
PVAL_COLS="7 8"
# comb-p region_p step size (bp)
STEPSIZE=100
# Set to true when -c is given: the compressed idat archive (-z) then becomes mandatory
CHECK_Z=false
# Parse command-line options (see -h for the meaning of each flag).
# NOTE(review): the original optstring declared "l" as a plain switch and
# "m:" as taking an argument, but the handlers (and the help text) do the
# opposite: -l reads $OPTARG and -m is a switch. Corrected to "ml:".
while getopts ":hvbdfml:q:s:c:r:p:t:n:z:o:e:i:g:" opt; do
case $opt in
s) S=$OPTARG
;;
c) C=$OPTARG
CHECK_Z=true
;;
q) Q=$OPTARG
;;
h) cat << EOL
admire - methylation analysis ($VERSION)
Usage: admire [options]
Available options:
-c Comma separated sample definition file (SampleSheet.csv)
-s Tab separated sample definition file (design.txt)
-z Compressed input of idat files (requires -c).
-e Create quality control report in PDF
-r Region file in bed format (regions.bed)
Use multiple -r parameters to calculate for multiple region files
-p Detection p-value to exclude probes prior to analysis (0.01)
-t Exclude probes where more than t% samples failed according to the detection p-value. (0.5)
-n Normalization method (fn|swan|noob|illumina|raw|quantile)
-b In case of functional normalization, skip noob background correction step
-d In case of noob or functional normalization, skip dye correction step
-f In case of quantile normalization, skip fixing outliers prior to analysis
-l In case of quantile normalization, label samples as bad if their median signals are below a given value (10.5)
-m In case of quantile normalization, remove bad samples
-q Q-value cutoff for multiple testing correction (0.05)
-i Render advanced plots for the best i regions (25)
-g Gene set file for enrichment analysis
Use multiple -g parameters to calculate enrichment across many gene sets
-o tar-gz compress output into file given
-h shows this help message.
-v shows version information.
Options -c and -s are mutually exclusive.
Dependencies: Rscript, bedtools, comb-p, python
EOL
exit 1
;;
r) R+=("$OPTARG")
;;
g) G+=("$OPTARG")
;;
n)
if [[ "$OPTARG" == "fn" || "$OPTARG" == "noob" || "$OPTARG" == "illumina" || "$OPTARG" == "raw" || "$OPTARG" == "swan" || "$OPTARG" == "quantile" ]]; then
N=$OPTARG
else
echo "Error: Unknown normalization method $OPTARG. Please choose from fn, swan, noob, illumina, raw or quantile."
exit 1
fi
;;
p) P=$OPTARG
;;
t) T=$OPTARG
;;
b) B=FALSE
;;
d) D=FALSE
;;
f) F=FALSE
;;
m) M=TRUE
;;
l) L=$OPTARG
;;
# NOTE(review): -v previously printed the version and then ran the whole
# pipeline anyway; exiting here is the conventional behavior.
v) echo "$VERSION"
exit 0
;;
z) Z=$OPTARG
;;
i) I=$OPTARG
;;
o) O=$OPTARG
;;
e) E=$OPTARG
;;
\?) echo "Invalid option -$OPTARG" >&2
;;
:) echo "Option -$OPTARG requires an argument." >&2
exit 1
;;
esac
done
shift $(( OPTIND - 1 ))
echo "Step 1 of 9: Evaluating provided files..."
# Abort early unless every required external tool is available on PATH.
for tool in $DEPENDENCIES; do
if ! command -v $tool >/dev/null 2>&1; then
echo "Dependency error: $tool required, but it's not installed. Use admire -h to show help."
exit 1
fi
done
# At least one of the two sample definition files must exist.
if [[ ! -f $S ]] && [[ ! -f $C ]]; then
echo "Sample definition file $S is not a valid file. Use admire -h to show help."
exit 1
fi
# When -c was given, the compressed idat archive is mandatory.
if [[ "$CHECK_Z" == true ]]; then
[[ -f $Z ]] || { echo "Mandatory file missing: -z $Z (as in use with -c $C). Use admire -h to show help."; exit 1; }
# Later steps expect the sheet under its canonical name.
[[ "$C" == "SampleSheet.csv" ]] || cp $C SampleSheet.csv
fi
# Every -r region file must exist.
for region in "${R[@]}"; do
[[ -f $region ]] || { echo "Region file $region is not a valid file. Use admire -h to show help."; exit 1; }
done
#----------------------------------------------------
#
# PART 2: PROVIDE NECESSARY ENVIRONMENT
#
#----------------------------------------------------
echo "Step 2 of 9: Providing neccessary environment..."
# Unpack the compressed idat archive (only relevant for -c/-z usage).
if [[ -f $Z && -f $C ]]; then
CSVUSE=true
case $Z in
*.zip) unzip $Z -d $WORKDIR >/dev/null 2>&1 ;;
*.tar.gz|*.tgz) tar -zxvf $Z -C $WORKDIR >/dev/null 2>&1 ;;
*.tar.bz2|*.tbz2|*.tar.bzip2) tar -jxvf $Z -C $WORKDIR >/dev/null 2>&1 ;;
esac
fi
# SANITY CHECK FOR sample definition FILE
experimental_groups=()
if [[ "$CSVUSE" == true ]]; then
IFS=,
while read name well plate grp pool id position _
do
# Check for a possible header - skip this line
[[ "$name" == "Sample_Name" ]] && continue
# idat files may live in a per-chip subdirectory or flat in the workdir.
if [[ ! -f ${id}/${id}_${position}_Grn.idat || ! -f ${id}/${id}_${position}_Red.idat ]]; then
if [[ ! -f ${id}_${position}_Grn.idat || ! -f ${id}_${position}_Red.idat ]]; then
echo "One or more files are missing: ${id}/${id}_${position}_Grn.idat or ${id}/${id}_${position}_Red.idat"
exit 1
fi
fi
experimental_groups+=("$grp")
done < $C
SKIP_DESIGN=true
IFS=$GLOBIFS
else
declare -A design
declare -A group
declare -A color
# NOTE(review): the loop previously read into scalars named "group" and
# "color", which silently wrote into slot [0] of the identically named
# associative arrays. The read variables are renamed grp/chan to avoid
# clobbering them; behavior for all real keys is unchanged.
while read id file chan grp
do
# Check for a possible header - skip this line
[[ "$file" == "file" ]] && continue
# If the specified file exists, take it into account
if [[ -f $file ]]; then
# Fill arrays keyed by "<sample id>-<channel>" for later lookup:
# filename, sample group, and channel color.
design+=( ["$id-$chan"]="$file" )
group+=( ["$id-$chan"]="$grp" )
color+=( ["$id-$chan"]="$chan" )
experimental_groups+=("$grp")
else
echo "File $file from $S does not exist."
exit 1
fi
done < $S
fi
# Build an Illumina-style SampleSheet.csv from the tab-separated design file
# (skipped when -c already supplied one).
if [[ ! $SKIP_DESIGN ]]; then
slidecount=1
samplecount=1
# Sort the sample keys so both channel files of a sample land on the same slide.
indices=( ${!design[@]} )
IFS=$'\n'
sorted_samples=( $(printf '%s\n' "${indices[@]}" | sort) )
IFS=$GLOBIFS
# Start CSV file
cat > SampleSheet.csv_ << EOL
Sample_Name,Sample_Well,Sample_Plate,Sample_Group,Pool_ID,Sentrix_ID,Sentrix_Position
EOL
# Each sample is linked to its original file, in the correct slide folder.
# NOTE(review): removed the dead assignment 'fn=$(basename $file})', which
# contained a stray '}' and was never used; also renamed the loop scalar
# 'color' to 'chan' so it no longer clobbers the associative array 'color'.
for sample in "${sorted_samples[@]}"; do
mkdir -p slide$slidecount
file=${design["$sample"]}
chan=${color["$sample"]}
if [[ "$file" = /* ]]; then
ln -f -s $file slide${slidecount}/slide${slidecount}_${sample%%-*}_$chan.idat
else
ln -f -s ../$file slide${slidecount}/slide${slidecount}_${sample%%-*}_$chan.idat
fi
cat >> SampleSheet.csv_ << EOL
${sample%%-*},,,${group["$sample"]},,slide${slidecount},${sample%%-*}
EOL
# Start a new slide folder after every 24 linked channel files.
if (( samplecount % 24 == 0 )); then
(( slidecount += 1 ))
fi
(( samplecount += 1 ))
done
# Both channels of a sample produce identical adjacent rows; keep one each.
uniq SampleSheet.csv_ > SampleSheet.csv
rm SampleSheet.csv_
fi
# Make a unique set of experimental groups
groups=$(tr ' ' '\n' <<< "${experimental_groups[@]}" | sort -u | tr '\n' ' ')
groups=($groups)
if [[ ${#groups[@]} -lt 2 ]]; then
echo "Less than two distinct sample groups found. Please provide a sample definition file with two or more sample groups."
exit 1
fi
# Write a proper config.R file for the pipeline to run
cat > config.R << EOL
source("$DIR/.Rprofile")
# GLOBAL DIRECTORIES
base = file.path("./")
results = file.path("results/")
# WORKING ENVIRONMENT BASICS
sampleInfo = read.450k.sheet(base = base, pattern="SampleSheet.csv")
# PARAMETERS
normalization="$N"
bg_noob=$B
dye_corr=$D
qc_file="$E"
det_p=$P
sample_threshold=$T
fixOutliers=$F
removeBadSamples=$M
badSampleCutoff=$L
# SAMPLE GROUPING
samples = list(
EOL
# One R list entry per group, comma-separated.
groupcount=0
for g in ${groups[@]}; do
if [[ "$groupcount" -gt 0 ]]; then
echo "," >> config.R
fi
echo "${g} = which(sampleInfo\$Sample_Group == \"${g}\")" >> config.R
(( groupcount += 1 ))
done
# Number of pairwise group comparisons: n*(n-1)/2.
# NOTE(review): computed with shell arithmetic; the original shelled out to
# bc, which was never listed in DEPENDENCIES.
pairwises=$(( groupcount * (groupcount - 1) / 2 ))
cat >> config.R << EOL
)
analysis = vector(mode = "list", length = $pairwises)
EOL
# Enumerate every unordered pair of groups.
# NOTE(review): the original built pairs by splitting two-character strings
# out of a brace expansion, which breaks with 10 or more groups; plain
# nested index loops produce the same pairs in the same order and are safe
# for any group count.
i=1
for (( x = 0; x < ${#groups[@]}; x++ )); do
for (( y = x + 1; y < ${#groups[@]}; y++ )); do
echo "analysis[[$i]] = c(\"${groups[$x]}\", \"${groups[$y]}\")" >> config.R
(( i += 1 ))
done
done
#----------------------------------------------------
#
# PART 3: NORMALIZE AND ANALYSE SINGLE PROBES
#
#----------------------------------------------------
echo "Step 3 of 9: Normalizing and analysing single probes..."
RESULTS=results
# Normalization is expensive; reuse existing results for this method when present.
[[ -d $RESULTS/$N ]] || {
cp $DIR/.Rprofile .Rprofile
Rscript $DIR/normalizeAndCreateTables.R >/dev/null 2>&1
mkdir -p $RESULTS
Rscript $DIR/singleCpGAnalysis.R >/dev/null 2>&1
}
#----------------------------------------------------
#
# PART 4: COMBINE INTO REGIONS
#
#----------------------------------------------------
echo "Step 4 of 9: Combining probes into regions..."
COMBP=comb-p
mkdir -p $RESULTS/$N/$COMBP
for reg in "${R[@]}"; do
r=$(basename $reg .bed)
# Results for this region file already exist from a previous run.
if [[ -d $RESULTS/$N/$r ]]; then
continue
fi
mkdir -p $RESULTS/$N/$r
for b in $RESULTS/$N/*.bed; do
p=$(basename $b .bed)
# Sort each per-probe table once (stripping CRs from Windows line endings).
if [[ ! -f $RESULTS/$N/$p.sorted.bed ]]; then
tr -d '\r' < $b | bedtools sort -i - > $RESULTS/$N/$p.sorted.bed
fi
# Keep only regions that contain at least one probe.
bedtools intersect -wa -a $reg -b $RESULTS/$N/$p.sorted.bed | bedtools sort -i - | uniq > $RESULTS/$N/$r/$r-limited.bed
# One comb-p run per p-value column to combine per-probe p-values per region.
for c in $PVAL_COLS; do
comb-p region_p -c $c -s $STEPSIZE -p $RESULTS/$N/$p.sorted.bed -r $RESULTS/$N/$r/$r-limited.bed 2> $RESULTS/$N/$COMBP/$r-$p-$c-pvals.error.bed | bedtools sort -i - > $RESULTS/$N/$COMBP/$r-$p-$c-pvals.bed
done
# Join the two comb-p outputs on the region name (column 4) and reorder to
# BED with both p/q-value pairs in columns 7-10.
join -t $'\t' -j 4 $RESULTS/$N/$COMBP/$r-$p-*-pvals.bed | awk 'BEGIN{OFS="\t"}{print $2,$3,$4,$1,".",$6,$7,$8,$14,$15}' > $RESULTS/$N/$r/$p.bed
# Keep regions passing the FDR cutoff for either q-value column.
awk -v fdr="$Q" 'BEGIN{OFS="\t"}{if($8 <= fdr || $10 <= fdr){print $0}}' $RESULTS/$N/$r/$p.bed > $RESULTS/$N/$r/$p.passed_fdr_$Q.bed
done
done
#----------------------------------------------------
#
# PART 5: CREATE VISUALIZATIONS
#
#----------------------------------------------------
echo "Step 5 of 9: Creating visualizations..."
VISUALIZATION=visualization/data-tracks
# Pairwise iteration trick: 'for a' snapshots the positional parameters (the
# group names) once, while the shift shrinks "$@" so the inner 'for b' only
# sees groups coming after a — each unordered pair (a,b) is visited exactly once.
set -- ${groups[@]}
for a; do
shift
for b; do
g=$a-$b
mkdir -p $VISUALIZATION/$g/
# Rebuild the per-comparison target list from scratch.
rm -rf $VISUALIZATION/$g/$g.prelim
touch $VISUALIZATION/$g/$g.prelim
for reg in "${R[@]}"; do
r=`basename $reg .bed`
# Collect manifest entries (probes) overlapping the significant regions.
bedtools intersect -wa -a $DIR/../data/ilmn12.hg19.bed -b $RESULTS/$N/$r/$g.pvals.passed_fdr_$Q.bed >> $VISUALIZATION/$g/$g.prelim
# Emit an IGV track of significant regions: columns 8/10 are the q-values
# of the two comparison directions; the smaller one decides which group
# the region is "higher in".
awk -v g="$g" -v n="$r" -v a="$a" -v b="$b" 'BEGIN{print "#track gffTags=on name=\"",n," (",g,")\" color=\"54,144,192\""}{if($8 < $10){higherin=a;pval=$7;qval=$8}else{higherin=b;pval=$9;qval=$10}; print $1,$2,$3,$4";higher-in="higherin";p-value="pval";q-value="qval";chromosome="$1";start="$2";end="$3,".",".";}' $RESULTS/$N/$r/$g.pvals.passed_fdr_$Q.bed > $VISUALIZATION/$g/significant.$r.igv.bed
done
# De-duplicate the collected probes into the final target list.
bedtools sort -i $VISUALIZATION/$g/$g.prelim | uniq > $VISUALIZATION/$g/$g.targets
rm $VISUALIZATION/$g/$g.prelim
done
done
# Presumably consumes the .targets files written above — confirm against the R script.
Rscript $DIR/assembleMvalues.R >/dev/null 2>&1
# Static annotation tracks: the 450k manifest plus one IGV track per -r file.
mkdir -p $VISUALIZATION/../annotation-tracks
cp $DIR/../data/ilmn12.hg19.sorted.igv.bed $VISUALIZATION/../annotation-tracks
for reg in "${R[@]}"; do
r=`basename $reg .bed`
bedtools sort -i $reg | awk -v r="$r" 'BEGIN{print "#track gffTags=on name=\"",r,"\" color=\"54,144,192\""}{print $0}' > $VISUALIZATION/../annotation-tracks/$r.igv.bed
done
#----------------------------------------------------
#
# PART 6: CREATE TABLES FOR EXCEL
#
#----------------------------------------------------
echo "Step 6 of 9: Providing tables for Excel..."
EXCEL=excel
mkdir -p $EXCEL
# Pairwise iteration trick: 'for a' snapshots the group names once; the shift
# makes the inner 'for b' see only later groups, so each pair is handled once.
set -- ${groups[@]}
# Manifest of per-region plot inputs, consumed by the image rendering in part 7.
cat > additional_images.txt << EOL
file group region chr
EOL
for a; do
shift
for b; do
g=$a-$b
for reg in "${R[@]}"; do
r=`basename $reg .bed`
# Annotate each significant region with min/median/max/count of column 5
# of the per-probe table (presumably a per-probe effect size — confirm).
bedtools map -c 5 -o min,median,max,count -a $RESULTS/$N/$r/$g.pvals.passed_fdr_$Q.bed -b $RESULTS/$N/$g.pvals.sorted.bed > $RESULTS/$N/$r/$g.pvals.passed_fdr_$Q.abs.bed
# Convert to CSV: parse the GFF-style attributes in column 4 (ID, gene_id,
# gene_type, gene_name), decide "higher in" by the smaller q-value
# (column 8 vs 10), and emit one spreadsheet row per region.
awk -v g="$g" 'BEGIN{FS="\t";OFS=",";split(g,c,"-");print "Feature","Gene","GeneID","Type","Chr","Start","Stop","Strand","P-val "c[1],"Q-val "c[1],"P-val "c[2],"Q-val "c[2],"Higher in","with Q-val","Coverage","Min.abs.diff","Median.abs.diff","Max.abs.diff";}
{n = split($4,a,";");for(i=1; i<=n; i++){
split(a[i],b,"=");
if(b[1] == "ID"){ id = b[2];}
if(b[1] == "gene_id"){ gid = b[2];}
if(b[1] == "gene_type"){ type = b[2];}
if(b[1] == "gene_name"){ name = b[2];}
};
split(g,c,"-");
if($8 < $10){higherin=c[1];pval=$7;qval=$8}else{higherin=c[2];pval=$9;qval=$10};
print id,name,gid,type,$1,$2,$3,$6,$7,$8,$9,$10,higherin,qval,$14,$11,$12,$13;}' $RESULTS/$N/$r/$g.pvals.passed_fdr_$Q.abs.bed > $EXCEL/$g-$r.csv
# Begin preparation of additional image rendering
# Beta values sorted by probe id, built once and reused across comparisons.
# NOTE(review): the filename hardcodes "fn" instead of using $N — looks
# wrong for other normalization methods; confirm.
if [[ ! -s $WORKDIR/normalized/betaValues_fn.sorted.txt ]]; then
(head -n 1 $WORKDIR/normalized/betaValues_fn.txt && tail -n +2 $WORKDIR/normalized/betaValues_fn.txt | sort) > $WORKDIR/normalized/betaValues_fn.sorted.txt
fi
# Pick the $I best regions by "with Q-val" (CSV column 14, ascending) as plot
# targets; cut keeps Chr,Start,Stop and tail drops the header row.
sort -t',' -k14,14n $EXCEL/$g-$r.csv | cut -d',' -f5,6,7 | sed -e 's/,/\t/g' | tail -n +2 | head -n $I > $VISUALIZATION/$g/$r-target.bed
# One file per target region.
split -l 1 $VISUALIZATION/$g/$r-target.bed $VISUALIZATION/$g/$r-target-
for f in $VISUALIZATION/$g/$r-target-*; do
# Probe ids (from the ID attribute) and positions inside this region.
bedtools intersect -wa -a $DIR/../data/ilmn12.hg19.bed -b $f | awk 'BEGIN{FS="\t";OFS="\t";}{split($4,a,";");split(a[1],b,"=");print b[2],$2}' | sort > $f.probes
# Beta values of those probes, pasted next to their positions.
cut -f1 $f.probes | grep -f - $WORKDIR/normalized/betaValues_fn.sorted.txt > $f.grep
cut -f2 $f.probes | paste - $f.grep > $f.txt
# Chromosome of this (single-line) target region.
chr=$(cut -f1 $f | sort | uniq | sed -e 's/\n//g')
cat >> additional_images.txt << EOL
$f $g $r $chr
EOL
rm -rf $f $f.grep $f.probes
done
rm -rf $VISUALIZATION/$g/$r-target.bed
done
done
done
# The sorted per-probe tables were only needed for parts 4-6.
rm -rf $RESULTS/$N/*.sorted.bed
#----------------------------------------------------
#
# PART 7: CREATE ADDITIONAL IMAGES
#
#----------------------------------------------------
echo "Step 7 of 9: Rendering additional images..."
# NOTE(review): -f added so a re-run does not fail when the link already
# exists (the pipeline supports resuming, see the results check in part 3).
ln -s -f $DIR/meth_plot.R meth_plot.R
Rscript $DIR/renderAdditionalImages.R >/dev/null 2>&1
#----------------------------------------------------
#
# PART 8: GENE SET ENRICHMENT ANALYSIS
#
#----------------------------------------------------
echo "Step 8 of 9: Calculating enrichment across gene sets..."
GSEA=geneset_enrichment
mkdir -p $GSEA
# Serialize all -g gene set files into one JSON-like dict:
# {"set name": ["gene1","gene2",...], ...}. Input is tab-separated with the
# set name in column 1; column 2 is skipped (presumably a description, as in
# GMT format — confirm) and genes start at column 3.
cat ${G[@]} | awk 'BEGIN{FS="\t";OFS="";print "{"}{name=$1;genes="";for(i=3;i<=NF;i++){if(i<NF){genes=genes"\""$(i)"\","}else{genes=genes"\""$(i)"\""}};if(NR!=1){print ",\""name"\" : ["genes"]"}else{print "\""name"\" : ["genes"]"}}END{print "}"}' | tr -d '\n' > $GSEA/genesets.dict
# Pairwise iteration trick: 'for a' snapshots the group names once; the shift
# makes the inner 'for b' see only later groups, so each pair is handled once.
set -- ${groups[@]}
for a; do
shift
for b; do
g=$a-$b
for reg in "${R[@]}"; do
r=`basename $reg .bed`
# build ranked list(s)
# Gene (CSV column 2) ranked by Median.abs.diff (column 17), one list per group.
# NOTE(review): grep "$a" matches the group name anywhere in the CSV row,
# not just the "Higher in" column — may over-select rows; confirm.
tail -n +2 $EXCEL/$g-$r.csv | grep "$a" | cut -f 2,17 -d "," | grep -v '^,' | tr ',' '\t' | sort -k2,2r | uniq > $GSEA/$g-$r-$a.ranks
tail -n +2 $EXCEL/$g-$r.csv | grep "$b" | cut -f 2,17 -d "," | grep -v '^,' | tr ',' '\t' | sort -k2,2r | uniq > $GSEA/$g-$r-$b.ranks
# Run the enrichment only when at least one ranked list is non-empty.
if [[ -s $GSEA/$g-$r-$a.ranks || -s $GSEA/$g-$r-$b.ranks ]]; then
python $DIR/gsea.py $GSEA/genesets.dict $GSEA/$g-$r-$a.ranks $GSEA/$g-$r-$a
python $DIR/gsea.py $GSEA/genesets.dict $GSEA/$g-$r-$b.ranks $GSEA/$g-$r-$b
fi
rm -rf $GSEA/$g-$r-$a.ranks $GSEA/$g-$r-$b.ranks
done
done
done
#----------------------------------------------------
#
# PART 9: WRAP-UP, PROVIDE RESULTS
#
#----------------------------------------------------
echo "Step 9 of 9: Wrapping up results..."
# Bundle the deliverables (tracks and spreadsheets) into the requested archive.
tar -z -c -v -f $O visualization excel >/dev/null 2>&1
echo "Done. Your results are compressed into $O."