Permalink
Cannot retrieve contributors at this time
Name already in use
A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
August/Adapting_new_script_for_computing_cluster.sh
Go to fileThis commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
executable file
188 lines (140 sloc)
5.84 KB
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
# Performs alignment and filtering of CUT&Tag data.
#
# Run with --submit to enqueue one mxq cluster job per fastq pair; each job
# re-invokes this script with --execute ID F1 F2 (see usage()).

# FIXME: picard.jar should be available from some program directory.
# Downloaded from https://github.com/broadinstitute/picard/releases/tag/2.27.4
PICARD=/project/RNAseq_Mdl_data/Konrad/August/picard.jar

# inputs
fastq_dir=/project/RNAseq_Mdl_data/Konrad/August/fastq   # Fastq directory
ebwt=/project/MDL_ChIPseq/data/genome/bowtie2_index/mm10/mm10 # Built genome directory (bowtie_build2)
blist=/project/RNAseq_Mdl_data/Konrad/mm10.bl.bed        # blacklist regions (BED)

# outputs
work_dir="/project/RNAseq_Mdl_data/Konrad/August" # Output Directory ("peaks" and "bigwig")
log_dir=/project/RNAseq_Mdl_data/Konrad/August/log # cluster job log files

# parameter
THREADS=32
# Build the mxqsub command line as an array so each option stays one word.
MXQSUB=(mxqsub)
MXQSUB+=(--group-name devel)
MXQSUB+=(--runtime 10h)
MXQSUB+=(--processors "$THREADS")
MXQSUB+=(--memory 500G)
MXQSUB+=(--tmpdir 500G)

# Lock file serializing bulk copies to/from project storage across jobs.
LOCK_FILE=/project/RNAseq_Mdl_data/Konrad/donald/.clusterio.lock
# Print the accepted invocations to stderr and abort with status 1.
usage() {
  printf 'usage: %s (--submit | --execute ID) \n' "$0" >&2
  exit 1
}
# Scan $fastq_dir for paired-end fastq files and submit one cluster job per
# pair via mxqsub. Pairs missing their mate are reported (stderr) and skipped.
# Two naming schemes are recognized; see the per-loop comments.
submit() {
  local f1 f2 id
  mkdir -p "$log_dir"
  shopt -s nullglob # unmatched globs expand to nothing instead of themselves

  # input files "blabla_R_1.fastq" , "blabla_R_2.fastq"
  #
  # --> id "blabla_R" ( "_R" included for historical reasons )
  # --> f1 "blabla_R_1.fastq"
  # --> f2 "blabla_R_2.fastq"
  #
  for f1 in "$fastq_dir"/*_R_1.fastq; do
    id=$(basename "${f1%%_1.fastq}")
    f1=${id}_1.fastq
    f2=${id}_2.fastq
    if [ ! -e "$fastq_dir/$f2" ]; then
      echo "no _2.fastq file for $f1" >&2
      continue
    fi
    "${MXQSUB[@]}" --stdout "$log_dir/$id.log" "$0" --execute "$id" "$f1" "$f2"
  done

  # input files "blabla_R1_001.fastq" , "blabla_R2_001.fastq"
  #
  # --> id "blabla"
  # --> f1 "blabla_R1_001.fastq"
  # --> f2 "blabla_R2_001.fastq"
  for f1 in "$fastq_dir"/*_R1_001.fastq; do
    id=$(basename "${f1%%_R1_001.fastq}")
    f1=${id}_R1_001.fastq
    f2=${id}_R2_001.fastq
    if [ ! -e "$fastq_dir/$f2" ]; then
      echo "no _R2_001.fastq file for $f1" >&2
      continue
    fi
    "${MXQSUB[@]}" --stdout "$log_dir/$id.log" "$0" --execute "$id" "$f1" "$f2"
  done
}
# Run the full per-sample pipeline inside the job's scratch directory:
# FastQC, trimming, bowtie2 mapping, filtering, blacklist removal, Picard
# dedup, MACS2 peak calling, bigwig tracks, QC metrics — then copy the
# results back to $work_dir under the cluster I/O lock.
#
# Arguments: $1 sample id, $2 read-1 fastq basename, $3 read-2 fastq basename
execute() {
    set -xe
    id="$1"
    f1="$2"
    f2="$3"
    # Work in the job-private scratch directory exported by mxq.
    cd "$MXQ_JOB_TMPDIR"
    mkdir fastq
    # flock serializes the bulk copy against shared project storage.
    id="$id" fastq_dir="$fastq_dir" flock "$LOCK_FILE" bash -c 'cp $fastq_dir/$id* fastq/'
    ####################################
    mkdir data final_bam peaks bigwig fastqc dedup
    # The ':' no-op lines below only serve as progress markers in the set -x trace.
    : Running FastQC on $id
    /project/RNAseq_Mdl_data/Konrad/FastQC/fastqc fastq/"$f1" --outdir fastqc
    /project/RNAseq_Mdl_data/Konrad/FastQC/fastqc fastq/"$f2" --outdir fastqc
    : Trimming $id with trim_galore
    prun python3 trim_galore --paired --nextera -j $THREADS fastq/"$f1" fastq/"$f2" > "$id.trimmingStats.txt" 2>&1
    : Mapping $id with bowtie2
    bowtie2 --very-sensitive-local --no-mixed -p $THREADS --no-discordant --phred33 -I 10 -X 2000 -x "$ebwt" \
        -1 "${f1%%.fastq}_val_1.fq" -2 "${f2%%.fastq}_val_2.fq" \
        -S "$id.sam" >> "$id.mappingStats.txt" 2>&1
    : Creating BAM files for $id and filtering for paired and mapped reads
    samtools view -b -h -f 2 -q 20 -@ $THREADS "$id.sam" > "$id.bam"
    : Sorting BAM files for $id
    samtools sort -m 1G -@ $THREADS "$id.bam" -T "${id}_sorted" -o "${id}_sorted.bam"
    : Removing Blacklisted regions from $id
    bedtools intersect -v -a "${id}_sorted.bam" -b "$blist" > "final_bam/${id}_sorted_blacklisted.bam"
    samtools index "final_bam/${id}_sorted_blacklisted.bam"
    : Removing duplicates from ${id} using PICARD
    java -jar "$PICARD" MarkDuplicates INPUT="final_bam/${id}_sorted_blacklisted.bam" OUTPUT="dedup/${id}_dedup.bam" METRICS_FILE="dedup/${id}_dedupMetric.txt" VALIDATION_STRINGENCY=LENIENT REMOVE_DUPLICATES=TRUE
    samtools index "dedup/${id}_dedup.bam"
    : Calling peaks for $id using macs2
    macs2 callpeak -t "final_bam/${id}_sorted_blacklisted.bam" -f BAMPE -g mm -q 0.05 -n "$id" \
        --outdir peaks
    macs2 callpeak -t "dedup/${id}_dedup.bam" -f BAMPE -g mm -q 0.05 -n "${id}" \
        --outdir dedup_peaks
    : Creating BIGWIG tracks for $id
    # note: this is single-threaded but takes long. Put this into separate cluster job?
    prun python3 bamCoverage -b "final_bam/${id}_sorted_blacklisted.bam" -o "bigwig/$id.bw" -bs 10 -e \
        --normalizeUsing CPM -ignore chrX chrY \
        --numberOfProcessors $THREADS
    # BUGFIX: the original ended the next command with a stray trailing '\',
    # which spliced the following ':' marker line into bamCoverage's arguments.
    prun python3 bamCoverage -b "dedup/${id}_dedup.bam" -o "bigwig/${id}_dedup.bw" -bs 10 -e \
        --normalizeUsing CPM -ignore chrX chrY \
        --numberOfProcessors $THREADS
    : Generating QC files for the data
    fragments=$(grep -Po -m 1 'Total reads processed.*' "$id.trimmingStats.txt" | grep -Po '[0-9,]*' | tr -d ,)
    map=$(grep -Po '[0-9, /.]*% overall alignment rate' "$id.mappingStats.txt" | grep -Po '[0-9,/.]*')
    mapFrac=$(printf "%.4f" $(echo $map / 100 | bc -l))
    dedup=$(printf "%.4f" $(grep -Po 'Unknown Library.*' "dedup/${id}_dedupMetric.txt" | awk '{print $10}'))
    # samtools view -c counts read records; paired-end, so halve for fragments.
    filtered=$(( $(samtools view -c "dedup/${id}_dedup.bam") / 2 ))
    fripReads=$(( $(bedtools intersect -a "dedup/${id}_dedup.bam" -b "dedup_peaks/${id}_peaks.narrowPeak" -u -ubam | samtools view -c) / 2 ))
    FRiP=$(printf "%.4f" $(echo $fripReads / $filtered | bc -l))
    peaks=$(( $(wc -l <"dedup_peaks/${id}_peaks.narrowPeak") - 1 ))
    echo -e "$id\t$fragments\t$filtered\t$mapFrac\t$dedup\t$FRiP\t$peaks" >> cnt_gata_qc_metrics.txt
    # Here I have no idea how to transfer this
    : Generating pearson correlation between the samples
    ####################################
    # Copy the results back to shared storage, serialized by the I/O lock.
    work_dir="$work_dir" flock "$LOCK_FILE" bash -c '
        mkdir -p $work_dir/peaks $work_dir/bigwig $work_dir/final_bam $work_dir/fastqc $work_dir/dedup $work_dir/dedup_peaks \
            $work_dir/qc
        cp final_bam/* $work_dir/final_bam
        cp peaks/* $work_dir/peaks/
        cp bigwig/* $work_dir/bigwig/
        cp fastqc/* $work_dir/fastqc
        cp dedup/* $work_dir/dedup
        cp dedup_peaks/* $work_dir/dedup_peaks
        test -e $work_dir/qc/cnt_gata_qc_metrics.txt \
            || echo -e "Sample\tTotal\tFiltered\tMapped\tDuplicate\tFRiP\tPeaks"> $work_dir/qc/cnt_gata_qc_metrics.txt
        cat cnt_gata_qc_metrics.txt >> $work_dir/qc/cnt_gata_qc_metrics.txt
    '
    # Just for the log file
    ls -lR
    # BUGFIX: was $MAX_JOB_TMPDIR — a typo; mxq exports MXQ_JOB_TMPDIR (see cd above).
    du -hs "$MXQ_JOB_TMPDIR"
}
# Dispatch: --submit enqueues cluster jobs; --execute ID F1 F2 runs one job.
# Anything else prints usage and exits non-zero.
if [[ $# -eq 1 && "$1" == --submit ]]; then
  submit
elif [[ $# -eq 4 && "$1" == --execute ]]; then
  execute "$2" "$3" "$4"
else
  usage
fi