Skip to content
Permalink
71fd137e1c
Switch branches/tags

Name already in use

A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
Go to file
 
 
Cannot retrieve contributors at this time
executable file 188 lines (140 sloc) 5.84 KB
#!/bin/bash
# Performs alignment and filtering of CUT&Tag data
#
# Run with --submit on the login node to enqueue one cluster job per
# fastq pair; each job re-invokes this script with --execute.
#
# FIXME:
# picard.jar should be available from some
# program directory.
# Downloaded from https://github.com/broadinstitute/picard/releases/tag/2.27.4
PICARD=/project/RNAseq_Mdl_data/Konrad/August/picard.jar
# inputs
fastq_dir=/project/RNAseq_Mdl_data/Konrad/August/fastq # Fastq directory
ebwt=/project/MDL_ChIPseq/data/genome/bowtie2_index/mm10/mm10 # Built genome directory (bowtie_build2)
blist=/project/RNAseq_Mdl_data/Konrad/mm10.bl.bed # blacklist regions (BED)
# outputs
work_dir="/project/RNAseq_Mdl_data/Konrad/August" # Output Directory ("peaks" and "bigwig")
log_dir=/project/RNAseq_Mdl_data/Konrad/August/log # cluster job log files
# parameter
THREADS=32
# mxqsub invocation built as an array so every option stays a single word
# even if a value ever contains whitespace
MXQSUB=(mxqsub)
MXQSUB+=(--group-name devel)
MXQSUB+=(--runtime 10h)
MXQSUB+=(--processors "$THREADS")
MXQSUB+=(--memory 500G)
MXQSUB+=(--tmpdir 500G)
# lock file serializing bulk copies to/from the shared filesystem
LOCK_FILE=/project/RNAseq_Mdl_data/Konrad/donald/.clusterio.lock
# Print a usage line on stderr and abort with a non-zero exit status.
usage() {
  printf '%s\n' "usage: $0 (--submit | --execute ID) " >&2
  exit 1
}
# Enqueue the cluster job for one fastq mate pair, or report the pair as
# incomplete.
# Globals:   fastq_dir, log_dir, MXQSUB (read)
# Arguments: $1 - sample id
#            $2 - mate-1 fastq name (relative to $fastq_dir)
#            $3 - mate-2 fastq name (relative to $fastq_dir)
#            $4 - expected mate-2 suffix (only used in the error message)
submit_pair() {
  local id=$1 f1=$2 f2=$3 suffix=$4
  if [ ! -e "$fastq_dir/$f2" ]; then
    echo "no $suffix file for $f1"
    return
  fi
  "${MXQSUB[@]}" --stdout "$log_dir/$id.log" "$0" --execute "$id" "$f1" "$f2"
}
# Scan $fastq_dir for paired-end fastq files (two naming schemes) and
# submit one --execute cluster job per complete pair.
submit() {
  mkdir -p "$log_dir"
  shopt -s nullglob
  local f1 id
  # input files "blabla_R_1.fastq" , "blabla_R_2.fastq"
  #
  # --> id "blabla_R" ( "_R" included for historical reasons )
  # --> f1 "blabla_R_1.fastq"
  # --> f2 "blabla_R_2.fastq"
  #
  for f1 in "$fastq_dir"/*_R_1.fastq; do
    id=$(basename "${f1%_1.fastq}")
    submit_pair "$id" "${id}_1.fastq" "${id}_2.fastq" _2.fastq
  done
  # input files "blabla_R1_001.fastq" , "blabla_R2_001.fastq"
  #
  # --> id "blabla"
  # --> f1 "blabla_R1_001.fastq"
  # --> f2 "blabla_R2_001.fastq"
  for f1 in "$fastq_dir"/*_R1_001.fastq; do
    id=$(basename "${f1%_R1_001.fastq}")
    submit_pair "$id" "${id}_R1_001.fastq" "${id}_R2_001.fastq" _R2_001.fastq
  done
}
# Run the complete alignment / filtering / peak-calling pipeline for one
# sample inside the cluster job's node-local tmpdir, then copy the results
# back to $work_dir under a file lock.
# Arguments: $1 - sample id
#            $2 - mate-1 fastq file name (relative to $fastq_dir)
#            $3 - mate-2 fastq file name (relative to $fastq_dir)
# Globals read: fastq_dir ebwt blist work_dir THREADS PICARD LOCK_FILE
#               MXQ_JOB_TMPDIR (provided by the mxq scheduler for this job)
execute() {
  set -xe
  id="$1"
  f1="$2"
  f2="$3"
  cd "$MXQ_JOB_TMPDIR"
  mkdir fastq
  # stage input fastq files under the I/O lock so parallel jobs do not
  # hammer the shared filesystem simultaneously
  id="$id" fastq_dir="$fastq_dir" flock "$LOCK_FILE" bash -c 'cp $fastq_dir/$id* fastq/'
  ####################################
  mkdir data final_bam peaks bigwig fastqc dedup
  : Running FastQC on $id
  /project/RNAseq_Mdl_data/Konrad/FastQC/fastqc "fastq/$f1" --outdir fastqc
  /project/RNAseq_Mdl_data/Konrad/FastQC/fastqc "fastq/$f2" --outdir fastqc
  : Trimming $id with trim_galore
  prun python3 trim_galore --paired --nextera -j "$THREADS" "fastq/$f1" "fastq/$f2" > "$id.trimmingStats.txt" 2>&1
  : Mapping $id with bowtie2
  bowtie2 --very-sensitive-local --no-mixed -p "$THREADS" --no-discordant --phred33 -I 10 -X 2000 -x "$ebwt" -1 "${f1%%.fastq}_val_1.fq" -2 "${f2%%.fastq}_val_2.fq" \
    -S "$id.sam" >> "$id.mappingStats.txt" 2>&1
  : Creating BAM files for $id and filtering for paired and mapped reads
  samtools view -b -h -f 2 -q 20 -@ "$THREADS" "$id.sam" > "$id.bam"
  : Sorting BAM files for $id
  samtools sort -m 1G -@ "$THREADS" "$id.bam" -T "${id}_sorted" -o "${id}_sorted.bam"
  : Removing Blacklisted regions from $id
  bedtools intersect -v -a "${id}_sorted.bam" -b "$blist" > "final_bam/${id}_sorted_blacklisted.bam"
  samtools index "final_bam/${id}_sorted_blacklisted.bam"
  : Removing duplicates from ${id} using PICARD
  java -jar "$PICARD" MarkDuplicates INPUT="final_bam/${id}_sorted_blacklisted.bam" OUTPUT="dedup/${id}_dedup.bam" METRICS_FILE="dedup/${id}_dedupMetric.txt" VALIDATION_STRINGENCY=LENIENT REMOVE_DUPLICATES=TRUE
  samtools index "dedup/${id}_dedup.bam"
  : Calling peaks for $id using macs2
  macs2 callpeak -t "final_bam/${id}_sorted_blacklisted.bam" -f BAMPE -g mm -q 0.05 -n "$id" \
    --outdir peaks
  macs2 callpeak -t "dedup/${id}_dedup.bam" -f BAMPE -g mm -q 0.05 -n "${id}" \
    --outdir dedup_peaks
  : Creating BIGWIG tracks for $id
  # note: this is single-threaded but takes long. Put this into separate cluster job?
  prun python3 bamCoverage -b "final_bam/${id}_sorted_blacklisted.bam" -o "bigwig/$id.bw" -bs 10 -e \
    --normalizeUsing CPM -ignore chrX chrY \
    --numberOfProcessors "$THREADS"
  # BUGFIX: the original command ended in a stray trailing backslash, which
  # made the following ': Generating QC files' line part of this command.
  prun python3 bamCoverage -b "dedup/${id}_dedup.bam" -o "bigwig/${id}_dedup.bw" -bs 10 -e \
    --normalizeUsing CPM -ignore chrX chrY \
    --numberOfProcessors "$THREADS"
  : Generating QC files for the data
  fragments=$(grep -Po -m 1 'Total reads processed.*' "$id.trimmingStats.txt" | grep -Po '[0-9,]*' | tr -d ,)
  map=$(grep -Po '[0-9, /.]*% overall alignment rate' "$id.mappingStats.txt" | grep -Po '[0-9,/.]*')
  mapFrac=$(printf "%.4f" $(echo "$map / 100" | bc -l))
  dedup=$(printf "%.4f" $(grep -Po 'Unknown Library.*' "dedup/${id}_dedupMetric.txt" | awk '{print $10}'))
  # read counts are halved: paired-end, two reads per fragment
  filtered=$(( $(samtools view -c "dedup/${id}_dedup.bam") / 2 ))
  fripReads=$(( $(bedtools intersect -a "dedup/${id}_dedup.bam" -b "dedup_peaks/${id}_peaks.narrowPeak" -u -ubam | samtools view -c) / 2 ))
  FRiP=$(printf "%.4f" $(echo "$fripReads / $filtered" | bc -l))
  peaks=$(( $(wc -l <"dedup_peaks/${id}_peaks.narrowPeak") - 1 ))
  echo -e "$id\t$fragments\t$filtered\t$mapFrac\t$dedup\t$FRiP\t$peaks" >> cnt_gata_qc_metrics.txt
  # Here I have no idea how to transfer this
  : Generating pearson correlation between the samples
  ####################################
  # copy all results back to the shared work dir, again under the I/O lock
  work_dir="$work_dir" flock "$LOCK_FILE" bash -c '
mkdir -p $work_dir/peaks $work_dir/bigwig $work_dir/final_bam $work_dir/fastqc $work_dir/dedup $work_dir/dedup_peaks \
$work_dir/qc
cp final_bam/* $work_dir/final_bam
cp peaks/* $work_dir/peaks/
cp bigwig/* $work_dir/bigwig/
cp fastqc/* $work_dir/fastqc
cp dedup/* $work_dir/dedup
cp dedup_peaks/* $work_dir/dedup_peaks
test -e $work_dir/qc/cnt_gata_qc_metrics.txt \
|| echo -e "Sample\tTotal\tFiltered\tMapped\tDuplicate\tFRiP\tPeaks"> $work_dir/qc/cnt_gata_qc_metrics.txt
cat cnt_gata_qc_metrics.txt >> $work_dir/qc/cnt_gata_qc_metrics.txt
'
  # Just for the log file
  ls -lR
  # BUGFIX: was $MAX_JOB_TMPDIR (typo); the scheduler variable is MXQ_JOB_TMPDIR
  du -hs "$MXQ_JOB_TMPDIR"
}
# Command-line dispatch: --submit runs on the login node; --execute is the
# re-invocation performed by each queued cluster job.
# NOTE: replaced the deprecated/ambiguous test operator -a (SC2166) with
# separate tests joined by &&.
if [ $# = 1 ] && [ "$1" = --submit ]; then
  submit
elif [ $# = 4 ] && [ "$1" = --execute ]; then
  execute "$2" "$3" "$4"
else
  usage
fi