Permalink
Cannot retrieve contributors at this time
Name already in use
A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
net-seq-pipeline/NetSeqPipeline
Go to fileThis commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
494 lines (414 sloc)
18.9 KB
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env bash | |
# | |
# ./NETSeqPipeline -o PathToOutputFiles -f fastqfile1.fq fastqfile2.fq ... fastqfileX.fq | |
# -f | --files [fqFile1.fq ... fqFileX.fq]: variable number of sequencing data in fastq format; NO DEFAULT | |
# -o | --output [PathToOutputFiles]: relative or absolut output path; DEFAULT: current directory | |
# -r | --reference [Assembly]: name of reference genome; DEFAULT:"GRCh38" | |
# -id | --sampleID [ID]: identification string; DEFAULT:"UNKNOWN" | |
# | |
# -c | --compression [Method]: either "bz2","gz", "none"; DEFAULT:"none" | |
# -p | --parameterFile [Parameters.in]: STAR mapper parameter file; NO DEFAULT | |
# | |
# -sa | --star [PathToStarApplication]: relative or absolut path to STAR mapper + name of executable; NO DEFAULT | |
# -si | --star-index [PathToStarIndices]: relative or absolut path to STAR indices, DEFAULT: PathToStarApplication "/index" | |
# -st | --samtools [PathToSamtoolsApplication]: relative or absolut path to samtool application + name of executable; NO DEFAULT | |
# -bt | --bedtools [PathToBadtoolsApplication]: relative or absolut path to samtool application + name of executable; NO DEFAULT | |
# | |
### requirements | |
# | |
# * python 2.7 with moduls pysam, HTSeq | |
# * bzip2 or gzip if file is compressed, respectively | |
# * STAR (2.5.3a) | |
# * samtools | |
# * bedtools | |
# * wget | |
# * curl | |
# | |
### remarks | |
# | |
# * make sure you have write rights on -o PathToOutputFiles | |
# * make sure you have read rights on -f fastq files | |
# * internet connection and write right in application file | |
############################################################################################################################## | |
############################################################################################################################## | |
################################################ setting default values #################################################### | |
echo "################################# setting default values" | |
workDir=$(pwd) | |
outDir=$workDir | |
absolutPath=$(readlink -f "$0") | |
scriptDir=$(dirname "$absolutPath") | |
compression=none | |
reference=GRCh38 | |
star_index=$star/index | |
sampleID=Unknown | |
############################################################################################################################## | |
############################################################################################################################## | |
###################################### Read input arguments and set customer variables ##################################### | |
echo "################################# read customer values" | |
while [[ $# -gt 0 ]] | |
do | |
key="$1" | |
case $key in | |
-o|--output) | |
outDir="$2" | |
shift 2 # past argument | |
;; | |
-id|--sampleID) | |
sampleID="$2" | |
shift 2 | |
;; | |
-c|--compression) | |
compression="$2" | |
shift 2 | |
;; | |
-sa|--star) | |
star="$2" | |
shift 2 | |
;; | |
-bt|--bedtools) | |
bedtools="$2" | |
shift 2 | |
;; | |
-st|--samtools) | |
samtools="$2" | |
shift 2 | |
;; | |
-p|--parameterFile) | |
pFile="$2" | |
shift 2 | |
;; | |
-si|--star-index) | |
star_index="$2" | |
shift 2 | |
;; | |
-r|--reference) | |
reference="$2" | |
shift 2 | |
;; | |
-f|--files) | |
numFiles=0 | |
FILES="" | |
shift 1 | |
while [[ $1 =~ .*\.fq || $1 =~ .*\.fastq ]] | |
do | |
if [[ $FILES != "" ]]; | |
then | |
FILES="$FILES\n" | |
fi | |
FILES="$FILES$1" | |
numFiles=$((numFiles+1)) | |
shift 1 | |
done | |
;; | |
*) | |
status="### $key isn't a valid input argument" | |
echo $status | |
exit 0 | |
;; | |
esac | |
done | |
# output current variables | |
echo "# Sample identification = $sampleID" | |
echo "# Number of input files = $numFiles" | |
echo -e "# File paths = \n$FILES" | |
echo "# Output = $outDir" | |
echo "# Reference = $reference" | |
echo "# Compression = $compression" | |
echo "# STAR path = $star" | |
echo "# STAR index = $star_index" | |
echo "# Samtools = $samtools" | |
FILES=`echo -e $FILES` | |
mkdir -p $outDir | |
echo "################################# Check if input file(s) exist ... " | |
for f in $FILES | |
do | |
if [ ! -f $f ];then | |
status="### $f doesn't exist" | |
echo $status | |
exit 0 | |
fi | |
done | |
############################################################################################################################## | |
############################################################################################################################## | |
###################################### Decompress files and save new filename in variable $FILES ########################### | |
echo "################################# Decompress Files ... " | |
temp=$outDir/TemporaryData | |
mkdir -p $temp | |
if [ "$compression" != "none" ];then | |
FILES_TEMP="" | |
for f in $FILES | |
do | |
base=`basename $f` | |
name=${base%.*} # contain .fqTEMP | |
echo "# Decompress file "$f | |
fq=$temp/$name | |
case $f in | |
*.bz2) | |
# check if decompresed file already exist | |
if [ ! -f $fq ];then | |
echo "bunzip2:$f" | |
bunzip2 -c $f -k > $fq | |
fi | |
;; | |
*.gz) | |
# check if decompresed file already exist | |
if [ ! -f $fq ];then | |
echo "gunzip:$f" | |
gunzip -c $f -k > $fq | |
fi | |
;; | |
*) | |
status="### $f is already decompressed or compression type is not supportet" | |
echo $status | |
exit 0 | |
;; | |
esac | |
if [[ $FILES_TEMP != "" ]];then | |
FILES_TEMP="$FILES_TEMP\n" | |
fi | |
FILES_TEMP="$FILES_TEMP${fq}" | |
done | |
# new filename(s) without compressed suffix in $FILES | |
FILES=`echo -e $FILES_TEMP` | |
fi | |
############################################################################################################################## | |
############################################################################################################################## | |
######################################################## Molecular Barcode Extraction ######################################### | |
# Description: | |
# removal of 6 first nucleotides (molecular barcode) from FastQ files | |
# creates 2 additional files containing the molecular barcode/ligation hexamer counts | |
# | |
# Program: | |
# extractMolecularBarcode.py | |
# | |
echo "################################# Extracting Molecular Barcode... " | |
for f in $FILES | |
do | |
base=`basename $f` | |
name=${base%.*} | |
reads=$temp/$name | |
echo "# Processing file $base" | |
# check if decompresed file without barcode already exist | |
if [ ! -f ${reads}_noBarcode.fastq ];then | |
python $scriptDir/source/extractMolecularBarcode.py $f ${reads}_noBarcode.fastq ${reads}_barcodeDistribution.txt ${reads}_split_ligationDistribution.txt $outDir/lenReads.txt | |
fi | |
done | |
############################################################################################################################## | |
############################################################################################################################## | |
######################################################## Download Data ################################################### | |
# Description: | |
# * download sequence and annotation data | |
# * generate info file for further handling in NETSeq pipeline | |
# Necessary for: | |
# 1) generating genome index | |
# 2) generating splicing intermediates | |
# Program: | |
# wget, curl | |
# | |
echo "################################# Download data ... " | |
sequences=$scriptDir/data/sequences | |
annotation=$scriptDir/data/annotation | |
info=$scriptDir/data/info.stdout | |
mkdir -p $scriptDir/data | |
mkdir -p $sequences | |
mkdir -p $annotation | |
# ckeck if files exist, if not downloadData.sh will download data | |
# REMARK: download of sequence and annotation data maybe fail if links are expired | |
bash $scriptDir/source/downloadData.sh $sequences $annotation $reference $scriptDir $info | |
GENOME=`sed '1q;d' $info` | |
ANNOTATION=`sed '2q;d' $info` | |
PREFIX=`sed '3q;d' $info` | |
############################################################################################################################## | |
############################################################################################################################## | |
######################################################## Generating Genome Index ############################################ | |
# Description: | |
# STAR run building required genome indices ... | |
# Required for mapping | |
# Program: | |
# STAR | |
# | |
echo "################################# Build genome index ... " | |
idxDir=$star_index/${reference} | |
mkdir -p $idxDir | |
# check if index alredy exist | |
if [ -z "$(ls -A $idxDir)" ]; then | |
$star --runThreadN 40 --runMode genomeGenerate --genomeDir $idxDir \ | |
--genomeFastaFiles $GENOME $sequences/rDNA.Genbank.U13369.1.fa \ | |
$sequences/28SrRNAPseudogenes.Genebank.U67616.1.fa $sequences/${reference}.tRNA.CCA.fa | |
fi | |
############################################################################################################################## | |
############################################################################################################################## | |
######################################################## Alignment ########################################################## | |
# Description: | |
# STAR run on reads not containing/containing molecular barcodes ... | |
# Required for further Reverse Transcriptase (RT) bias removal | |
# | |
# Program: | |
# STAR | |
# | |
echo "################################# Aligning reads containing or not Molecular Barcode ... " | |
mapping_dir=$temp/StarMapping | |
mkdir -p $mapping_dir | |
for f in $FILES | |
do | |
base=`basename $f` | |
name=${base%.*} | |
mapping=$mapping_dir/$name | |
echo "# Mapping file "$base | |
mkdir -p $mapping | |
# check if mapping is alredy done | |
if [ -z "$(ls -A $mapping)" ];then | |
reads=$temp/$name | |
$star --genomeDir $idxDir --readFilesIn ${reads}_noBarcode.fastq --parametersFiles $pFile --runThreadN 20 --outFileNamePrefix $mapping/${name}_ | |
fi | |
mkdir -p ${mapping}_withBC | |
# check if mapping is alredy done | |
if [ -z "$(ls -A ${mapping}_withBC)" ];then | |
$star --genomeDir $idxDir --readFilesIn $f --parametersFiles $pFile --runThreadN 20 --outFileNamePrefix ${mapping}_withBC/${name}_ | |
fi | |
done | |
############################################################################################################################## | |
############################################################################################################################## | |
########################################## Reverse transcriptase mispriming event removal #################################### | |
# Description: | |
# to remove artefactual reads (arising from RT) | |
# that are mapping even when we leave the barcode sequence | |
# Program: | |
# samtools | |
# extractReadsWithMismatchesIn6FirstNct.py | |
# | |
echo "################################# Removing reads with RT bias ... " | |
bam_dir=$temp/BamPostProcessing | |
mkdir -p $bam_dir | |
for f in $FILES | |
do | |
base=`basename $f` | |
name=${base%.*} | |
mapping=$mapping_dir/$name | |
echo "# Remove Reverse Transcription from file "$base | |
Bam1=$mapping/${name}_Aligned.sortedByCoord.out.bam | |
Bam2=${mapping}_withBC/${name}_Aligned.sortedByCoord.out.bam | |
bam=$bam_dir/$name | |
mkdir -p $bam | |
if [ ! -f $bam/${name}_noRTbias.sorted_uniq.bam ];then # check if analysis already done | |
$scriptDir/source/rtRemoval.sh $bam $name $samtools $Bam1 $Bam2 $scriptDir | |
fi | |
done | |
############################################################################################################################## | |
############################################################################################################################## | |
########################################## Extracting Pol2 coverage track #################################################### | |
# Description: | |
# Monitors whole read coverage and pol II position | |
# Generates bedgraph file | |
# The *_pos.bedGraph files contain info from gene in the pos strand, and inversely for the *_neg.bedGraph files | |
# Program: | |
# samtools | |
# customCoverage.py | |
# | |
echo "################################# Extracting Pol2 coverage track ... " | |
cov_dir=$temp/CoverageTrack | |
mkdir -p $cov_dir | |
for f in $FILES | |
do | |
base=`basename $f` | |
name=${base%.*} | |
echo "# Processing file "$base | |
cov=$cov_dir/$name | |
mkdir -p $cov | |
uniqueBam=$bam_dir/$name/${name}_noRTbias.sorted_uniq.bam | |
if [ ! -f $cov/${name}_uniq_noRTbias_stt_neg.bedGraph ] || [ ! -f $cov/${name}_uniq_noRTbias_stt_pos.bedGraph ];then # check if analysis already done | |
bash $scriptDir/source/coverage.sh $samtools $uniqueBam $scriptDir $cov/${name}_uniq_noRTbias_ | |
fi | |
done | |
############################################################################################################################## | |
############################################################################################################################## | |
###################### Prepare bedGraph and bam files for the PCR duplication step ################################## | |
# Description: | |
# generates bedGraph and bam files for the PCR duplication step | |
# Program: | |
# joinBedGraph.r | |
# samtools | |
# bedtools | |
# | |
echo "################################# Join bedGraph and bam files for the PCR duplication step ... " | |
if [ ! -f $cov_dir/${sampleID}_uniq_noRTbias_stt_pos.bedGraph ] || [ ! -f $cov_dir/${sampleID}_uniq_noRTbias_stt_neg.bedGraph ] || [ ! -f $bam_dir/${sampleID}_noRTbias.sorted_uniq.bam.bai ];then | |
if [ "$numFiles" -gt "1" ];then | |
BAMFILES="" | |
BEDGRAPHPOS="" | |
BEDGRAPHNEG="" | |
for f in $FILES | |
do | |
base=`basename $f` | |
name=${base%.*} | |
BAMFILES="$BAMFILES $bam_dir/$name/${name}_noRTbias.sorted_uniq.bam" | |
$bedtools sort -i $cov_dir/$name/${name}_uniq_noRTbias_stt_pos.bedGraph > $cov_dir/$name/${name}_uniq_noRTbias_stt_pos.bedGraph.temp | |
$bedtools sort -i $cov_dir/$name/${name}_uniq_noRTbias_stt_neg.bedGraph > $cov_dir/$name/${name}_uniq_noRTbias_stt_neg.bedGraph.temp | |
cp $cov_dir/$name/${name}_uniq_noRTbias_stt_pos.bedGraph.temp $cov_dir/$name/${name}_uniq_noRTbias_stt_pos.bedGraph | |
cp $cov_dir/$name/${name}_uniq_noRTbias_stt_neg.bedGraph.temp $cov_dir/$name/${name}_uniq_noRTbias_stt_neg.bedGraph | |
BEDGRAPHPOS="$BEDGRAPHPOS $cov_dir/$name/${name}_uniq_noRTbias_stt_pos.bedGraph" | |
BEDGRAPHNEG="$BEDGRAPHNEG $cov_dir/$name/${name}_uniq_noRTbias_stt_neg.bedGraph" | |
done | |
if [ ! -f $cov_dir/${sampleID}_uniq_noRTbias_stt_pos.bedGraph.temp ] || [ ! -f $cov_dir/${sampleID}_uniq_noRTbias_stt_neg.bedGraph.temp ];then | |
$bedtools unionbedg -i $BEDGRAPHPOS > $cov_dir/${sampleID}_uniq_noRTbias_stt_pos.bedGraph.temp | |
$bedtools unionbedg -i $BEDGRAPHNEG > $cov_dir/${sampleID}_uniq_noRTbias_stt_neg.bedGraph.temp | |
Rscript $scriptDir/source/joinBedGraph.r $cov_dir/${sampleID}_uniq_noRTbias_stt_pos.bedGraph.temp > $cov_dir/${sampleID}_uniq_noRTbias_stt_pos.bedGraph | |
Rscript $scriptDir/source/joinBedGraph.r $cov_dir/${sampleID}_uniq_noRTbias_stt_neg.bedGraph.temp > $cov_dir/${sampleID}_uniq_noRTbias_stt_neg.bedGraph | |
fi | |
# join bam file for PCR duplicate analysis | |
if [ ! -f $bam_dir/${sampleID}_noRTbias.sorted_uniq.bam.bai ];then | |
$samtools merge -f $bam_dir/${sampleID}_noRTbias.uniq.bam $BAMFILES | |
$samtools sort $bam_dir/${sampleID}_noRTbias.uniq.bam $bam_dir/${sampleID}_noRTbias.sorted_uniq | |
$samtools index $bam_dir/${sampleID}_noRTbias.sorted_uniq.bam | |
fi | |
else | |
base=`basename $FILES` | |
echo $FILES | |
name=${base%.*} | |
echo "# Processing File "$base | |
Rscript $scriptDir/source/joinBedGraph.r $cov_dir/$name/${name}_uniq_noRTbias_stt_pos.bedGraph > $cov_dir/${sampleID}_uniq_noRTbias_stt_pos.bedGraph | |
Rscript $scriptDir/source/joinBedGraph.r $cov_dir/$name/${name}_uniq_noRTbias_stt_neg.bedGraph > $cov_dir/${sampleID}_uniq_noRTbias_stt_neg.bedGraph | |
cp $bam_dir/$name/${name}_noRTbias.sorted_uniq.bam $bam_dir/${sampleID}_noRTbias.sorted_uniq.bam | |
cp $bam_dir/$name/${name}_noRTbias.sorted_uniq.bam.bai $bam_dir/${sampleID}_noRTbias.sorted_uniq.bam.bai | |
fi | |
fi | |
############################################################################################################################## | |
############################################################################################################################## | |
########################### Calculate Splicing Intermediates (SI) Coordinates ################################################ | |
# Description: | |
# extract splicing intermediate positions from annotation files | |
# Program: | |
# getSICoordinates1Based.r | |
# | |
echo "################################# Calculate Splicing Intermediates (SI) Coordinates ... " | |
SIfileReference=$annotation/SI_coordinates_${reference}_1based.txt | |
if [ ! -f $SIfileReference ];then | |
bash Rscript $scriptDir/source/getSICoordinates1Based.r $ANNOTATION $SIfileReference | |
fi | |
# | |
############################################################################################################################## | |
############################################################################################################################## | |
###################### Remove PCR duplicates and Splicing intermediates from Bam and BedGraph files ########################## | |
# Description: | |
# to remove artefactual reads (arising from PCR duplicates) | |
# that have the same genomic position and molecular barcode than another read | |
# Program: | |
# samtools | |
# removePCRduplicateStringent.py : a python script that extract reads arising from PCR duplication events | |
# | |
echo "################################# Removing PCR duplicates and Splicing intermediates from Bam and BedGraph files ... " | |
echo "# Processing ${sampleID}" | |
if [ ! -f $cov_dir/${sampleID}_uniq_noRTbias_noPCRdup_noSI_stt_neg.bedGraph ] || [ ! -f $cov_dir/${sampleID}_uniq_noRTbias_noPCRdup_noSI_stt_pos.bedGraph ];then | |
bash $scriptDir/source/removePCRduplicates.sh $cov_dir $bam_dir $scriptDir $SIfileReference $sampleID $samtools $idxDir $bedtools | |
fi | |
############################################################################################################################## | |
############################################################################################################################## | |
################### Split reference and filter chromosomal pol2 coverage ######################### | |
echo "################################# Split reference and filter chromosomal pol2 coverage ... " | |
result_dir=$outDir/Bedgraph | |
raw_dir=$result_dir/RawBedgraph | |
mkdir -p $result_dir | |
mkdir -p $raw_dir | |
grep -P "^chr[0-9,X,Y]*\t" $cov_dir/${sampleID}_uniq_noRTbias_noPCRdup_noSI_stt_pos.bedGraph | $bedtools sort -i - > $result_dir/${sampleID}_uniq_noRTbias_noPCRdup_noSI_${reference}_stt_pos.bedGraph | |
grep -P "^chr[0-9,X,Y]*\t" $cov_dir/${sampleID}_uniq_noRTbias_noPCRdup_noSI_stt_neg.bedGraph | $bedtools sort -i - > $result_dir/${sampleID}_uniq_noRTbias_noPCRdup_noSI_${reference}_stt_neg.bedGraph | |
cp $cov_dir/${sampleID}_uniq_noRTbias_noPCRdup_noSI_stt_neg.bedGraph $raw_dir/${sampleID}_uniq_noRTbias_noPCRdup_noSI_stt_neg.bedGraph | |
cp $cov_dir/${sampleID}_uniq_noRTbias_noPCRdup_noSI_stt_pos.bedGraph $raw_dir/${sampleID}_uniq_noRTbias_noPCRdup_noSI_stt_pos.bedGraph | |