From 60a4f2bc4d2f130341f0554c79c8e9f7e8eb5800 Mon Sep 17 00:00:00 2001 From: Martyna Gajos Date: Fri, 17 Nov 2017 15:53:55 +0100 Subject: [PATCH] net-seq-pipeline --- NetSeqPipeline | 494 ++++++++++++++++++ README.md | 28 + parameter_STAR.in | 480 +++++++++++++++++ parameter_dependencies.in | 25 + parameter_template.in | 31 ++ runNetSeqPipe.netPipe | 13 + source/appendCCAinFasta.r | 18 + source/coverage.sh | 17 + source/customCoverage.py | 32 ++ source/downloadData.sh | 80 +++ source/extractMolecularBarcode.py | 87 +++ .../extractReadsWithMismatchesIn6FirstNct.py | 40 ++ source/getSICoordinates1Based.r | 50 ++ source/joinBedGraph.r | 29 + source/normalizeBedgraph.sh | 43 ++ source/normalizedBedgraph.r | 13 + source/removePCRduplicateStringent.py | 72 +++ source/removePCRduplicates.sh | 150 ++++++ source/rtRemoval.sh | 52 ++ 19 files changed, 1754 insertions(+) create mode 100644 NetSeqPipeline create mode 100644 README.md create mode 100644 parameter_STAR.in create mode 100644 parameter_dependencies.in create mode 100644 parameter_template.in create mode 100644 runNetSeqPipe.netPipe create mode 100755 source/appendCCAinFasta.r create mode 100755 source/coverage.sh create mode 100755 source/customCoverage.py create mode 100755 source/downloadData.sh create mode 100755 source/extractMolecularBarcode.py create mode 100755 source/extractReadsWithMismatchesIn6FirstNct.py create mode 100755 source/getSICoordinates1Based.r create mode 100755 source/joinBedGraph.r create mode 100755 source/normalizeBedgraph.sh create mode 100755 source/normalizedBedgraph.r create mode 100755 source/removePCRduplicateStringent.py create mode 100755 source/removePCRduplicates.sh create mode 100755 source/rtRemoval.sh diff --git a/NetSeqPipeline b/NetSeqPipeline new file mode 100644 index 0000000..8c007c8 --- /dev/null +++ b/NetSeqPipeline @@ -0,0 +1,494 @@ +#!/usr/bin/env bash +# +# ./NETSeqPipeline -o PathToOutputFiles -f fastqfile1.fq fastqfile2.fq ... fastqfileX.fq +# -f | --files [fqFile1.fq ... fqFileX.fq]: variable number of sequencing data in fastq format; NO DEFAULT +# -o | --output [PathToOutputFiles]: relative or absolut output path; DEFAULT: current directory +# -r | --reference [Assembly]: name of reference genome; DEFAULT:"GRCh38" +# -id | --sampleID [ID]: identification string; DEFAULT:"UNKNOWN" +# +# -c | --compression [Method]: either "bz2","gz", "none"; DEFAULT:"none" +# -p | --parameterFile [Parameters.in]: STAR mapper parameter file; NO DEFAULT +# +# -sa | --star [PathToStarApplication]: relative or absolut path to STAR mapper + name of executable; NO DEFAULT +# -si | --star-index [PathToStarIndices]: relative or absolut path to STAR indices, DEFAULT: PathToStarApplication "/index" +# -st | --samtools [PathToSamtoolsApplication]: relative or absolut path to samtool application + name of executable; NO DEFAULT +# -bt | --bedtools [PathToBadtoolsApplication]: relative or absolut path to samtool application + name of executable; NO DEFAULT +# +### requirements +# +# * python 2.7 with moduls pysam, HTSeq +# * bzip2 or gzip if file is compressed, respectively +# * STAR (2.5.3a) +# * samtools +# * bedtools +# * wget +# * curl +# +### remarks +# +# * make sure you have write rights on -o PathToOutputFiles +# * make sure you have read rights on -f fastq files +# * internet connection and write right in application file + +############################################################################################################################## +############################################################################################################################## +################################################ setting default values #################################################### + +echo "################################# setting default values" + +workDir=$(pwd) +outDir=$workDir +absolutPath=$(readlink -f "$0") +scriptDir=$(dirname "$absolutPath") +compression=none +reference=GRCh38 +star_index=$star/index +sampleID=Unknown + +############################################################################################################################## +############################################################################################################################## +###################################### Read input arguments and set customer variables ##################################### + +echo "################################# read customer values" + +while [[ $# -gt 0 ]] +do +key="$1" +case $key in + -o|--output) + outDir="$2" + shift 2 # past argument + ;; + + -id|--sampleID) + sampleID="$2" + shift 2 + ;; + + -c|--compression) + compression="$2" + shift 2 + ;; + + -sa|--star) + star="$2" + shift 2 + ;; + + -bt|--bedtools) + bedtools="$2" + shift 2 + ;; + + -st|--samtools) + samtools="$2" + shift 2 + ;; + + -p|--parameterFile) + pFile="$2" + shift 2 + ;; + + -si|--star-index) + star_index="$2" + shift 2 + ;; + + -r|--reference) + reference="$2" + shift 2 + ;; + + -f|--files) + numFiles=0 + FILES="" + shift 1 + while [[ $1 =~ .*\.fq || $1 =~ .*\.fastq ]] + do + if [[ $FILES != "" ]]; + then + FILES="$FILES\n" + fi + FILES="$FILES$1" + numFiles=$((numFiles+1)) + shift 1 + done + ;; + + *) + status="### $key isn't a valid input argument" + echo $status + exit 0 + ;; + +esac +done + +# output current variables + +echo "# Sample identification = $sampleID" +echo "# Number of input files = $numFiles" +echo -e "# File paths = \n$FILES" +echo "# Output = $outDir" +echo "# Reference = $reference" +echo "# Compression = $compression" +echo "# STAR path = $star" +echo "# STAR index = $star_index" +echo "# Samtools = $samtools" + + +FILES=`echo -e $FILES` +mkdir -p $outDir + +echo "################################# Check if input file(s) exist ... " + +for f in $FILES +do + if [ ! -f $f ];then + status="### $f doesn't exist" + echo $status + exit 0 + fi +done + +############################################################################################################################## +############################################################################################################################## +###################################### Decompress files and save new filename in variable $FILES ########################### + +echo "################################# Decompress Files ... " + +temp=$outDir/TemporaryData +mkdir -p $temp + +if [ "$compression" != "none" ];then + FILES_TEMP="" + for f in $FILES + do + base=`basename $f` + name=${base%.*} # contain .fqTEMP + echo "# Decompress file "$f + fq=$temp/$name + case $f in + *.bz2) + # check if decompresed file already exist + if [ ! -f $fq ];then + echo "bunzip2:$f" + bunzip2 -c $f -k > $fq + fi + ;; + *.gz) + # check if decompresed file already exist + if [ ! -f $fq ];then + echo "gunzip:$f" + gunzip -c $f -k > $fq + fi + ;; + *) + status="### $f is already decompressed or compression type is not supportet" + echo $status + exit 0 + ;; + esac + if [[ $FILES_TEMP != "" ]];then + FILES_TEMP="$FILES_TEMP\n" + fi + FILES_TEMP="$FILES_TEMP${fq}" + done + # new filename(s) without compressed suffix in $FILES + FILES=`echo -e $FILES_TEMP` +fi + +############################################################################################################################## +############################################################################################################################## +######################################################## Molecular Barcode Extraction ######################################### +# Description: +# removal of 6 first nucleotides (molecular barcode) from FastQ files +# creates 2 additional files containing the molecular barcode/ligation hexamer counts +# +# Program: +# extractMolecularBarcode.py +# + +echo "################################# Extracting Molecular Barcode... " + +for f in $FILES +do + base=`basename $f` + name=${base%.*} + reads=$temp/$name + echo "# Processing file $base" + # check if decompresed file without barcode already exist + if [ ! -f ${reads}_noBarcode.fastq ];then + python $scriptDir/source/extractMolecularBarcode.py $f ${reads}_noBarcode.fastq ${reads}_barcodeDistribution.txt ${reads}_split_ligationDistribution.txt $outDir/lenReads.txt + fi +done + +############################################################################################################################## +############################################################################################################################## +######################################################## Download Data ################################################### +# Description: +# * download sequence and annotation data +# * generate info file for further handling in NETSeq pipeline +# Necessary for: +# 1) generating genome index +# 2) generating splicing intermediates +# Program: +# wget, curl +# + +echo "################################# Download data ... " + +sequences=$scriptDir/data/sequences +annotation=$scriptDir/data/annotation +info=$scriptDir/data/info.stdout + +mkdir -p $scriptDir/data +mkdir -p $sequences +mkdir -p $annotation + +# ckeck if files exist, if not downloadData.sh will download data +# REMARK: download of sequence and annotation data maybe fail if links are expired +bash $scriptDir/source/downloadData.sh $sequences $annotation $reference $scriptDir $info + +GENOME=`sed '1q;d' $info` +ANNOTATION=`sed '2q;d' $info` +PREFIX=`sed '3q;d' $info` + +############################################################################################################################## +############################################################################################################################## +######################################################## Generating Genome Index ############################################ +# Description: +# STAR run building required genome indices ... +# Required for mapping +# Program: +# STAR +# + +echo "################################# Build genome index ... " + +idxDir=$star_index/${reference} +mkdir -p $idxDir +# check if index alredy exist +if [ -z "$(ls -A $idxDir)" ]; then + $star --runThreadN 40 --runMode genomeGenerate --genomeDir $idxDir \ + --genomeFastaFiles $GENOME $sequences/rDNA.Genbank.U13369.1.fa \ + $sequences/28SrRNAPseudogenes.Genebank.U67616.1.fa $sequences/${reference}.tRNA.CCA.fa +fi + + +############################################################################################################################## +############################################################################################################################## +######################################################## Alignment ########################################################## +# Description: +# STAR run on reads not containing/containing molecular barcodes ... +# Required for further Reverse Transcriptase (RT) bias removal +# +# Program: +# STAR +# + +echo "################################# Aligning reads containing or not Molecular Barcode ... " + +mapping_dir=$temp/StarMapping +mkdir -p $mapping_dir + +for f in $FILES +do + base=`basename $f` + name=${base%.*} + mapping=$mapping_dir/$name + echo "# Mapping file "$base + mkdir -p $mapping + # check if mapping is alredy done + if [ -z "$(ls -A $mapping)" ];then + reads=$temp/$name + $star --genomeDir $idxDir --readFilesIn ${reads}_noBarcode.fastq --parametersFiles $pFile --runThreadN 20 --outFileNamePrefix $mapping/${name}_ + fi + + mkdir -p ${mapping}_withBC + # check if mapping is alredy done + if [ -z "$(ls -A ${mapping}_withBC)" ];then + $star --genomeDir $idxDir --readFilesIn $f --parametersFiles $pFile --runThreadN 20 --outFileNamePrefix ${mapping}_withBC/${name}_ + fi +done + +############################################################################################################################## +############################################################################################################################## +########################################## Reverse transcriptase mispriming event removal #################################### +# Description: +# to remove artefactual reads (arising from RT) +# that are mapping even when we leave the barcode sequence +# Program: +# samtools +# extractReadsWithMismatchesIn6FirstNct.py +# + +echo "################################# Removing reads with RT bias ... " + +bam_dir=$temp/BamPostProcessing +mkdir -p $bam_dir + +for f in $FILES +do + base=`basename $f` + name=${base%.*} + mapping=$mapping_dir/$name + echo "# Remove Reverse Transcription from file "$base + + Bam1=$mapping/${name}_Aligned.sortedByCoord.out.bam + Bam2=${mapping}_withBC/${name}_Aligned.sortedByCoord.out.bam + + bam=$bam_dir/$name + mkdir -p $bam + + if [ ! -f $bam/${name}_noRTbias.sorted_uniq.bam ];then # check if analysis already done + $scriptDir/source/rtRemoval.sh $bam $name $samtools $Bam1 $Bam2 $scriptDir + fi +done + +############################################################################################################################## +############################################################################################################################## +########################################## Extracting Pol2 coverage track #################################################### +# Description: +# Monitors whole read coverage and pol II position +# Generates bedgraph file +# The *_pos.bedGraph files contain info from gene in the pos strand, and inversely for the *_neg.bedGraph files +# Program: +# samtools +# customCoverage.py +# + + +echo "################################# Extracting Pol2 coverage track ... " + +cov_dir=$temp/CoverageTrack +mkdir -p $cov_dir + +for f in $FILES +do + base=`basename $f` + name=${base%.*} + echo "# Processing file "$base + cov=$cov_dir/$name + mkdir -p $cov + + uniqueBam=$bam_dir/$name/${name}_noRTbias.sorted_uniq.bam + + if [ ! -f $cov/${name}_uniq_noRTbias_stt_neg.bedGraph ] || [ ! -f $cov/${name}_uniq_noRTbias_stt_pos.bedGraph ];then # check if analysis already done + bash $scriptDir/source/coverage.sh $samtools $uniqueBam $scriptDir $cov/${name}_uniq_noRTbias_ + fi +done + +############################################################################################################################## +############################################################################################################################## +###################### Prepare bedGraph and bam files for the PCR duplication step ################################## +# Description: +# generates bedGraph and bam files for the PCR duplication step +# Program: +# joinBedGraph.r +# samtools +# bedtools +# + +echo "################################# Join bedGraph and bam files for the PCR duplication step ... " + +if [ ! -f $cov_dir/${sampleID}_uniq_noRTbias_stt_pos.bedGraph ] || [ ! -f $cov_dir/${sampleID}_uniq_noRTbias_stt_neg.bedGraph ] || [ ! -f $bam_dir/${sampleID}_noRTbias.sorted_uniq.bam.bai ];then + if [ "$numFiles" -gt "1" ];then + BAMFILES="" + BEDGRAPHPOS="" + BEDGRAPHNEG="" + for f in $FILES + do + base=`basename $f` + name=${base%.*} + BAMFILES="$BAMFILES $bam_dir/$name/${name}_noRTbias.sorted_uniq.bam" + $bedtools sort -i $cov_dir/$name/${name}_uniq_noRTbias_stt_pos.bedGraph > $cov_dir/$name/${name}_uniq_noRTbias_stt_pos.bedGraph.temp + $bedtools sort -i $cov_dir/$name/${name}_uniq_noRTbias_stt_neg.bedGraph > $cov_dir/$name/${name}_uniq_noRTbias_stt_neg.bedGraph.temp + cp $cov_dir/$name/${name}_uniq_noRTbias_stt_pos.bedGraph.temp $cov_dir/$name/${name}_uniq_noRTbias_stt_pos.bedGraph + cp $cov_dir/$name/${name}_uniq_noRTbias_stt_neg.bedGraph.temp $cov_dir/$name/${name}_uniq_noRTbias_stt_neg.bedGraph + BEDGRAPHPOS="$BEDGRAPHPOS $cov_dir/$name/${name}_uniq_noRTbias_stt_pos.bedGraph" + BEDGRAPHNEG="$BEDGRAPHNEG $cov_dir/$name/${name}_uniq_noRTbias_stt_neg.bedGraph" + done + + if [ ! -f $cov_dir/${sampleID}_uniq_noRTbias_stt_pos.bedGraph.temp ] || [ ! -f $cov_dir/${sampleID}_uniq_noRTbias_stt_neg.bedGraph.temp ];then + $bedtools unionbedg -i $BEDGRAPHPOS > $cov_dir/${sampleID}_uniq_noRTbias_stt_pos.bedGraph.temp + $bedtools unionbedg -i $BEDGRAPHNEG > $cov_dir/${sampleID}_uniq_noRTbias_stt_neg.bedGraph.temp + Rscript $scriptDir/source/joinBedGraph.r $cov_dir/${sampleID}_uniq_noRTbias_stt_pos.bedGraph.temp > $cov_dir/${sampleID}_uniq_noRTbias_stt_pos.bedGraph + Rscript $scriptDir/source/joinBedGraph.r $cov_dir/${sampleID}_uniq_noRTbias_stt_neg.bedGraph.temp > $cov_dir/${sampleID}_uniq_noRTbias_stt_neg.bedGraph + fi + + # join bam file for PCR duplicate analysis + if [ ! -f $bam_dir/${sampleID}_noRTbias.sorted_uniq.bam.bai ];then + $samtools merge -f $bam_dir/${sampleID}_noRTbias.uniq.bam $BAMFILES + $samtools sort $bam_dir/${sampleID}_noRTbias.uniq.bam $bam_dir/${sampleID}_noRTbias.sorted_uniq + $samtools index $bam_dir/${sampleID}_noRTbias.sorted_uniq.bam + fi + else + base=`basename $FILES` + echo $FILES + name=${base%.*} + echo "# Processing File "$base + + Rscript $scriptDir/source/joinBedGraph.r $cov_dir/$name/${name}_uniq_noRTbias_stt_pos.bedGraph > $cov_dir/${sampleID}_uniq_noRTbias_stt_pos.bedGraph + Rscript $scriptDir/source/joinBedGraph.r $cov_dir/$name/${name}_uniq_noRTbias_stt_neg.bedGraph > $cov_dir/${sampleID}_uniq_noRTbias_stt_neg.bedGraph + + cp $bam_dir/$name/${name}_noRTbias.sorted_uniq.bam $bam_dir/${sampleID}_noRTbias.sorted_uniq.bam + cp $bam_dir/$name/${name}_noRTbias.sorted_uniq.bam.bai $bam_dir/${sampleID}_noRTbias.sorted_uniq.bam.bai + fi +fi + +############################################################################################################################## +############################################################################################################################## +########################### Calculate Splicing Intermediates (SI) Coordinates ################################################ +# Description: +# extract splicing intermediate positions from annotation files +# Program: +# getSICoordinates1Based.r +# + +echo "################################# Calculate Splicing Intermediates (SI) Coordinates ... " + +SIfileReference=$annotation/SI_coordinates_${reference}_1based.txt + +if [ ! -f $SIfileReference ];then + bash Rscript $scriptDir/source/getSICoordinates1Based.r $ANNOTATION $SIfileReference +fi + +# +############################################################################################################################## +############################################################################################################################## +###################### Remove PCR duplicates and Splicing intermediates from Bam and BedGraph files ########################## +# Description: +# to remove artefactual reads (arising from PCR duplicates) +# that have the same genomic position and molecular barcode than another read +# Program: +# samtools +# removePCRduplicateStringent.py : a python script that extract reads arising from PCR duplication events +# + +echo "################################# Removing PCR duplicates and Splicing intermediates from Bam and BedGraph files ... " + +echo "# Processing ${sampleID}" + +if [ ! -f $cov_dir/${sampleID}_uniq_noRTbias_noPCRdup_noSI_stt_neg.bedGraph ] || [ ! -f $cov_dir/${sampleID}_uniq_noRTbias_noPCRdup_noSI_stt_pos.bedGraph ];then + bash $scriptDir/source/removePCRduplicates.sh $cov_dir $bam_dir $scriptDir $SIfileReference $sampleID $samtools $idxDir $bedtools +fi + +############################################################################################################################## +############################################################################################################################## +################### Split reference and filter chromosomal pol2 coverage ######################### + +echo "################################# Split reference and filter chromosomal pol2 coverage ... " + +result_dir=$outDir/Bedgraph +raw_dir=$result_dir/RawBedgraph + +mkdir -p $result_dir +mkdir -p $raw_dir + +grep -P "^chr[0-9,X,Y]*\t" $cov_dir/${sampleID}_uniq_noRTbias_noPCRdup_noSI_stt_pos.bedGraph | $bedtools sort -i - > $result_dir/${sampleID}_uniq_noRTbias_noPCRdup_noSI_${reference}_stt_pos.bedGraph +grep -P "^chr[0-9,X,Y]*\t" $cov_dir/${sampleID}_uniq_noRTbias_noPCRdup_noSI_stt_neg.bedGraph | $bedtools sort -i - > $result_dir/${sampleID}_uniq_noRTbias_noPCRdup_noSI_${reference}_stt_neg.bedGraph +cp $cov_dir/${sampleID}_uniq_noRTbias_noPCRdup_noSI_stt_neg.bedGraph $raw_dir/${sampleID}_uniq_noRTbias_noPCRdup_noSI_stt_neg.bedGraph +cp $cov_dir/${sampleID}_uniq_noRTbias_noPCRdup_noSI_stt_pos.bedGraph $raw_dir/${sampleID}_uniq_noRTbias_noPCRdup_noSI_stt_pos.bedGraph + diff --git a/README.md b/README.md new file mode 100644 index 0000000..93b2029 --- /dev/null +++ b/README.md @@ -0,0 +1,28 @@ +# NET-seq Pipeline + +## REQUIREMENTS +- bedtools +- curl +- python 2.7, moduls: + - HTSeq + - pysam +- R +- samtools +- STAR (2.5.3a) +- wget +- bzip2 or gzip if file is compressed, respectively + +## STEPS +1. Molecular Barcode Extraction +2. Reads mapping (both sets: with and without barcodes) +3. Reverse transcriptase mispriming events removal +4. Recording of the 5' end position (3' end position of the original nascent RNA) +5. PCR duplicates removal +6. Removal of sequencing reads due to splicing intermediates + +## USAGE +1. Fill in the parameter files: parameter_dependencies.in and parameter_template.in +2. Make the file runNetSeqPipe.netPipe executable + chmod -x runNetSeqPipe.netPipe +3. Run the runNetSeqPipe.netPipe file + bash runNetSeqPipe.netPipe diff --git a/parameter_STAR.in b/parameter_STAR.in new file mode 100644 index 0000000..ff3f71b --- /dev/null +++ b/parameter_STAR.in @@ -0,0 +1,480 @@ +#### all parameteres default are found at /groups/churchman/jd187/Program/STAR/STAR-STAR_2.4.0d/parametersDefault file ############## + +### versions +#versionSTAR 020201 +# int>0: STAR release numeric ID. Please do not change this value! +#versionGenome 020101 020200 +# int>0: oldest value of the Genome version compatible with this STAR release. Please do not change this value! + +### PARAMETERS +#parametersFiles - +# string: name of a user-defined parameters file, "-": none. Can only be defined on the command line. +# provided at the command line level + +### RUN PARAMETERS +#runMode alignReads +# string: type of the run: alignReads ... map reads +# genomeGenerate ... generate genome files +# inputAlignmentsFromBAM ... input alignments from BAM. Presently only works with --outWigType to generate wiggle files. + +#runThreadN 1 +# int: number of threads to run STAR +# provided at the command line level in case we need to tweek the memory settings etc + +### GENOME PARAMETERS +#genomeDir ./GenomeDir/ +# string: path to the directory where genome files are stored (if runMode!=generateGenome) or will be generated (if runMode==generateGenome) +# provided at the command line level, in case we wanna provide a diff genome (f.ex. built without Jct) without changing the param file + +#genomeLoad NoSharedMemory +# mode of shared memory usage for the genome files +# string: LoadAndKeep ... load genome into shared and keep it in memory after run +# LoadAndRemove ... load genome into shared but remove it after run +# LoadAndExit ... load genome into shared memory and exit, keeping the genome in memory for future runs +# Remove ... do not map anything, just remove loaded genome from memory +# NoSharedMemory ... do not use shared memory, each job will have its own private copy of the genome + +### GENOME GENERATION PARAMETERS +#genomeFastaFiles - +# fasta files with genomic sequence for genome files generation, only used if runMode==genomeGenerate +# string(s): path(s) to the files from the working directory (separated by spaces) + +#genomeChrBinNbits 18 +# int: =log2(chrBin), where chrBin is the size of the bins for genome storage: each chromosome will occupy an integer number of bins + +#genomeSAindexNbases 14 +# int: length (bases) of the SA pre-indexing string. Typically between 10 and 15. Longer strings will use much more memory, but allow faster searches. + +#genomeSAsparseD 1 +# int>0: suffux array sparsity, i.e. distance between indices: use bigger numbers to decrease needed RAM at the cost of mapping speed reduction + + +### INPUT FROM BAM +#inputBAMfile - +# string: path to BAM input file, to be used with --runMode inputAlignmentsFromBAM + + +### READ PARAMETERS +#readFilesIn Read1 Read2 +# string(s): paths to files that contain input read1 (and, if needed, read2) +# given at the command line level as the read names is gonna depend on the sample name + +readFilesCommand cat +# string(s): cmd line to execute for each of the input file. This command should generate FASTA or FASTQ text and send it to stdout +# For example: zcat - to uncompress .gz files, bzcat - to uncompress .bz2 files, etc. + +#readMapNumber -1 +# int: number of reads to map from the beginning of the file +# -1: map all reads + +#readMatesLengthsIn NotEqual +# string: Equal/NotEqual - lengths of names,sequences,qualities for both mates are the same / not the same. NotEqual is safe in all situations. + +#clip3pNbases 0 +# int(s): number(s) of bases to clip from 3p of each mate. If one value is given, it will be assumed the same for both mates. +# +#clip5pNbases 0 +# int(s): number(s) of bases to clip from 5p of each mate. If one value is given, it will be assumed the same for both mates. + +clip3pAdapterSeq ATCTCGTATGCCGTCTTCTGCTTG +# string(s): adapter sequences to clip from 3p of each mate. If one value is given, it will be assumed the same for both mates. + +clip3pAdapterMMp 0.21 +# double(s): max proportion of mismatches for 3p adpater clipping for each mate. If one value is given, it will be assumed the same for both mates. +# + +clip3pAfterAdapterNbases 1 +# int(s): number of bases to clip from 3p of each mate after the adapter clipping. If one value is given, it will be assumed the same for both mates. + + + +### LIMITS +#limitGenomeGenerateRAM 31000000000 +# int>0: maximum available RAM (bytes) for genome generation + +limitIObufferSize 200000000 +# int>0: max available buffers size (bytes) for input/output, per thread + +#limitOutSAMoneReadBytes 100000 +#int>0: max size of the SAM record for one read. Recommended value: >(2*(LengthMate1+LengthMate2+100)*outFilterMultimapNmax) + +#limitOutSJoneRead 1000 +# int>0: max number of junctions for one read (including all multi-mappers) + +#limitOutSJcollapsed 1000000 +# int>0: max number of collapsed junctions + +limitBAMsortRAM 64000000000 +# int>=0: maximum available RAM for sorting BAM. If =0, it will be set to the genome index size + + +### OUTPUT: GENERAL +#outFileNamePrefix ./ +# string: output files name prefix (including full or relative path). Can only be defined on the command line. +# we define it on the command line as it is depending on the sample name + +#outTmpDir - +# string: path to a directory that will be used as temporary by STAR. All contents of this directory will be removed! +# - the temp directory will default to outFileNamePrefix_tmp + +#outStd Log +# string: which output will be directed to stdout (standard out) +# Log : log messages +# SAM : alignments in .sam format (which normally are output to Aligned.out.sam file), normal std output will go into Log.std.out +# BAM_Unsorted : alignments in BAM format, unsorted. Requires --outSAMtype BAM Unsorted +# BAM_SortedByCoordinate : alignments in BAM format, unsorted. Requires --outSAMtype BAM SortedByCoordinate +# BAM_Quant : alignments to transcriptome in BAM format, unsorted. Requires --quantMode TranscriptomeSAM + +outReadsUnmapped Fastx +# string: output of unmapped reads (besides SAM) +# None : no output +# Fastx : output in separate fasta/fastq files, Unmapped.out.mate1/2 + +#outQSconversionAdd 0 +# int: add this number to the quality score (e.g. to convert from Illumina to Sanger, use -31) + + +### OUTPUT: SAM/BAM +outSAMtype BAM SortedByCoordinate +# strings: type of SAM/BAM output +# 1st word: +# BAM : output BAM without sorting +# SAM : output SAM without sorting +# None : no SAM/BAM output +# 2nd, 3rd ... +# Unsorted : standard unsorted +# SortedByCoordinate : sorted by coordinate + +#outSAMmode Full +# string: mode of SAM output None : no SAM output +# Full : full SAM output +# NoQS : full SAM but without quality scores + +#outSAMstrandField None +# string: Cufflinks-like strand field flag None : not used +# intronMotif : strand derived from the intron motif. Reads with inconsisent and/or non-canonical introns are filtered out. + +#If you have stranded RNA-seq data, you do not need to use any specific STAR options. Instead, you need +#to run Cufflinks with the library option --library-type options. For example, cufflinks <…> - +#library-type fr-firststrand should be used for the “standard” dUTP protocol. This option has +#to be used only for Cufflinks runs and not for STAR runs. + +outSAMattributes All +# string: a string of desired SAM attributes, in the order desired for the output SAM +# NH HI AS nM NM MD jM jI XS +# Standard : NH HI AS nM +# All : NH HI AS nM NM MD jM jI +# None + + +#outSAMunmapped None +# string: output of unmapped reads in the SAM format +# None : no output +# Within : output unmapped reads within the main SAM file (i.e. Aligned.out.sam) + +#outSAMorder Paired +# string: type of sorting for the SAM output +# Paired: one mate after the other for all paired alignments +# PairedKeepInputOrder: one mate after the other for all paired alignments, the order is kept the same as in the input FASTQ files + +#outSAMprimaryFlag OneBestScore +# string: which alignments are considered primary - all others will be marked with 0x100 bit in the FLAG +# OneBestScore : only one alignment with the best score is primary +# AllBestScore : all alignments with the best score are primary + +#outSAMreadID Standard +# string: read ID record type +# Standard : first word (until space) from the FASTx read ID line, removing /1,/2 from the end +# Number : read number (index) in the FASTx file + +#outSAMmapqUnique 255 +# int: 0 to 255: the MAPQ value for unique mappers + +#outSAMattrRGline - +# string(s): SAM/BAM read group line. The first word contains the read group identifier and must start with "ID:", +# e.g. --outSAMattrRGline ID:xxx CN:yy "DS:z z z". +# xxx will be added as RG tag to each output alignment. Any spaces in the tag values have to be double quoted. +# Comma separated RG lines correspons to different (comma separated) input files in --readFilesIn. + +#outSAMheaderHD - +# strings: @HD (header) line of the SAM header + +#outSAMheaderPG - +# strings: extra @PG (software) line of the SAM header (in addition to STAR) +# +#outSAMheaderCommentFile - +# string: path to the file with @CO (comment) lines of the SAM header +# +#outBAMcompression -1 +# int: -1 ... 10 BAM compression level, -1=default compression, 0=no compression, 10=maximum compression +# +#bamRemoveDuplicatesType - +# string: remove duplicates from BAM file, for now only works with sorted BAM feeded with inputBAMfile +# UniqueIdentical : removes all multimappers, and duplicate unique mappers. The coordinates, FLAG, CIGAR must be identical +# +#bamRemoveDuplicatesMate2basesN 0 +# int>0: number of bases from the 5' of mate 2 to use in collapsing (e.g. for RAMPAGE) +# + +### OUTPUT WIGGLE +#outWigType None +# string(s): type of signal output, e.g. "bedGraph" OR "bedGraph read1_5p" +# 1st word: +# None +# bedGraph +# wiggle +# 2nd word: +# read1_5p : signal from only 5' of the 1st read, useful for CAGE/RAMPAGE etc +# read2 : signal from only 2nd read +# +#outWigStrand Stranded +# string: strandedness of wigglle (bedGraph) output +# Stranded: separate strands, str1 and str2 +# Unstranded: collapsed strands +# +#outWigReferencesPrefix - +# string: prefix matching reference names to include in the output wiggle file, e.g. "chr.*" +# default: "-" - include all references +# +#outWigNorm RPM +# string: type of normalization for the signal +# RPM : reads per million of mapped reads +# None : no normalization, "raw" counts +# + + +### OUTPUT FILTERING +#outFilterType Normal +# string: type of filtering +# Normal: standard filtering using only current alignment +# BySJout: keep only those reads that contain junctions that passed filtering into SJ.out.tab + +#outFilterMultimapScoreRange 1 +# int: the score range below the maximum score for multimapping alignments. +# as -10*log10(1-1/2) = MapQual for read mapping twice = 3.013, 3.013 -range 3 +# keep all multi aligned reads (at least up to 100 times as -10*log10(1-1/100) = +# 0.04364805)... I think... Not sure I got it right though... + +outFilterMultimapNmax 2 +# int: read alignments will be output only if the read maps fewer than this value, otherwise no alignments will be output + +#outFilterMismatchNmax 10 +# int: alignment will be output only if it has fewer mismatches + +#outFilterMismatchNoverLmax 0.3 +# int: alignment will be output only if its ratio of mismatches to *mapped* length is less than this value; to make it consistent with parmater used with TH + +#outFilterMismatchNoverReadLmax 1 +# int: alignment will be output only if its ratio of mismatches to *read* length is less than this value + +#outFilterScoreMin 0 +# int: alignment will be output only if its score is higher than this value + +#outFilterScoreMinOverLread 0.66 +# float: outFilterScoreMin normalized to read length (sum of mates' lengths for paired-end reads) + +#outFilterMatchNmin 0 +# int: alignment will be output only if the number of matched bases is higher than this value; to make it consistent with parmater used with TH + +#outFilterMatchNminOverLread 0.66 +# float: outFilterMatchNmin normalized to read length (sum of mates' lengths for paired-end reads) +# I guess it has something to do with soft clipping ? I don't know, so I let the default value... + +#outFilterIntronMotifs None +# string: filter alignment using their motifs +# None : no filtering +# RemoveNoncanonical : filter out alignments that contain non-canonical junctions +# RemoveNoncanonicalUnannotated : filter out alignments that contain non-canonical unannotated junctions when using annotated +# splice junctions database. The annotated non-canonical junctions will be kept. + +### OUTPUT FILTERING: SPLICE JUNCTIONS +#outSJfilterReads All +# string: which reads to consider for collapsed splice junctions output +# All: all reads, unique- and multi-mappers +# Unique: uniquely mapping reads only + +outSJfilterOverhangMin 3 1 1 1 +# 4*int: minimum overhang length for splice junctions on both sides for: (1) non-canonical motifs, (2) GT/AG motif, +# (3) GC/AG motif, (4) AT/AC motif. -1 means no output for that motif does not apply to annotated junctions + +#outSJfilterCountUniqueMin 3 1 1 1 +# 4*int: minimum uniquely mapping read count per junction for: (1) non-canonical motifs, (2) GT/AG motif, (3) GC/AG motif, +# (4) AT/AC motif. -1 means no output for that motif. Junctions are output if one of outSJfilterCountUniqueMin OR outSJfilterCountTotalMin conditions are satisfied does not apply to annotated junctions + +#outSJfilterCountTotalMin 3 1 1 1 +# 4*int: minimum total (multi-mapping+unique) read count per junction for: (1) non-canonical motifs, (2) GT/AG motif, (3) GC/AG motif, (4) AT/AC motif. -1 means no output for that motif Junctions are output if one of outSJfilterCountUniqueMin OR outSJfilterCountTotalMin conditions are satisfied does not apply to annotated junctions + +outSJfilterDistToOtherSJmin 0 0 0 0 +# 4*int>=0: minimum allowed distance to other junctions' donor/acceptor +# does not apply to annotated junctions + +#outSJfilterIntronMaxVsReadN 50000 100000 200000 +# N*int>=0: maximum gap allowed for junctions supported by 1,2,3...N reads +# i.e. by default junctions supported by 1 read can have gaps <=50000b, by 2 reads: <=100000b, by 3 reads: <=200000. by >=4 +# reads any gap <=alignIntronMax does not apply to annotated junctions + +### SCORING +#scoreGap 0 +# gap open penalty + +#scoreGapNoncan -8 +# non-canonical gap open penalty (in addition to scoreGap) + +#scoreGapGCAG -4 +# GCAG gap open penalty (in addition to scoreGap) + +#scoreGapATAC -8 +# ATAC gap open penalty (in addition to scoreGap) + +#scoreGenomicLengthLog2scale -0.25 +# extra score logarithmically scaled with genomic length of the alignment: -scoreGenomicLengthLog2scale*log2(genomicLength) + +#scoreDelOpen -2 +# deletion open penalty + +#scoreDelBase -2 +# deletion extension penalty per base (in addition to scoreDelOpen) + +#scoreInsOpen -2 +# insertion open penalty + +#scoreInsBase -2 +# insertion extension penalty per base (in addition to scoreInsOpen) + +#scoreStitchSJshift 1 +# maximum score reduction while searching for SJ boundaries inthe stitching step + + +### ALIGNMENT and SEED PARAMETERS +#seedSearchStartLmax 50 +# int>0: defines the search start point through the read - the read is split into pieces no longer than this value + +#seedSearchStartLmaxOverLread 1.0 +# float: seedSearchStartLmax normalized to read length (sum of mates' lengths for paired-end reads) + +#seedSearchLmax 0 +# int>=0: defines the maximum length of the seeds, if =0 max seed lengthis infinite + +#seedMultimapNmax 10000 +# int>0: only pieces that map fewer than this value are utilized in the stitching procedure + +#seedPerReadNmax 1000 +# int>0: max number of seeds per read + +#seedPerWindowNmax 50 +# int>0: max number of seeds per window + +#seedNoneLociPerWindow 10 +# int>0: max number of one seed loci per window + +alignIntronMin 11 +# minimum intron size: genomic gap is considered intron if its length>=alignIntronMin, otherwise it is considered Deletion +# modified specifically for tRNA introns + +#alignIntronMax 0 +# maximum intron size, if 0, max intron size will be determined by (2^winBinNbits)*winAnchorDistNbins + +#alignMatesGapMax 0 +# maximum gap between two mates, if 0, max intron gap will be determined by (2^winBinNbits)*winAnchorDistNbins + +#alignSJoverhangMin 5 +# int>0: minimum overhang (i.e. block size) for spliced alignments + +#alignSJDBoverhangMin 3 +# int>0: minimum overhang (i.e. block size) for annotated (sjdb) spliced alignments + +#alignSplicedMateMapLmin 0 +# int>0: minimum mapped length for a read mate that is spliced + +#alignSplicedMateMapLminOverLmate 0.66 +# float>0: alignSplicedMateMapLmin normalized to mate length + +#alignWindowsPerReadNmax 10000 +# int>0: max number of windows per read + +#alignTranscriptsPerWindowNmax 100 +# int>0: max number of transcripts per window + +#alignTranscriptsPerReadNmax 10000 +# max number of different alignments per read to consider + +alignEndsType EndToEnd +# string: type of read ends alignment +# Local : standard local alignment with soft-clipping allowed +# EndToEnd: force end-to-end read alignment, do not soft-clip + +### SPLICE JUNCTIONS DATABASE PARAMETERS +#sjdbFileChrStartEnd - +# string: path to the file with genomic coordinates (chr start end strand) for the introns +# we will provide it in the command line directly, as the path is gonna depend on the sample +# NO: The annotations can be supplied in the form of splice junctions’ loci or GTF (or GFF3) file AND ipmortantly The annotations have to be supplied at the genome/suffix array generation step + +#sjdbGTFfile - +# string: path to the GTF file with annotations +# we will provide it in the command line directly + +#sjdbGTFchrPrefix - +# string: prefix for chromosome names in a GTF file (e.g. 'chr' for using ENSMEBL annotations with UCSC geneomes) + +#sjdbGTFfeatureExon exon +# string: feature type in GTF file to be used as exons for building transcripts + +#sjdbGTFtagExonParentTranscript transcript_id +# string: tag name to be used as exons' parents for building transcripts + +#sjdbOverhang 0 +# int>=0: length of the donor/acceptor sequence on each side of the junctions, ideally = (mate_length - 1) +# if =0, splice junction database is not used + +#sjdbScore 2 +# int: extra alignment score for alignmets that cross database junctions + +### WINDOWS, ANCHORS, BINNING +#winAnchorMultimapNmax 50 +# int>0: max number of loci anchors are allowed to map to + +#winBinNbits 16 +# int>0: =log2(winBin), where winBin is the size of the bin for the windows/clustering,each window will occupy an integer nb of bins. + +#winAnchorDistNbins 9 +# int>0: max number of bins between two anchors that allows aggregation of anchors into one window + +#winFlankNbins 4 +# int>0: log2(winFlank), where win Flank is the size of the left and right flanking regions for each window + + +### CHIMERIC ALIGNMENTS +#chimSegmentMin 0 +## int>0: minimum length of chimeric segment length, if ==0, no chimeric output + +#chimScoreMin 0 +## int>0: minimum total (summed) score of the chimeric segments + +#chimScoreDropMax 20 +## int>0: max drop (difference) of chimeric score (the sum of scores of all chimeric segements) from the read length + +#chimScoreSeparation 10 +## int>0: minimum difference (separation) between the best chimeric score and the next one + +#chimScoreJunctionNonGTAG -1 +## int: penalty for a non-GT/AG chimeric junction + +#chimJunctionOverhangMin 20 +## int>0: minimum overhang for a chimeric junction + + +### QUANTIFICATION OF ANNOTATIONS +#quantMode - +# string(s): types of qunatification requested +# - : None +# TranscriptomeSAM : output SAM/BAM alignments to transcriptome into a separate file + + +#### 2-PASS +#twopass1readsN 0 +# int>0: number of reads to process for the 1st step. 0 : 1-step only, no 2nd pass; use very large number to map all reads in the first step +# +#twopassSJlimit 1000000 +# int>=0: maximum number of junction detected in the 1st step +# + diff --git a/parameter_dependencies.in b/parameter_dependencies.in new file mode 100644 index 0000000..42dd459 --- /dev/null +++ b/parameter_dependencies.in @@ -0,0 +1,25 @@ +# starIndex: provide the full path to the STAR index directory +# NetSeqPipeline will: +# 1) create a large index files for the selected reference genome +# 2) if index has been previously created: make sure it is available in the given starIndex location +# in a subdirectory with name of the reference +# example: for a reference genome e.g. GRCh38, make sure the file is located in /pathToStarIndex/GRCh38/ +# IMPORTANT: make sure you have write permission +# example: starIndex=/pathToStarIndex +starIndex= + +# parametersSTAR: provide the full path to the STAR parameter directory + name of the parameter file +# example: parametersSTAR=/pathToParameters/parameter_STAR.in +parametersSTAR= + +# starApp: provide the full path to the STAR application directory + name of the application executable +# example: starApp=/pathToStarApp/STAR +starApp= + +# samtools: provide the full path to the samtools application directory + name of the application executable +# example: samtools=/PathToSamtools/samtools +samtools= + +# bedtools: provide the full path to the bedtools application directory + name of the application executable +# example: bedtools=/PathToBedtools/bedtools +bedtools= diff --git a/parameter_template.in b/parameter_template.in new file mode 100644 index 0000000..a2b64cc --- /dev/null +++ b/parameter_template.in @@ -0,0 +1,31 @@ +# sampleID: necessary for naming the output files +# example: sampleID=AB01 +sampleID= + +# condition: necessary for naming the output files +# example: condition=treated +condition= + +# NetSeqApp: provide full path to the NeqSeqPipeline directory + name of the application executable +# example: NetSeqApp=/pathToFolderContainingPipeline/NetSeqPipeline +NetSeqApp= + +# pathToOutputFiles: provide the path to the requested output directory +# IMPORTANT: make sure you have write permission +# example: pathToOutputFiles=/outputDirectory +pathToOutputFiles= + +# files: provide full path + name of the fastq file(s) +# IMPORTANT: make sure you have read permission +# example: files=/pathToInputDirectory/samp1e.fq +files= + +# compression: provide compression method of the input file +# IMPORTANT REMARK: choose from bz2, gz, none +# example: compression=none +compression= + +# reference: provide reference genome for the mapping +# example: reference=GRCh38 +reference= + diff --git a/runNetSeqPipe.netPipe b/runNetSeqPipe.netPipe new file mode 100644 index 0000000..72fd1e7 --- /dev/null +++ b/runNetSeqPipe.netPipe @@ -0,0 +1,13 @@ +#!/usr/bin/env bash + +# version v2: Annkatrin Bressin, Martyna Gajos +# script to run NetSeqPipeline +# provide parameters for the analysis in the files: parameter_template.in and parameter_dependencies.in + +source parameter_template.in +source parameter_dependencies.in + +mkdir -p $pathToOutputFiles + +bash $NetSeqApp -o $pathToOutputFiles -id $sampleID -f $files -c $compression -r $reference -si $starIndex -sa $starApp -st $samtools -bt $bedtools -p $parametersSTAR + diff --git a/source/appendCCAinFasta.r b/source/appendCCAinFasta.r new file mode 100755 index 0000000..9b892d7 --- /dev/null +++ b/source/appendCCAinFasta.r @@ -0,0 +1,18 @@ +suppressPackageStartupMessages(library(Biostrings)) + +args <- commandArgs(trailingOnly = TRUE) + +file=args[1] +outfile=args[2] + +data=readDNAStringSet(file) + +seq=paste(data) +names=names(data) + +seq=paste(seq,"CCA",sep="") + +data2=DNAStringSet(x=seq) +names(data2) <- names + +writeXStringSet(data2,filepath = outfile, format="fasta") diff --git a/source/coverage.sh b/source/coverage.sh new file mode 100755 index 0000000..379e577 --- /dev/null +++ b/source/coverage.sh @@ -0,0 +1,17 @@ +samtools=$1 +Bam=$2 +scriptDir=$3 +outName=$4 + +$samtools view -@20 $Bam | sed 's/\tjM.*$//' | python $scriptDir/source/customCoverage.py $outName + +tail -n +2 ${outName}stt_neg.bedGraph > ${outName}stt_neg.bedGraph.temp +tail -n +2 ${outName}stt_pos.bedGraph > ${outName}stt_pos.bedGraph.temp + +cp ${outName}stt_neg.bedGraph.temp ${outName}stt_neg.bedGraph +cp ${outName}stt_pos.bedGraph.temp ${outName}stt_pos.bedGraph + +# jM:B:c,M1,M2,... +# intron motifs for all junctions (i.e. N in CIGAR): +# 0: non-canonical; 1:GT/AG, 2:CT/AC, 3:GC/AG, 4:CT/GC, 5:AT/AC, 6:GT/AT. +# If splice junctions database is used, and a junction is annotated, 20 is added to its motif value diff --git a/source/customCoverage.py b/source/customCoverage.py new file mode 100755 index 0000000..099be4d --- /dev/null +++ b/source/customCoverage.py @@ -0,0 +1,32 @@ +#! /usr/bin/prun python-2 python + +""" +date : 17 april 2013 + +Author : Julia di Iulio + +Script used to monitor coverage (either the whole read, the start of the read (which represents PolII position), +or the end of the read) taking into account the number of times the read mapped (given by the NH:i: tag in the +alignment file): by giving a weight inversely proportional to the number of times the read mapped. +This script is highly inspired from http://www-huber.embl.de/users/anders/HTSeq/doc/tour.html + +use: python customCoverage.py stdin (alignment file in sam format) Experiment Name [1] + +""" + + +import sys, HTSeq + +stt = HTSeq.GenomicArray( "auto", stranded=True, typecode='i' ) #stt stands for start of reads + +alignment_file = HTSeq.SAM_Reader(sys.stdin) + +for algt in alignment_file: + if algt.aligned: + # extract the start position of the read as a genomic interval + stt_iv = HTSeq.GenomicInterval( algt.iv.chrom , algt.iv.start_d , algt.iv.start_d + 1, algt.iv.strand ) + # adds weighted coverage at the position corresponding to the start of the read + stt[ stt_iv ] += 1 + +stt.write_bedgraph_file( sys.argv[1] + "stt_neg.bedGraph", "+" ) # switch strands because 5' start of the reads correspond to the 3' end of the nascent RNA =^ pol2 position ??? +stt.write_bedgraph_file( sys.argv[1] + "stt_pos.bedGraph", "-" ) diff --git a/source/downloadData.sh b/source/downloadData.sh new file mode 100755 index 0000000..c4a0481 --- /dev/null +++ b/source/downloadData.sh @@ -0,0 +1,80 @@ +sequences=$1 +annotation=$2 +reference=$3 +scriptDir=$4 +info=$5 + +case $reference in + + GRCh37) + if [ ! -f $sequences/GRCh37.primary_assembly.genome.fa ];then + wget -O $sequences/GRCh37.primary_assembly.genome.fa.gz ftp://ftp.sanger.ac.uk/pub/gencode/Gencode_human/release_26/GRCh37_mapping/GRCh37.primary_assembly.genome.fa.gz + gunzip $sequences/GRCh37.primary_assembly.genome.fa.gz + fi + + if [ ! -f $annotation/gencode.GRCh37.primary_assembly.annotation.gtf ];then + wget -O $annotation/gencode.GRCh37.primary_assembly.annotation.gtf.gz \ + ftp://ftp.sanger.ac.uk/pub/gencode/Gencode_human/release_26/GRCh37_mapping/gencode.v26lift37.annotation.gtf.gz + gunzip $annotation/gencode.GRCh37.primary_assembly.annotation.gtf.gz + fi + + if [ ! -f $sequences/GRCh37.tRNA.fa ];then + curl "http://gtrnadb.ucsc.edu/genomes/eukaryota/Hsapi19/hg19-tRNAs.fa" > $sequences/GRCh37.tRNA.fa + fi + + if [ ! -f $sequences/GRCh37.tRNA.CCA.fa ];then + Rscript $scriptDir/source/appendCCAinFasta.r $sequences/GRCh37.tRNA.fa $sequences/GRCh37.tRNA.CCA.fa + fi + + GENOME=$sequences/GRCh37.primary_assembly.genome.fa + ANNOTATION=$annotation/gencode.GRCh37.primary_assembly.annotation.gtf + + if [ ! -f $sequences/rDNA.Genbank.U13369.1.fa ] ;then + curl "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nucleotide&id=U13369.1&rettype=fasta" > $sequences/rDNA.Genbank.U13369.1.fa + fi + + if [ ! -f $sequences/28SrRNAPseudogenes.Genebank.U67616.1.fa ] ;then + curl "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nucleotide&id=U67616.1&rettype=fasta" > $sequences/28SrRNAPseudogenes.Genebank.U67616.1.fa + fi + ;; + + GRCh38) + if [ ! -f $sequences/GRCh38.all.genome.fa ];then + wget -O $sequences/GRCh38.all.genome.fa.gz ftp://ftp.sanger.ac.uk/pub/gencode/Gencode_human/release_26/GRCh38.p10.genome.fa.gz + gunzip $sequences/GRCh38.all.genome.fa.gz + fi + + if [ ! -f $annotation/gencode.GRCh38.chr_patch_hapl_scaff.annotation.gtf ];then + wget -O $annotation/gencode.GRCh38.chr_patch_hapl_scaff.annotation.gtf.gz \ + ftp://ftp.sanger.ac.uk/pub/gencode/Gencode_human/release_26/gencode.v26.chr_patch_hapl_scaff.annotation.gtf.gz + gunzip $annotation/gencode.GRCh38.chr_patch_hapl_scaff.annotation.gtf.gz + fi + + if [ ! -f $sequences/GRCh38.tRNA.fa ];then + curl "http://gtrnadb.ucsc.edu/genomes/eukaryota/Hsapi38/hg38-tRNAs.fa" > $sequences/GRCh38.tRNA.fa + fi + + if [ ! -f $sequences/GRCh38.tRNA.CCA.fa ];then + Rscript $scriptDir/source/appendCCAinFasta.r $sequences/GRCh38.tRNA.fa $sequences/GRCh38.tRNA.CCA.fa + fi + + GENOME=$sequences/GRCh38.all.genome.fa + ANNOTATION=$annotation/gencode.GRCh38.chr_patch_hapl_scaff.annotation.gtf + + if [ ! -f $sequences/rDNA.Genbank.U13369.1.fa ] ;then + curl "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nucleotide&id=U13369.1&rettype=fasta" > $sequences/rDNA.Genbank.U13369.1.fa + fi + + if [ ! -f $sequences/28SrRNAPseudogenes.Genebank.U67616.1.fa ] ;then + curl "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nucleotide&id=U67616.1&rettype=fasta" > $sequences/28SrRNAPseudogenes.Genebank.U67616.1.fa + fi + ;; + + *) + status="### $reference isn't a valid input argument" + echo $status + exit 0 + ;; +esac + +echo -e "$GENOME\n$ANNOTATION\n$PREFIX" > $info diff --git a/source/extractMolecularBarcode.py b/source/extractMolecularBarcode.py new file mode 100755 index 0000000..56397a2 --- /dev/null +++ b/source/extractMolecularBarcode.py @@ -0,0 +1,87 @@ +#!/usr/bin/env python + +""" +Date : April 24th 2014 + +Author : Julia di Iulio +Modified: Annkatrin Bressin + +script used to extract molecular barcodes from the reads (fastq format) but keeping them associated to the read. +Running the script will output : + a new fastq file (with the molecular barcode removed from the read sequence, but the information is kept in the + header of the read) + a file containing the molecular barcode counts + a file containing the ligation hexamer counts + +The two files containing the counts can be further used to investigate the distribution of the molecular barcodes and/or +the ligation hexamers (script not provided). + +use : python extractMolecularBarcode.py inFastq outFastq outBarcodes outLigation + +""" + +import sys, itertools + +iFastq=open(sys.argv[1], 'r') +oFastq=open(sys.argv[2], 'w') +oBarcode=open(sys.argv[3], 'w') +oLigation=open(sys.argv[4], 'w') +oLen=open(sys.argv[5], 'w') + + +dicoBarcode={} # creates a dictionnary that will contain Molecular Barcode counts (see below) +dicoLigation={} # creates a dictionnary that will contain ligation hexamer counts (see below) +nct='ACTGN' # nct stands for nucleotide + +# fill two dictionaries with the keys being all possible molecular barcodes/ligation hexamers made with the letters 'A', 'C', 'G', +# 'T' and 'N', and the values being set to 0 for now; the values will be incremented every time a molecular barcode/ligation hexamers +# is identified in a read +for barcode in list(itertools.product(nct, repeat=6)): + dicoBarcode["".join(barcode)] = 0 + dicoLigation["".join(barcode)] = 0 + + +header= iFastq.readline().rstrip() # reads the first line of the fastq file (which corresponds to the header of the first read) +while header != '': + totseq = iFastq.readline() # reads the sequence of the first read (the first 6 nucleotides being the molecular barcode (MBC)) + plus = iFastq.readline() # reads the 3rd line of a read in a fastq format + totqual = iFastq.readline() # reads the quality of the read (also containing the PHRED quality score of the MBC) + barcode = totseq[0:6] # assigns the 6 first nucleotide (nt) of the read sequence to the variable "barcode" + ligation = totseq[3:9] # assigns the last 3 nts of the MBC and the first 3 nts of the RNA fragment to the variable "ligation" + seq = totseq[6:] # assigns the RNA fragment sequence to the variable "seq" + qual = totqual[6:] # assigns the RNA fragment PHRED quality score to the variable "qual" + oFastq.write(header.split(" ")[0]+'_MolecularBarcode:'+barcode+' '+header.split(" ")[1]+'\n') + # writes the header of the reads (containing the MBC info) to the output fastq file + oFastq.write(seq) # writes the RNA fragment sequence to the output fastq file + oFastq.write(plus) # writes the 3rd line of a read in a fastq format to the output fastq file + oFastq.write(qual) # writes the RNA fragment PHRED quality score to the output fastq file + header= iFastq.readline().rstrip() # read the header of the following read + sequenceLen=len(seq)-1 + ## in case the user doesn't use STAR for the alignment, he/she may have used an adapter trimmer to remove the adapter at + ## the 3'end of the reads in which case and/or cleaned the reads from the 3'end... so that some sequences will be shorter + ## than 6 or 9 nts... in which case we can not extract the info of the MBC and/or the ligation. + ## In such case uncomment the 4 next lines and comment the following lines. + #if len(totseq) >= 6 : + # dicoBarcode[barcode] += 1 + #if len(totseq) >= 9 : + # dicoLigation[ligation] += 1 + dicoBarcode[barcode] += 1 # adds 1 count in the MBC dictionary to the specific MBC found in the read + dicoLigation[ligation] += 1 # adds 1 count in the ligation dictionary to the specific ligation found in the read + +for barcode, times in dicoBarcode.items(): # outputs a file containing each possible MBC (column 1) and the respective + oBarcode.write("%s\t%s\n" % (barcode, str(times))) # number of times (column 2) it was found in a read. + +for ligation, times in dicoLigation.items(): # outputs a file containing each possible ligation hexamer (column 1) and + oLigation.write("%s\t%s\n" % (ligation, str(times)))# the respective number of times (column 2) it was found in a read + +oLen.write("%s" % str(sequenceLen)) + +# close files +iFastq.close() +oFastq.close() +oBarcode.close() +oLigation.close() +oLen.close() + + + diff --git a/source/extractReadsWithMismatchesIn6FirstNct.py b/source/extractReadsWithMismatchesIn6FirstNct.py new file mode 100755 index 0000000..d5d1ffd --- /dev/null +++ b/source/extractReadsWithMismatchesIn6FirstNct.py @@ -0,0 +1,40 @@ +#!/usr/bin/env python + +""" + +Date : May 9th 2014 + +Author : Julia di Iulio + +Script used to filter (from the alignment obtained with the reads that contained the molecular barcode in the beginning of +their sequence) reads that contain mismatches in the 6 first nucleotides (nts). The rationale is that it is extremely +unlikely (probability = 1/4e6) that the molecular barcode (which is a random hexamer sequence) corresponds exactly to the +genomic sequence next to where the rest of the read aligned to. So among the reads that aligned (even though they still +contained the molecular barcode in the beginning of their sequence), we will extract the ones that did not have any +mismatches in the 6 first nts. Indeed, those reads are most likely due to mispriming event from the reverse transcriptase +primer. We will then remove those reads from the alignment obtained with the reads that did not contain the molecular +barcode in the beginning of their sequence. + +use : python extractReadsWithMismatchesIn6FirstNct.py stdin stdout + +""" + +import sys, pysam, re + +iBAM = pysam.Samfile('-', 'r') # reads from the standard input +oBAM = pysam.Samfile('-', 'w', template=iBAM) # output to the standard output + +for line in iBAM: + md=re.findall(r'\d+', [tag[1] for tag in line.tags if tag[0]=='MD'][0]) # MD: string of Match and Mismatches positions, start after soft clipping (left most), + if len(md) == 1 : # if there are no mismatches + oBAM.write(line) # write the alignment into the output file + else: + if (not line.is_reverse) and (int(md[0]) >= 6): # if the first mismatch occurs after the 6th nt (from the 5' end) + oBAM.write(line) # write the alignment into the output file + elif (line.is_reverse) and (int(md[-1]) >= 6): # same as above but for reads that align to the reverse strand, last element because we are interested in the 5 ' end + oBAM.write(line) + +# close files +iBAM.close() +oBAM.close() + diff --git a/source/getSICoordinates1Based.r b/source/getSICoordinates1Based.r new file mode 100755 index 0000000..c2d22d2 --- /dev/null +++ b/source/getSICoordinates1Based.r @@ -0,0 +1,50 @@ +suppressPackageStartupMessages(library(data.table)) + +args <- commandArgs(trailingOnly = TRUE) + +file=args[1] +out=args[2] + +table=fread(file,sep="\t",header=F,fill=T) #stdin() +table=table[table$V3=="exon",] + +temp=unlist(strsplit(as.character(table$V9),split="; ")) +exon=temp[grepl("exon_number [1-9]+", temp, perl=T)] +table$V10=as.numeric(sub("exon_number ([1-9]+)", "\\1", exon, perl=T)) + +temp=unlist(strsplit(as.character(table$V9),split="; ")) +transcriptId=temp[grepl("transcript_id \"*\"", temp, perl=T)] +table$V11=as.character(sub("transcript_id \"*\"", "\\1", transcriptId, perl=T)) + +x=table[,.(exonNum=length(V10)),by=V11] + +table$V12=rep(x$exonNum,x$exonNum) + + +Result=c() + +#nrow(table) +for (i in 1:nrow(table)){ + if ((i%%100000)==0){ + print(i) + } + # 3' Exon positive strand + if (table[i,10]!=table[i,12] && table[i,7]=="+"){ + Result=c(Result,paste(table[i,1],"pos",table[i,5],sep="_")) + } + # 3' Intron positive strand + if (table[i,10]!=1 && table[i,7]=="+"){ + Result=c(Result,paste(table[i,1],"pos",as.numeric(table[i,4])-1,sep="_")) + } + # 3' Exon negative strand strand + if (table[i,10]!=table[i,12] && table[i,7]=="-"){ + Result=c(Result,paste(table[i,1],"neg",table[i,4],sep="_")) + } + # 3' Intron negative strand + if (table[i,10]!=1 && table[i,7]=="-"){ + Result=c(Result,paste(table[i,1],"neg",as.numeric(table[i,5])+1,sep="_")) + } +} + +write.table(unique(Result), col.names = F, row.names = F, quote=F, file=out) + diff --git a/source/joinBedGraph.r b/source/joinBedGraph.r new file mode 100755 index 0000000..a880e1b --- /dev/null +++ b/source/joinBedGraph.r @@ -0,0 +1,29 @@ +args <- commandArgs(trailingOnly = TRUE) +options(scipen=999) +suppressPackageStartupMessages(library(data.table)) + +bedGraph=args[1] + +bedGraph=fread(bedGraph, showProgress=F) +bedGraphResult=cbind(bedGraph[,c(1,2,3)],rowSums(bedGraph[,-c(1,2,3)])) + +colnames(bedGraphResult)=c("V1","V2","V3","V4") +bedGraphResult=bedGraphResult[bedGraphResult$V4!=0,] + +index=which((bedGraphResult$V3-bedGraphResult$V2)>1) +new=data.table(V1=character(),V2=numeric(),V3=numeric(),V4=numeric()) +for (idx in index){ + num=bedGraphResult$V3[idx]-bedGraphResult$V2[idx] + for (j in 0:(num-1)){ + new=rbind(new,data.table(bedGraphResult$V1[idx], + bedGraphResult$V2[idx]+j, + bedGraphResult$V2[idx]+j+1, + bedGraphResult$V4[idx])) + } +} + +bedGraphResult=rbind(bedGraphResult[-index,],new) + +write.table(bedGraphResult,row.names = F,col.names = F, quote = F, sep="\t") + + diff --git a/source/normalizeBedgraph.sh b/source/normalizeBedgraph.sh new file mode 100755 index 0000000..f74fb8b --- /dev/null +++ b/source/normalizeBedgraph.sh @@ -0,0 +1,43 @@ +#!/usr/bin/env bash + +Files=$1 +result_dir=$2 +mapping_dir=$3 +scriptDir=$4 +normalized_dir=$5 +sampleID=$6 +reference=$7 + +alignedRef=0 +alignedSpikeIn=0 + +FILES=`echo $Files | tr ";" "\n"` + +for f in $FILES; +do + base=`basename $f` + name=${base%.*} + # number of reads + temp=`less $mapping_dir/$name/${name}_Aligned.aligned.stat|cut -f1` + alignedRefTemp=$((alignedRef + temp)) + alignedRef=$alignedRefTemp + + temp=`less $mapping_dir/$name/${name}_Aligned.aligned.stat|cut -f2` + alignedSpikeInTemp=$((alignedSpikeIn + temp)) + alignedSpikeIn=$alignedSpikeInTemp +done + +echo -e "$alignedRef\t$alignedSpikeIn" > $mapping_dir/Aligned.stat + +Rscript $scriptDir/source/normalizedBedgraph.r $result_dir/${sampleID}_uniq_noRTbias_noPCRdup_noSI_${reference}_stt_pos.bedGraph $alignedSpikeIn \ +> $normalized_dir/${sampleID}_uniq_noRTbias_noPCRdup_noSI_spikeInNorm_${reference}_stt_pos.bedGraph + +Rscript $scriptDir/source/normalizedBedgraph.r $result_dir/${sampleID}_uniq_noRTbias_noPCRdup_noSI_${reference}_stt_neg.bedGraph $alignedSpikeIn \ +> $normalized_dir/${sampleID}_uniq_noRTbias_noPCRdup_noSI_spikeInNorm_${reference}_stt_neg.bedGraph + +Rscript $scriptDir/source/normalizedBedgraph.r $result_dir/${sampleID}_uniq_noRTbias_noPCRdup_noSI_${reference}_stt_pos.bedGraph $alignedRef \ +> $normalized_dir/${sampleID}_uniq_noRTbias_noPCRdup_noSI_tradNorm_${reference}_stt_pos.bedGraph + +Rscript $scriptDir/source/normalizedBedgraph.r $result_dir/${sampleID}_uniq_noRTbias_noPCRdup_noSI_${reference}_stt_neg.bedGraph $alignedRef \ +> $normalized_dir/${sampleID}_uniq_noRTbias_noPCRdup_noSI_tradNorm_${reference}_stt_neg.bedGraph + diff --git a/source/normalizedBedgraph.r b/source/normalizedBedgraph.r new file mode 100755 index 0000000..ec98bfb --- /dev/null +++ b/source/normalizedBedgraph.r @@ -0,0 +1,13 @@ +args <- commandArgs(trailingOnly = TRUE) +suppressPackageStartupMessages(library(data.table)) + +bedgraph=args[1] +read_number=as.numeric(args[2]) + +Bedgraph=fread(bedgraph,fill=T,sep="\t") + +scaling=1/(read_number/1000000) + +Bedgraph$V4=round(Bedgraph$V4*scaling,2) + +write.table(Bedgraph,sep="\t",quote =F,col.names = F,row.names = F) diff --git a/source/removePCRduplicateStringent.py b/source/removePCRduplicateStringent.py new file mode 100755 index 0000000..f85f9a6 --- /dev/null +++ b/source/removePCRduplicateStringent.py @@ -0,0 +1,72 @@ +#!/usr/bin/env python + +""" + +Date : May 2nd 2014 + +Author : Julia di Iulio + +script used to extract all reads starting at each position from a coverage file (only position with score > 0), to then +keep only 1 read per molecular barcode (to avoid getting signal from PCR duplicates). Ideally we should check the +distribution of the MB, but as it is unlikely to have very high coverage at a single position (due to the large size of the +human (un-)stable transcriptome), we are gonna chose the easy way and keep only 1 read per molecular barcode. +In the future, we could also take into consideration the insert size of the read, and the mismatches if any. + +use : python removePCRduplicateStringent.py inBedGraph1 (with start coverage) [1] + inTxt1 (file containing splicing intermediates (SI) positions with the folowing + nomenclature : chrX_strand_Start (where Start position are 1 based) [2] + inBAM1 (input BAM file with only unique alignment) [3] + outBedGraph1 (containing no SI, and only non duplicated reads) [4] + outBedGraph2 (containing only non duplicated reads, but still containing the SI if any) [5] + +""" + +import sys, pysam, os, numpy, re + + +iBG1 = open(sys.argv[1], 'r') +iTXT1 = set(open(sys.argv[2], 'r').readlines()) +iBAM1 = pysam.Samfile(sys.argv[3], 'rb') +oBG1 = open(sys.argv[4], 'w') +oBG2 = open(sys.argv[5], 'w') + +if '_pos.bedGraph' in sys.argv[1]: + std = 'pos' +elif '_neg.bedGraph' in sys.argv[1]: + std = 'neg' + +#iBG1.readline() # to skip the first line describing bedfile +count = 0 + +for bgLine in iBG1: + count += 1 + print(count) + chrom, st, ed , cov = bgLine.rstrip().split('\t') + if float(cov) > 0: + for j in range(0,(int(ed)-int(st))): + MB = set() + SI = 0 + woSI = 0 # without SI + wiSI = 0 # with SI + if chrom+"_"+std+"_"+str(int(st)+j+1)+"\n" in iTXT1: + SI = 1 + for line in iBAM1.fetch(chrom, int(st)+j, int(st)+j+1): + if (line.is_reverse and std=='pos' and line.aend-1 == int(st)) or \ + (not line.is_reverse and std=='neg' and line.pos == int(st)): + mb = line.qname.split('_MolecularBarcode:')[1] + if mb not in MB: + MB.add(mb) + if SI == 0: + woSI += 1 + wiSI += 1 + + if SI == 0 and woSI > 0: + oBG1.write("%s\t%s\t%s\t%i\n" % (chrom, str(int(st)+j), str(int(st)+j+1), woSI)) + if wiSI > 0: + oBG2.write("%s\t%s\t%s\t%i\n" % (chrom, str(int(st)+j), str(int(st)+j+1), wiSI)) + + +iBG1.close() +iBAM1.close() +oBG1.close() +oBG2.close() diff --git a/source/removePCRduplicates.sh b/source/removePCRduplicates.sh new file mode 100755 index 0000000..543cd9b --- /dev/null +++ b/source/removePCRduplicates.sh @@ -0,0 +1,150 @@ +cov_dir=$1 +bam_dir=$2 +scriptDir=$3 +SIfile=$4 +sampleID=$5 +samtools=$6 +idxDir=$7 +bedtools=$8 + +iBamUniq=$bam_dir/${sampleID}_noRTbias.sorted_uniq.bam +iCovUniqP=$cov_dir/${sampleID}_uniq_noRTbias_stt_pos.bedGraph +iCovUniqN=$cov_dir/${sampleID}_uniq_noRTbias_stt_neg.bedGraph + +oCovUniqP=$cov_dir/${sampleID}_uniq_noRTbias_noPCRdup_stt_pos.bedGraph +oCovUniqN=$cov_dir/${sampleID}_uniq_noRTbias_noPCRdup_stt_neg.bedGraph + +oCovUniqPnoSI=$cov_dir/${sampleID}_uniq_noRTbias_noPCRdup_noSI_stt_pos.bedGraph +oCovUniqNnoSI=$cov_dir/${sampleID}_uniq_noRTbias_noPCRdup_noSI_stt_neg.bedGraph + +bam_split=$bam_dir/Contigs +cov_split=$cov_dir/Contigs + +mkdir -p $cov_split +mkdir -p $bam_split +echo $idxDir +grep -P -v ".*tRNA.*" $idxDir/chrName.txt > $bam_split/chrom.txt + +while read contig; do + echo $contig + bamContig=$bam_split/${sampleID}_noRTbias.$contig.bam + + bedGraphContigP=$cov_split/${sampleID}_${contig}_pos.bedGraph + bedGraphContigPNoPCR=$cov_split/${sampleID}_${contig}_noPCRdup_pos.bedGraph + bedGraphContigPNoSINoPCRdup=$cov_split/${sampleID}_${contig}_noPCRdup_noSI_pos.bedGraph + + bedGraphContigN=$cov_split/${sampleID}_${contig}_neg.bedGraph + bedGraphContigNNoPCR=$cov_split/${sampleID}_${contig}_noPCRdup_neg.bedGraph + bedGraphContigNNoSINoPCRdup=$cov_split/${sampleID}_${contig}_noPCRdup_noSI_neg.bedGraph + + if [ ! -f $bamContig ];then + $samtools view -b $iBamUniq $contig > $bamContig + fi + if [ ! -f $bamContig.bai ];then + $samtools index $bamContig + fi + + if [ ! -f $bedGraphContigP ];then + grep -P "^$contig\t" $iCovUniqP > $bedGraphContigP + fi + + if [ ! -f $bedGraphContigN ];then + grep -P "^$contig\t" $iCovUniqN > $bedGraphContigN + fi + + if [ ! -f $bedGraphContigPNoPCR ] || [ ! -f $bedGraphContigPNoSINoPCRdup ];then + python $scriptDir/source/removePCRduplicateStringent.py $bedGraphContigP $SIfile $bamContig $bedGraphContigPNoSINoPCRdup $bedGraphContigPNoPCR + fi + + if [ ! -f $bedGraphContigNNoPCR ] || [ ! -f $bedGraphContigNNoSINoPCRdup ];then + python $scriptDir/source/removePCRduplicateStringent.py $bedGraphContigN $SIfile $bamContig $bedGraphContigNNoSINoPCRdup $bedGraphContigNNoPCR + fi + +done <$bam_split/chrom.txt +list=`grep -P ".*tRNA.*" $idxDir/chrName.txt | tr "\n" " "` +bamContig=$bam_split/${sampleID}_noRTbias.tRNA.bam + +bedGraphContigP=$cov_split/${sampleID}_tRNA_pos.bedGraph +bedGraphContigPNoPCR=$cov_split/${sampleID}_tRNA_noPCRdup_pos.bedGraph +bedGraphContigPNoSINoPCRdup=$cov_split/${sampleID}_tRNA_noPCRdup_noSI_pos.bedGraph + +bedGraphContigN=$cov_split/${sampleID}_tRNA_neg.bedGraph +bedGraphContigNNoPCR=$cov_split/${sampleID}_tRNA_noPCRdup_neg.bedGraph +bedGraphContigNNoSINoPCRdup=$cov_split/${sampleID}_tRNA_noPCRdup_noSI_neg.bedGraph + +if [ ! -f $bamContig ];then + $samtools view -b $iBamUniq $list > $bamContig +fi +if [ ! -f $bamContig.bai ];then + $samtools index $bamContig +fi + +if [ ! -f $bedGraphContigP ];then + grep -P "^.*tRNA.*\t" $iCovUniqP > $bedGraphContigP +fi + +if [ ! -f $bedGraphContigN ];then + grep -P "^.*tRNA.*\t" $iCovUniqN > $bedGraphContigN +fi + +if [ ! -f $bedGraphContigPNoPCR ] || [ ! -f $bedGraphContigPNoSINoPCRdup ];then + python $scriptDir/source/removePCRduplicateStringent.py $bedGraphContigP $SIfile $bamContig $bedGraphContigPNoSINoPCRdup $bedGraphContigPNoPCR +fi + +if [ ! -f $bedGraphContigNNoPCR ] || [ ! -f $bedGraphContigNNoSINoPCRdup ];then + python $scriptDir/source/removePCRduplicateStringent.py $bedGraphContigN $SIfile $bamContig $bedGraphContigNNoSINoPCRdup $bedGraphContigNNoPCR +fi + +oCovUniqP=$cov_dir/${sampleID}_uniq_noRTbias_noPCRdup_stt_pos.bedGraph +oCovUniqN=$cov_dir/${sampleID}_uniq_noRTbias_noPCRdup_stt_neg.bedGraph + +oCovUniqPnoSI=$cov_dir/${sampleID}_uniq_noRTbias_noPCRdup_noSI_stt_pos.bedGraph +oCovUniqNnoSI=$cov_dir/${sampleID}_uniq_noRTbias_noPCRdup_noSI_stt_neg.bedGraph + +echo "tRNA" | cat $bam_split/chrom.txt - > $bam_split/chrom_tRNA.txt + + +while read contig; do + echo $contig + bedGraphContigPNoPCR=$cov_split/${sampleID}_${contig}_noPCRdup_pos.bedGraph + bedGraphContigPNoSINoPCRdup=$cov_split/${sampleID}_${contig}_noPCRdup_noSI_pos.bedGraph + + bedGraphContigNNoPCR=$cov_split/${sampleID}_${contig}_noPCRdup_neg.bedGraph + bedGraphContigNNoSINoPCRdup=$cov_split/${sampleID}_${contig}_noPCRdup_noSI_neg.bedGraph + + if [ ! -f $oCovUniqP ] || [ ! -f $oCovUniqN ] || [ ! -f $oCovUniqPnoSI ] || [ ! -f $oCovUniqNnoSI ];then + rm -f $oCovUniqP + rm -f $oCovUniqN + + rm -f $oCovUniqPnoSI + rm -f $oCovUniqNnoSI + + cp $bedGraphContigPNoPCR $oCovUniqP + cp $bedGraphContigNNoPCR $oCovUniqN + + cp $bedGraphContigPNoSINoPCRdup $oCovUniqPnoSI + cp $bedGraphContigNNoSINoPCRdup $oCovUniqNnoSI + else + cat $oCovUniqPnoSI $bedGraphContigPNoSINoPCRdup > $oCovUniqPnoSI.temp + cat $oCovUniqNnoSI $bedGraphContigNNoSINoPCRdup > $oCovUniqNnoSI.temp + + cat $oCovUniqP $bedGraphContigPNoPCR > $oCovUniqP.temp + cat $oCovUniqN $bedGraphContigNNoPCR > $oCovUniqN.temp + + mv $oCovUniqPnoSI.temp $oCovUniqPnoSI + mv $oCovUniqNnoSI.temp $oCovUniqNnoSI + mv $oCovUniqP.temp $oCovUniqP + mv $oCovUniqN.temp $oCovUniqN + fi + +done < $bam_split/chrom_tRNA.txt + +$bedtools sort -i $oCovUniqPnoSI > $oCovUniqPnoSI.temp +$bedtools sort -i $oCovUniqNnoSI > $oCovUniqNnoSI.temp +$bedtools sort -i $oCovUniqP > $oCovUniqP.temp +$bedtools sort -i $oCovUniqN > $oCovUniqN.temp + +cp $oCovUniqPnoSI.temp $oCovUniqPnoSI +cp $oCovUniqNnoSI.temp $oCovUniqNnoSI +cp $oCovUniqP.temp $oCovUniqP +cp $oCovUniqN.temp $oCovUniqN diff --git a/source/rtRemoval.sh b/source/rtRemoval.sh new file mode 100755 index 0000000..2ccdbf9 --- /dev/null +++ b/source/rtRemoval.sh @@ -0,0 +1,52 @@ +bam=$1 +name=$2 +samtools=$3 +Bam1=$4 +Bam2=$5 +scriptDir=$6 + +$samtools view -H $Bam1 > $bam/headers.sam # extracting header of mapping without barcode + +# convert bam to sam and make barcode visible for mapping without barcode + +if [ ! -f $bam/temp.sam ];then # check if analysis already done + $samtools view -@ 20 $Bam1 | sed 's/_MolecularBarcode/\t_MB/' | sort -k1,1 > $bam/temp.sam +fi + +if [ ! -f $bam/rtBiasnames.txt ];then # check if analysis already done + $samtools view -@ 20 -h $Bam2 | python $scriptDir/source/extractReadsWithMismatchesIn6FirstNct.py | cut -f1 |sort -k1,1 | uniq \ + > $bam/rtBiasnames.txt +fi + +# remove hits from rtBiasnames # join read name and header again # compress to bam file after + +if [ ! -f $bam/${name}_noRTbias.sorted_uniq.bam ];then # check if analysis already done + join -v 1 -1 1 -2 1 $bam/temp.sam \ + $bam/rtBiasnames.txt | sed -e 's/ _MB/_MolecularBarcode/' -e 's/ /\t/g' | \ + cat $bam/headers.sam - | \ + $samtools view -@ 20 -Sb -o $bam/temp_noRTbias.bam - +fi + +if [ ! -f $bam/${name}_noRTbias.sorted.bam ];then # check if analysis already done + $samtools sort -@ 20 -m 5G $bam/temp_noRTbias.bam \ + $bam/${name}_noRTbias.sorted +fi + +if [ ! -f $bam/${name}_noRTbias.sorted.bam.bai ];then # check if analysis already done + $samtools index $bam/${name}_noRTbias.sorted.bam +fi + +rm -f $bam/temp.sam +rm -f $bam/temp_noRTbias.bam + +if [ ! -f $bam/${name}_noRTbias.sorted_uniq.bam ];then # check if analysis already done + $samtools view -@ 20 -b -h -q 50 $bam/${name}_noRTbias.sorted.bam \ + > $bam/${name}_noRTbias.sorted_uniq.bam +fi + +if [ ! -f $bam/${name}_noRTbias.sorted_uniq.bam.bai ];then # check if analysis already done + $samtools index $bam/${name}_noRTbias.sorted_uniq.bam +fi + + +