From 60a4f2bc4d2f130341f0554c79c8e9f7e8eb5800 Mon Sep 17 00:00:00 2001
From: Martyna Gajos <gajos@mgajos.molgen.mpg.de>
Date: Fri, 17 Nov 2017 15:53:55 +0100
Subject: [PATCH] net-seq-pipeline

---
 NetSeqPipeline                                | 494 ++++++++++++++++++
 README.md                                     |  28 +
 parameter_STAR.in                             | 480 +++++++++++++++++
 parameter_dependencies.in                     |  25 +
 parameter_template.in                         |  31 ++
 runNetSeqPipe.netPipe                         |  13 +
 source/appendCCAinFasta.r                     |  18 +
 source/coverage.sh                            |  17 +
 source/customCoverage.py                      |  32 ++
 source/downloadData.sh                        |  80 +++
 source/extractMolecularBarcode.py             |  87 +++
 .../extractReadsWithMismatchesIn6FirstNct.py  |  40 ++
 source/getSICoordinates1Based.r               |  50 ++
 source/joinBedGraph.r                         |  29 +
 source/normalizeBedgraph.sh                   |  43 ++
 source/normalizedBedgraph.r                   |  13 +
 source/removePCRduplicateStringent.py         |  72 +++
 source/removePCRduplicates.sh                 | 150 ++++++
 source/rtRemoval.sh                           |  52 ++
 19 files changed, 1754 insertions(+)
 create mode 100644 NetSeqPipeline
 create mode 100644 README.md
 create mode 100644 parameter_STAR.in
 create mode 100644 parameter_dependencies.in
 create mode 100644 parameter_template.in
 create mode 100644 runNetSeqPipe.netPipe
 create mode 100755 source/appendCCAinFasta.r
 create mode 100755 source/coverage.sh
 create mode 100755 source/customCoverage.py
 create mode 100755 source/downloadData.sh
 create mode 100755 source/extractMolecularBarcode.py
 create mode 100755 source/extractReadsWithMismatchesIn6FirstNct.py
 create mode 100755 source/getSICoordinates1Based.r
 create mode 100755 source/joinBedGraph.r
 create mode 100755 source/normalizeBedgraph.sh
 create mode 100755 source/normalizedBedgraph.r
 create mode 100755 source/removePCRduplicateStringent.py
 create mode 100755 source/removePCRduplicates.sh
 create mode 100755 source/rtRemoval.sh

diff --git a/NetSeqPipeline b/NetSeqPipeline
new file mode 100644
index 0000000..8c007c8
--- /dev/null
+++ b/NetSeqPipeline
@@ -0,0 +1,494 @@
+#!/usr/bin/env bash
+#
+# ./NETSeqPipeline -o PathToOutputFiles -f fastqfile1.fq fastqfile2.fq ... fastqfileX.fq
+#  -f  | --files [fqFile1.fq ... fqFileX.fq]: variable number of sequencing data in fastq format; NO DEFAULT
+#  -o  | --output [PathToOutputFiles]: relative or absolut output path; DEFAULT: current directory
+#  -r  | --reference [Assembly]: name of reference genome; DEFAULT:"GRCh38"
+#  -id | --sampleID [ID]: identification string; DEFAULT:"UNKNOWN"
+#
+#  -c  | --compression [Method]: either "bz2","gz", "none"; DEFAULT:"none"
+#  -p  | --parameterFile [Parameters.in]: STAR mapper parameter file; NO DEFAULT
+#
+#  -sa | --star [PathToStarApplication]: relative or absolut path to STAR mapper + name of executable; NO DEFAULT
+#  -si | --star-index [PathToStarIndices]: relative or absolut path to STAR indices, DEFAULT: PathToStarApplication "/index"
+#  -st | --samtools [PathToSamtoolsApplication]: relative or absolut path to samtool application + name of executable; NO DEFAULT
+#  -bt | --bedtools [PathToBadtoolsApplication]: relative or absolut path to samtool application + name of executable; NO DEFAULT
+#
+### requirements
+#
+# * python 2.7 with moduls pysam, HTSeq
+# * bzip2 or gzip if file is compressed, respectively
+# * STAR (2.5.3a)
+# * samtools
+# * bedtools
+# * wget
+# * curl
+#
+### remarks
+#
+# * make sure you have write rights on -o PathToOutputFiles
+# * make sure you have read rights on -f fastq files
+# * internet connection and write right in application file
+
+##############################################################################################################################
+##############################################################################################################################
+################################################  setting default values  ####################################################
+
+echo "################################# setting default values"
+
+workDir=$(pwd)
+outDir=$workDir
+absolutPath=$(readlink -f "$0")
+scriptDir=$(dirname "$absolutPath")
+compression=none
+reference=GRCh38
+star_index=$star/index
+sampleID=Unknown
+
+##############################################################################################################################
+##############################################################################################################################
+######################################  Read input arguments and set customer variables  #####################################
+
+echo "################################# read customer values"
+
+while [[ $# -gt 0 ]]
+do
+key="$1"
+case $key in
+    -o|--output)
+    outDir="$2"
+    shift 2 # past argument
+    ;;
+
+    -id|--sampleID)
+    sampleID="$2"
+    shift 2
+    ;;
+
+    -c|--compression)
+    compression="$2"
+    shift 2
+    ;;
+
+    -sa|--star)
+    star="$2"
+    shift 2
+    ;;
+
+    -bt|--bedtools)
+    bedtools="$2"
+    shift 2
+    ;;
+
+    -st|--samtools)
+    samtools="$2"
+    shift 2
+    ;;
+
+    -p|--parameterFile)
+    pFile="$2"
+    shift 2
+    ;;
+
+    -si|--star-index)
+    star_index="$2"
+    shift 2
+    ;;
+
+    -r|--reference)
+    reference="$2"
+    shift 2
+    ;;
+
+    -f|--files)
+    numFiles=0
+    FILES=""
+    shift 1
+    while [[ $1 =~ .*\.fq || $1 =~ .*\.fastq ]]
+    do
+	if [[ $FILES != "" ]];
+        then 
+            FILES="$FILES\n"
+        fi	
+	FILES="$FILES$1"
+	numFiles=$((numFiles+1))
+	shift 1
+    done    
+    ;;
+    
+    *)
+    status="### $key isn't a valid input argument"
+    echo $status
+    exit 0
+    ;;
+   
+esac
+done
+
+# output current variables
+
+echo "# Sample identification = $sampleID"
+echo "# Number of input files = $numFiles"
+echo -e "# File paths = \n$FILES"
+echo "# Output = $outDir"
+echo "# Reference = $reference"
+echo "# Compression = $compression"
+echo "# STAR path = $star"
+echo "# STAR index = $star_index"
+echo "# Samtools = $samtools"
+
+
+FILES=`echo -e $FILES`
+mkdir -p $outDir
+
+echo "################################# Check if input file(s) exist ... "
+
+for f in $FILES
+do
+	if [ ! -f $f ];then
+		status="### $f doesn't exist"
+		echo $status
+    		exit 0
+	fi
+done
+
+##############################################################################################################################
+##############################################################################################################################
+######################################  Decompress files and save new filename in variable $FILES  ###########################
+
+echo "################################# Decompress Files ... "
+
+temp=$outDir/TemporaryData
+mkdir -p $temp
+
+if [ "$compression" != "none" ];then
+	FILES_TEMP=""	
+	for f in $FILES
+	do	
+		base=`basename $f`
+		name=${base%.*} # contain .fqTEMP
+		echo "# Decompress file "$f    		
+		fq=$temp/$name
+		case $f in
+		*.bz2)
+		# check if decompresed file already exist
+		if [ ! -f $fq ];then
+			echo "bunzip2:$f"			
+			bunzip2 -c $f -k > $fq
+		fi
+		;;	
+		*.gz)
+		# check if decompresed file already exist
+		if [ ! -f $fq ];then
+			echo "gunzip:$f"
+			gunzip -c $f -k > $fq
+		fi		
+		;;
+		*)
+		    status="### $f is already decompressed or compression type is not supportet"
+		    echo $status
+    		    exit 0
+   		    ;;
+		esac
+		if [[ $FILES_TEMP != "" ]];then 
+            		FILES_TEMP="$FILES_TEMP\n"
+        	fi	
+		FILES_TEMP="$FILES_TEMP${fq}"
+	done
+	# new filename(s) without compressed suffix in $FILES 
+	FILES=`echo -e $FILES_TEMP`	
+fi
+
+##############################################################################################################################
+##############################################################################################################################
+######################################################## Molecular Barcode Extraction #########################################
+# Description:
+#                       removal of 6 first nucleotides (molecular barcode) from FastQ files
+#                       creates 2 additional files containing the molecular barcode/ligation hexamer counts
+#
+# Program:
+#                       extractMolecularBarcode.py
+# 
+
+echo "################################# Extracting Molecular Barcode... "
+
+for f in $FILES
+do
+	base=`basename $f`
+	name=${base%.*}
+	reads=$temp/$name
+	echo "# Processing file $base"
+	# check if decompresed file without barcode already exist
+	if [ ! -f ${reads}_noBarcode.fastq ];then 
+		python $scriptDir/source/extractMolecularBarcode.py $f ${reads}_noBarcode.fastq ${reads}_barcodeDistribution.txt ${reads}_split_ligationDistribution.txt $outDir/lenReads.txt
+	fi
+done
+
+##############################################################################################################################
+##############################################################################################################################
+########################################################   Download Data   ###################################################
+# Description:		
+#		* download sequence and annotation data
+#		* generate info file for further handling in NETSeq pipeline
+#	Necessary for:
+#		1) generating genome index 
+# 		2) generating splicing intermediates
+# Program:
+#                       wget, curl
+#
+
+echo "################################# Download data ... "
+
+sequences=$scriptDir/data/sequences
+annotation=$scriptDir/data/annotation
+info=$scriptDir/data/info.stdout
+
+mkdir -p $scriptDir/data
+mkdir -p $sequences
+mkdir -p $annotation
+
+# ckeck if files exist, if not downloadData.sh will download data
+# REMARK: download of sequence and annotation data maybe fail if links are expired
+bash $scriptDir/source/downloadData.sh $sequences $annotation $reference $scriptDir $info
+
+GENOME=`sed '1q;d' $info`
+ANNOTATION=`sed '2q;d' $info`
+PREFIX=`sed '3q;d' $info`
+
+##############################################################################################################################
+##############################################################################################################################
+######################################################## Generating Genome Index  ############################################
+# Description:
+#                       STAR run building required genome indices ...
+#                       Required for mapping
+# Program:
+#                       STAR
+#
+
+echo "################################# Build genome index ... "
+
+idxDir=$star_index/${reference}
+mkdir -p $idxDir
+# check if index alredy exist
+if [ -z "$(ls -A $idxDir)" ]; then 	
+	$star --runThreadN 40 --runMode genomeGenerate --genomeDir $idxDir \
+	--genomeFastaFiles $GENOME $sequences/rDNA.Genbank.U13369.1.fa \
+	$sequences/28SrRNAPseudogenes.Genebank.U67616.1.fa $sequences/${reference}.tRNA.CCA.fa
+fi
+
+
+##############################################################################################################################
+##############################################################################################################################
+######################################################## Alignment  ##########################################################
+# Description:
+#                       STAR run on reads not containing/containing molecular barcodes ...
+#                       Required for further Reverse Transcriptase (RT) bias removal
+#
+# Program:
+#                       STAR
+#
+
+echo "################################# Aligning reads containing or not Molecular Barcode ... "
+
+mapping_dir=$temp/StarMapping
+mkdir -p $mapping_dir
+
+for f in $FILES
+do
+	base=`basename $f`
+	name=${base%.*}
+	mapping=$mapping_dir/$name    
+	echo "# Mapping file "$base    
+	mkdir -p $mapping
+    # check if mapping is alredy done
+	if [ -z "$(ls -A $mapping)" ];then   
+		reads=$temp/$name
+		$star --genomeDir $idxDir --readFilesIn ${reads}_noBarcode.fastq --parametersFiles $pFile --runThreadN 20 --outFileNamePrefix $mapping/${name}_
+	fi
+
+	mkdir -p ${mapping}_withBC
+    # check if mapping is alredy done
+	if [ -z "$(ls -A ${mapping}_withBC)" ];then   
+	    $star --genomeDir $idxDir --readFilesIn $f --parametersFiles $pFile --runThreadN 20 --outFileNamePrefix ${mapping}_withBC/${name}_ 
+	fi
+done
+
+##############################################################################################################################
+##############################################################################################################################
+########################################## Reverse transcriptase mispriming event removal ####################################
+# Description:
+#               to remove artefactual reads (arising from RT)
+#               that are mapping even when we leave the barcode sequence
+# Program:
+#               samtools
+#               extractReadsWithMismatchesIn6FirstNct.py
+#
+
+echo "################################# Removing reads with RT bias ... "
+
+bam_dir=$temp/BamPostProcessing
+mkdir -p $bam_dir
+
+for f in $FILES
+do
+	base=`basename $f`
+	name=${base%.*}
+	mapping=$mapping_dir/$name    
+	echo "# Remove Reverse Transcription from file "$base
+	
+	Bam1=$mapping/${name}_Aligned.sortedByCoord.out.bam
+	Bam2=${mapping}_withBC/${name}_Aligned.sortedByCoord.out.bam
+    	
+	bam=$bam_dir/$name
+	mkdir -p $bam
+	
+	if [ ! -f $bam/${name}_noRTbias.sorted_uniq.bam ];then # check if analysis already done
+		$scriptDir/source/rtRemoval.sh $bam $name $samtools $Bam1 $Bam2 $scriptDir
+	fi
+done
+
+##############################################################################################################################
+##############################################################################################################################
+########################################## Extracting Pol2 coverage track ####################################################
+# Description:
+#                       Monitors whole read coverage and pol II position
+#                       Generates bedgraph file
+#                       The *_pos.bedGraph files contain info from gene in the pos strand, and inversely for the *_neg.bedGraph files
+# Program:
+#                       samtools
+#                       customCoverage.py
+#
+
+
+echo "################################# Extracting Pol2 coverage track ... "
+
+cov_dir=$temp/CoverageTrack
+mkdir -p $cov_dir
+
+for f in $FILES
+do
+	base=`basename $f`
+	name=${base%.*}    
+	echo "# Processing file "$base
+	cov=$cov_dir/$name
+	mkdir -p $cov
+
+    uniqueBam=$bam_dir/$name/${name}_noRTbias.sorted_uniq.bam
+	
+	if [ ! -f $cov/${name}_uniq_noRTbias_stt_neg.bedGraph ] || [ ! -f $cov/${name}_uniq_noRTbias_stt_pos.bedGraph ];then # check if analysis already done
+		bash $scriptDir/source/coverage.sh $samtools $uniqueBam $scriptDir $cov/${name}_uniq_noRTbias_
+	fi
+done
+
+##############################################################################################################################
+##############################################################################################################################
+######################         Prepare bedGraph and bam files for the PCR duplication step     ##################################
+# Description:
+#              		generates bedGraph and bam files for the PCR duplication step
+# Program:
+#					joinBedGraph.r
+#					samtools
+#					bedtools           
+#
+
+echo "################################# Join bedGraph and bam files for the PCR duplication step ... "
+
+if [ ! -f $cov_dir/${sampleID}_uniq_noRTbias_stt_pos.bedGraph ] || [ ! -f $cov_dir/${sampleID}_uniq_noRTbias_stt_neg.bedGraph ] || [ ! -f $bam_dir/${sampleID}_noRTbias.sorted_uniq.bam.bai ];then
+	if [ "$numFiles" -gt "1" ];then
+		BAMFILES=""
+		BEDGRAPHPOS=""
+		BEDGRAPHNEG=""		
+		for f in $FILES
+		do
+			base=`basename $f`
+			name=${base%.*}
+			BAMFILES="$BAMFILES $bam_dir/$name/${name}_noRTbias.sorted_uniq.bam"
+			$bedtools sort -i $cov_dir/$name/${name}_uniq_noRTbias_stt_pos.bedGraph > $cov_dir/$name/${name}_uniq_noRTbias_stt_pos.bedGraph.temp
+			$bedtools sort -i $cov_dir/$name/${name}_uniq_noRTbias_stt_neg.bedGraph > $cov_dir/$name/${name}_uniq_noRTbias_stt_neg.bedGraph.temp
+			cp $cov_dir/$name/${name}_uniq_noRTbias_stt_pos.bedGraph.temp $cov_dir/$name/${name}_uniq_noRTbias_stt_pos.bedGraph
+			cp $cov_dir/$name/${name}_uniq_noRTbias_stt_neg.bedGraph.temp $cov_dir/$name/${name}_uniq_noRTbias_stt_neg.bedGraph
+			BEDGRAPHPOS="$BEDGRAPHPOS $cov_dir/$name/${name}_uniq_noRTbias_stt_pos.bedGraph"
+			BEDGRAPHNEG="$BEDGRAPHNEG $cov_dir/$name/${name}_uniq_noRTbias_stt_neg.bedGraph"
+		done		
+		
+		if [ ! -f $cov_dir/${sampleID}_uniq_noRTbias_stt_pos.bedGraph.temp ] || [ ! -f $cov_dir/${sampleID}_uniq_noRTbias_stt_neg.bedGraph.temp ];then
+			$bedtools unionbedg -i $BEDGRAPHPOS > $cov_dir/${sampleID}_uniq_noRTbias_stt_pos.bedGraph.temp
+			$bedtools unionbedg -i $BEDGRAPHNEG > $cov_dir/${sampleID}_uniq_noRTbias_stt_neg.bedGraph.temp
+			Rscript $scriptDir/source/joinBedGraph.r $cov_dir/${sampleID}_uniq_noRTbias_stt_pos.bedGraph.temp > $cov_dir/${sampleID}_uniq_noRTbias_stt_pos.bedGraph
+			Rscript $scriptDir/source/joinBedGraph.r $cov_dir/${sampleID}_uniq_noRTbias_stt_neg.bedGraph.temp > $cov_dir/${sampleID}_uniq_noRTbias_stt_neg.bedGraph
+		fi	
+
+		# join bam file for PCR duplicate analysis
+		if [ ! -f $bam_dir/${sampleID}_noRTbias.sorted_uniq.bam.bai ];then
+			$samtools merge -f $bam_dir/${sampleID}_noRTbias.uniq.bam $BAMFILES
+			$samtools sort $bam_dir/${sampleID}_noRTbias.uniq.bam $bam_dir/${sampleID}_noRTbias.sorted_uniq
+			$samtools index $bam_dir/${sampleID}_noRTbias.sorted_uniq.bam
+		fi
+	else
+		base=`basename $FILES`
+		echo $FILES
+		name=${base%.*}       
+		echo "# Processing File "$base
+
+		Rscript $scriptDir/source/joinBedGraph.r $cov_dir/$name/${name}_uniq_noRTbias_stt_pos.bedGraph > $cov_dir/${sampleID}_uniq_noRTbias_stt_pos.bedGraph
+		Rscript $scriptDir/source/joinBedGraph.r $cov_dir/$name/${name}_uniq_noRTbias_stt_neg.bedGraph > $cov_dir/${sampleID}_uniq_noRTbias_stt_neg.bedGraph
+		
+		cp $bam_dir/$name/${name}_noRTbias.sorted_uniq.bam $bam_dir/${sampleID}_noRTbias.sorted_uniq.bam
+		cp $bam_dir/$name/${name}_noRTbias.sorted_uniq.bam.bai $bam_dir/${sampleID}_noRTbias.sorted_uniq.bam.bai
+	fi
+fi
+
+##############################################################################################################################
+##############################################################################################################################
+########################### Calculate Splicing Intermediates (SI) Coordinates ################################################
+# Description:
+#               extract splicing intermediate positions from annotation files           
+# Program:
+#               getSICoordinates1Based.r
+#               
+
+echo "################################# Calculate Splicing Intermediates (SI) Coordinates ... "
+
+SIfileReference=$annotation/SI_coordinates_${reference}_1based.txt
+
+if [ ! -f $SIfileReference ];then
+    bash Rscript $scriptDir/source/getSICoordinates1Based.r $ANNOTATION $SIfileReference
+fi
+
+#
+##############################################################################################################################
+##############################################################################################################################
+###################### Remove PCR duplicates and Splicing intermediates from Bam and BedGraph files ##########################
+# Description:
+#               to remove artefactual reads (arising from PCR duplicates)
+#               that have the same genomic position and molecular barcode than another read
+# Program:
+#               samtools
+#               removePCRduplicateStringent.py : a python script that extract reads arising from PCR duplication events
+#
+
+echo "################################# Removing PCR duplicates and Splicing intermediates from Bam and BedGraph files ... "
+
+echo "# Processing ${sampleID}"
+	
+if [ ! -f $cov_dir/${sampleID}_uniq_noRTbias_noPCRdup_noSI_stt_neg.bedGraph ] || [ ! -f $cov_dir/${sampleID}_uniq_noRTbias_noPCRdup_noSI_stt_pos.bedGraph ];then		
+	bash $scriptDir/source/removePCRduplicates.sh $cov_dir $bam_dir $scriptDir $SIfileReference $sampleID $samtools $idxDir $bedtools
+fi
+
+##############################################################################################################################
+##############################################################################################################################
+################### Split reference and filter chromosomal pol2 coverage #########################
+
+echo "################################# Split reference and filter chromosomal pol2 coverage ... "
+
+result_dir=$outDir/Bedgraph
+raw_dir=$result_dir/RawBedgraph
+
+mkdir -p $result_dir
+mkdir -p $raw_dir
+
+grep -P "^chr[0-9,X,Y]*\t" $cov_dir/${sampleID}_uniq_noRTbias_noPCRdup_noSI_stt_pos.bedGraph | $bedtools sort -i - > $result_dir/${sampleID}_uniq_noRTbias_noPCRdup_noSI_${reference}_stt_pos.bedGraph
+grep -P "^chr[0-9,X,Y]*\t" $cov_dir/${sampleID}_uniq_noRTbias_noPCRdup_noSI_stt_neg.bedGraph | $bedtools sort -i - > $result_dir/${sampleID}_uniq_noRTbias_noPCRdup_noSI_${reference}_stt_neg.bedGraph
+cp $cov_dir/${sampleID}_uniq_noRTbias_noPCRdup_noSI_stt_neg.bedGraph $raw_dir/${sampleID}_uniq_noRTbias_noPCRdup_noSI_stt_neg.bedGraph
+cp $cov_dir/${sampleID}_uniq_noRTbias_noPCRdup_noSI_stt_pos.bedGraph $raw_dir/${sampleID}_uniq_noRTbias_noPCRdup_noSI_stt_pos.bedGraph
+
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..93b2029
--- /dev/null
+++ b/README.md
@@ -0,0 +1,28 @@
+# NET-seq Pipeline
+
+## REQUIREMENTS
+- bedtools
+- curl
+- python 2.7, moduls:
+    - HTSeq
+    - pysam
+- R
+- samtools
+- STAR (2.5.3a)
+- wget
+- bzip2 or gzip if file is compressed, respectively
+
+## STEPS
+1. Molecular Barcode Extraction
+2. Reads mapping (both sets: with and without barcodes)
+3. Reverse transcriptase mispriming events removal
+4. Recording of the 5' end position (3' end position of the original nascent RNA)
+5. PCR duplicates removal
+6. Removal of sequencing reads due to splicing intermediates
+
+## USAGE
+1. Fill in the parameter files: parameter_dependencies.in and parameter_template.in
+2. Make the file runNetSeqPipe.netPipe executable
+    chmod -x runNetSeqPipe.netPipe
+3. Run the runNetSeqPipe.netPipe file
+    bash runNetSeqPipe.netPipe
diff --git a/parameter_STAR.in b/parameter_STAR.in
new file mode 100644
index 0000000..ff3f71b
--- /dev/null
+++ b/parameter_STAR.in
@@ -0,0 +1,480 @@
+#### all parameteres default are found at /groups/churchman/jd187/Program/STAR/STAR-STAR_2.4.0d/parametersDefault file ##############
+
+### versions
+#versionSTAR             020201
+#    int>0: STAR release numeric ID. Please do not change this value!
+#versionGenome           020101 020200
+#    int>0: oldest value of the Genome version compatible with this STAR release. Please do not change this value!
+
+### PARAMETERS
+#parametersFiles          -
+#    string: name of a user-defined parameters file, "-": none. Can only be defined on the command line.
+# provided at the command line level
+
+### RUN PARAMETERS
+#runMode                         alignReads
+#    string: type of the run:    alignReads      ... map reads
+#                                genomeGenerate  ... generate genome files
+#                                inputAlignmentsFromBAM    ... input alignments from BAM. Presently only works with --outWigType to generate wiggle files.
+
+#runThreadN                      1
+#    int: number of threads to run STAR
+# provided at the command line level in case we need to tweek the memory settings etc
+
+### GENOME PARAMETERS
+#genomeDir                   ./GenomeDir/
+#    string: path to the directory where genome files are stored (if runMode!=generateGenome) or will be generated (if runMode==generateGenome)
+# provided at the command line level, in case we wanna provide a diff genome (f.ex. built without Jct) without changing the param file
+
+#genomeLoad              NoSharedMemory
+#    mode of shared memory usage for the genome files
+#    string:        LoadAndKeep     ... load genome into shared and keep it in memory after run
+#                   LoadAndRemove   ... load genome into shared but remove it after run
+#                   LoadAndExit     ... load genome into shared memory and exit, keeping the genome in memory for future runs
+#                   Remove          ... do not map anything, just remove loaded genome from memory
+#                   NoSharedMemory  ... do not use shared memory, each job will have its own private copy of the genome
+
+### GENOME GENERATION PARAMETERS
+#genomeFastaFiles            -
+#    fasta files with genomic sequence for genome files generation, only used if runMode==genomeGenerate
+#    string(s): path(s) to the files from the working directory (separated by spaces)
+
+#genomeChrBinNbits           18
+#    int: =log2(chrBin), where chrBin is the size of the bins for genome storage: each chromosome will occupy an integer number of bins
+
+#genomeSAindexNbases         14
+#    int: length (bases) of the SA pre-indexing string. Typically between 10 and 15. Longer strings will use much more memory, but allow faster searches.
+
+#genomeSAsparseD             1
+#    int>0: suffux array sparsity, i.e. distance between indices: use bigger numbers to decrease needed RAM at the cost of mapping speed reduction
+
+
+### INPUT FROM BAM
+#inputBAMfile                -
+#    string: path to BAM input file, to be used with --runMode inputAlignmentsFromBAM
+
+
+### READ PARAMETERS
+#readFilesIn                 Read1 Read2
+#    string(s): paths to files that contain input read1 (and, if needed,  read2)
+# given at the command line level as the read names is gonna depend on the sample name
+
+readFilesCommand        cat
+#    string(s): cmd line to execute for each of the input file. This command should generate FASTA or FASTQ text and send it to stdout
+#               For example: zcat - to uncompress .gz files, bzcat - to uncompress .bz2 files, etc.
+
+#readMapNumber               -1
+#    int: number of reads to map from the beginning of the file
+#                            -1: map all reads
+
+#readMatesLengthsIn          NotEqual
+#    string: Equal/NotEqual - lengths of names,sequences,qualities for both mates are the same  / not the same. NotEqual is safe in all situations.
+
+#clip3pNbases                 0
+#    int(s): number(s) of bases to clip from 3p of each mate. If one value is given, it will be assumed the same for both mates.
+#
+#clip5pNbases                 0
+#    int(s): number(s) of bases to clip from 5p of each mate. If one value is given, it will be assumed the same for both mates.
+
+clip3pAdapterSeq           ATCTCGTATGCCGTCTTCTGCTTG
+#    string(s): adapter sequences to clip from 3p of each mate.  If one value is given, it will be assumed the same for both mates.
+
+clip3pAdapterMMp            0.21
+#    double(s): max proportion of mismatches for 3p adpater clipping for each mate. If one value is given, it will be assumed the same for both mates.
+#
+
+clip3pAfterAdapterNbases    1
+#    int(s): number of bases to clip from 3p of each mate after the adapter clipping. If one value is given, it will be assumed the same for both mates.
+
+
+
+### LIMITS
+#limitGenomeGenerateRAM               31000000000
+#    int>0: maximum available RAM (bytes) for genome generation
+
+limitIObufferSize                    200000000
+#    int>0: max available buffers size (bytes) for input/output, per thread
+
+#limitOutSAMoneReadBytes              100000
+#int>0: max size of the SAM record for one read. Recommended value: >(2*(LengthMate1+LengthMate2+100)*outFilterMultimapNmax)
+
+#limitOutSJoneRead                    1000
+#    int>0: max number of junctions for one read (including all multi-mappers)
+
+#limitOutSJcollapsed                  1000000
+#    int>0: max number of collapsed junctions
+
+limitBAMsortRAM                         64000000000
+#    int>=0: maximum available RAM for sorting BAM. If =0, it will be set to the genome index size
+
+
+### OUTPUT: GENERAL
+#outFileNamePrefix               ./
+#    string: output files name prefix (including full or relative path). Can only be defined on the command line.
+#   we define it on the command line as it is depending on the sample name
+
+#outTmpDir                       -
+#    string: path to a directory that will be used as temporary by STAR. All contents of this directory will be removed!
+#            - the temp directory will default to outFileNamePrefix_tmp
+
+#outStd                          Log
+#    string: which output will be directed to stdout (standard out)
+#      Log : log messages
+#      SAM : alignments in .sam format (which normally are output to Aligned.out.sam file), normal std output will go into Log.std.out
+#      BAM_Unsorted : alignments in BAM format, unsorted. Requires --outSAMtype BAM Unsorted
+#      BAM_SortedByCoordinate : alignments in BAM format, unsorted. Requires --outSAMtype BAM SortedByCoordinate
+#      BAM_Quant : alignments to transcriptome in BAM format, unsorted. Requires --quantMode TranscriptomeSAM
+
+outReadsUnmapped                Fastx
+#   string: output of unmapped reads (besides SAM)
+#                                None    : no output
+#                                Fastx   : output in separate fasta/fastq files, Unmapped.out.mate1/2
+
+#outQSconversionAdd              0
+#   int: add this number to the quality score (e.g. to convert from Illumina to Sanger, use -31)
+
+
+### OUTPUT: SAM/BAM
+outSAMtype                      BAM SortedByCoordinate
+#    strings: type of SAM/BAM output
+#                                1st word:
+#                                BAM : output BAM without sorting
+#                                SAM : output SAM without sorting
+#                                None : no SAM/BAM output
+#                                2nd, 3rd ...
+#                                Unsorted : standard unsorted
+#                                SortedByCoordinate : sorted by coordinate
+
+#outSAMmode                      Full
+#    string: mode of SAM output  None : no SAM output
+#                                Full : full SAM output
+#                                NoQS : full SAM but without quality scores
+
+#outSAMstrandField                               None
+#    string: Cufflinks-like strand field flag    None        : not used
+#                                                intronMotif : strand derived from the intron motif. Reads with inconsisent and/or non-canonical introns are filtered out.
+
+#If you have stranded RNA-seq data, you do not need to use any specific STAR options. Instead, you need
+#to run Cufflinks with the library option --library-type options. For example, cufflinks <…> -
+#library-type fr-firststrand should be used for the “standard” dUTP protocol. This option has
+#to be used only for Cufflinks runs and not for STAR runs.
+
+outSAMattributes                All
+#    string: a string of desired SAM attributes, in the order desired for the output SAM
+#                                NH HI AS nM NM MD jM jI XS
+#                                Standard    : NH HI AS nM
+#                                All         : NH HI AS nM NM MD jM jI
+#                                None
+
+
+#outSAMunmapped          None
+#    string: output of unmapped reads in the SAM format
+#                                None    : no output
+#                                Within  : output unmapped reads within the main SAM file (i.e. Aligned.out.sam)
+
+#outSAMorder                     Paired
+#    string: type of sorting for the SAM output
+#                                Paired: one mate after the other for all paired alignments
+#                                PairedKeepInputOrder: one mate after the other for all paired alignments, the order is kept the same as in the input FASTQ files
+
+#outSAMprimaryFlag               OneBestScore
+#    string: which alignments are considered primary - all others will be marked with 0x100 bit in the FLAG
+#                                OneBestScore  : only one alignment with the best score is primary
+#                                AllBestScore  : all alignments with the best score are primary
+
+#outSAMreadID                    Standard
+#    string: read ID record type
+#                                Standard  : first word (until space) from the FASTx read ID line, removing /1,/2 from the end
+#                                Number    : read number (index) in the FASTx file
+
+#outSAMmapqUnique        255
+#    int: 0 to 255: the MAPQ value for unique mappers
+
+#outSAMattrRGline        -
+#    string(s): SAM/BAM read group line. The first word contains the read group identifier and must start with "ID:",
+#            e.g. --outSAMattrRGline ID:xxx CN:yy "DS:z z z".
+#            xxx will be added as RG tag to each output alignment. Any spaces in the tag values have to be double quoted.
+#            Comma separated RG lines correspons to different (comma separated) input files in --readFilesIn.
+
+#outSAMheaderHD          -
+#    strings: @HD (header) line of the SAM header
+
+#outSAMheaderPG          -
+#    strings: extra @PG (software) line of the SAM header (in addition to STAR)
+#
+#outSAMheaderCommentFile -
+#    string: path to the file with @CO (comment) lines of the SAM header
+#
+#outBAMcompression   -1
+#    int: -1 ... 10  BAM compression level, -1=default compression, 0=no compression, 10=maximum compression
+#
+#bamRemoveDuplicatesType  -
+#    string: remove duplicates from BAM file, for now only works with sorted BAM feeded with inputBAMfile
+#           UniqueIdentical : removes all multimappers, and duplicate unique mappers. The coordinates, FLAG, CIGAR must be identical
+#
+#bamRemoveDuplicatesMate2basesN   0
+#    int>0: number of bases from the 5' of mate 2 to use in collapsing (e.g. for RAMPAGE)
+#
+
+### OUTPUT WIGGLE
+#outWigType          None
+#    string(s): type of signal output, e.g. "bedGraph" OR "bedGraph read1_5p"
+#                    1st word:
+#                    None
+#                    bedGraph
+#                    wiggle
+#                    2nd word:
+#                    read1_5p : signal from only 5' of the 1st read, useful for CAGE/RAMPAGE etc
+#                    read2 : signal from only 2nd read
+#
+#outWigStrand        Stranded
+#    string: strandedness of wigglle (bedGraph) output
+#                    Stranded: separate strands, str1 and str2
+#                    Unstranded: collapsed strands
+#
+#outWigReferencesPrefix    -
+#    string: prefix matching reference names to include in the output wiggle file, e.g. "chr.*"
+#    default: "-" - include all references
+#
+#outWigNorm              RPM
+#    string: type of normalization for the signal
+#                        RPM : reads per million of mapped reads
+#                        None : no normalization, "raw" counts
+#
+
+
+### OUTPUT FILTERING
+#outFilterType           Normal
+#    string: type of filtering
+#                                Normal: standard filtering using only current alignment
+#                                BySJout: keep only those reads that contain junctions that passed filtering into SJ.out.tab
+
+#outFilterMultimapScoreRange     1
+#    int: the score range below the maximum score for multimapping alignments.
+# as -10*log10(1-1/2) = MapQual for read mapping twice = 3.013, 3.013 -range 3
+# keep all multi aligned reads (at least up to 100 times as -10*log10(1-1/100) =
+# 0.04364805)... I think... Not sure I got it right though...
+
+outFilterMultimapNmax   2
+#    int: read alignments will be output only if the read maps fewer than this value, otherwise no alignments will be output
+
+#outFilterMismatchNmax   10
+#    int: alignment will be output only if it has fewer mismatches
+
+#outFilterMismatchNoverLmax  0.3
+#    int: alignment will be output only if its ratio of mismatches to *mapped* length is less than this value; to make it consistent with parmater used with TH
+
+#outFilterMismatchNoverReadLmax  1
+#    int: alignment will be output only if its ratio of mismatches to *read* length is less than this value
+
+#outFilterScoreMin       0
+#    int: alignment will be output only if its score is higher than this value
+
+#outFilterScoreMinOverLread  0.66
+#        float: outFilterScoreMin normalized to read length (sum of mates' lengths for paired-end reads)
+
+#outFilterMatchNmin      0
+#    int: alignment will be output only if the number of matched bases is higher than this value; to make it consistent with parmater used with TH
+
+#outFilterMatchNminOverLread 0.66
+#    float: outFilterMatchNmin normalized to read length (sum of mates' lengths for paired-end reads)
+# I guess it has something to do with soft clipping ? I don't know, so I let the default value...
+
+#outFilterIntronMotifs   None
+#    string: filter alignment using their motifs
+#           None : no filtering
+#           RemoveNoncanonical : filter out alignments that contain non-canonical junctions
+#           RemoveNoncanonicalUnannotated : filter out alignments that contain non-canonical unannotated junctions when using annotated
+#                                           splice junctions database. The annotated non-canonical junctions will be kept.
+
+### OUTPUT FILTERING: SPLICE JUNCTIONS
+#outSJfilterReads        All
+#    string: which reads to consider for collapsed splice junctions output
+#                All: all reads, unique- and multi-mappers
+#                Unique: uniquely mapping reads only
+
+outSJfilterOverhangMin  3  1  1  1
+#    4*int:    minimum overhang length for splice junctions on both sides for: (1) non-canonical motifs, (2) GT/AG motif,
+#                       (3) GC/AG motif, (4) AT/AC motif. -1 means no output for that motif does not apply to annotated junctions
+
+#outSJfilterCountUniqueMin   3   1   1   1
+#    4*int: minimum uniquely mapping read count per junction for: (1) non-canonical motifs, (2) GT/AG motif, (3) GC/AG motif,
+#           (4) AT/AC motif. -1 means no output for that motif. Junctions are output if one of outSJfilterCountUniqueMin OR outSJfilterCountTotalMin conditions are satisfied does not apply to annotated junctions
+
+#outSJfilterCountTotalMin    3   1   1   1
+#    4*int: minimum total (multi-mapping+unique) read count per junction for: (1) non-canonical motifs, (2) GT/AG motif, (3) GC/AG motif, (4) AT/AC motif. -1 means no output for that motif Junctions are output if one of outSJfilterCountUniqueMin OR outSJfilterCountTotalMin conditions are satisfied does not apply to annotated junctions
+
+outSJfilterDistToOtherSJmin 0  0   0   0
+#    4*int>=0: minimum allowed distance to other junctions' donor/acceptor
+#                                does not apply to annotated junctions
+
+#outSJfilterIntronMaxVsReadN        50000 100000 200000
+#    N*int>=0: maximum gap allowed for junctions supported by 1,2,3...N reads
+#              i.e. by default junctions supported by 1 read can have gaps <=50000b, by 2 reads: <=100000b, by 3 reads: <=200000. by >=4
+#              reads any gap <=alignIntronMax does not apply to annotated junctions
+
+### SCORING
+#scoreGap                     0
+#    gap open penalty
+
+#scoreGapNoncan               -8
+#    non-canonical gap open penalty (in addition to scoreGap)
+
+#scoreGapGCAG                 -4
+#    GCAG gap open penalty (in addition to scoreGap)
+
+#scoreGapATAC                 -8
+#    ATAC gap open penalty  (in addition to scoreGap)
+
+#scoreGenomicLengthLog2scale   -0.25
+#    extra score logarithmically scaled with genomic length of the alignment: -scoreGenomicLengthLog2scale*log2(genomicLength)
+
+#scoreDelOpen                 -2
+#    deletion open penalty
+
+#scoreDelBase                 -2
+#    deletion extension penalty per base (in addition to scoreDelOpen)
+
+#scoreInsOpen                 -2
+#    insertion open penalty
+
+#scoreInsBase                 -2
+#    insertion extension penalty per base (in addition to scoreInsOpen)
+
+#scoreStitchSJshift           1
+#    maximum score reduction while searching for SJ boundaries inthe stitching step
+
+
+### ALIGNMENT and SEED PARAMETERS
+#seedSearchStartLmax          50
+#    int>0: defines the search start point through the read - the read is split into pieces no longer than this value
+
+#seedSearchStartLmaxOverLread    1.0
+#    float: seedSearchStartLmax normalized to read length (sum of mates' lengths for paired-end reads)
+
+#seedSearchLmax          0
+#    int>=0: defines the maximum length of the seeds, if =0 max seed lengthis infinite
+
+#seedMultimapNmax        10000
+#    int>0: only pieces that map fewer than this value are utilized in the stitching procedure
+
+#seedPerReadNmax         1000
+#    int>0: max number of seeds per read
+
+#seedPerWindowNmax       50
+#    int>0: max number of seeds per window
+
+#seedNoneLociPerWindow   10
+#    int>0: max number of one seed loci per window
+
+alignIntronMin          11
+#    minimum intron size: genomic gap is considered intron if its length>=alignIntronMin, otherwise it is considered Deletion
+# modified specifically for tRNA introns
+
+#alignIntronMax          0
+#    maximum intron size, if 0, max intron size will be determined by (2^winBinNbits)*winAnchorDistNbins
+
+#alignMatesGapMax            0
+#    maximum gap between two mates, if 0, max intron gap will be determined by (2^winBinNbits)*winAnchorDistNbins
+
+#alignSJoverhangMin          5
+#    int>0: minimum overhang (i.e. block size) for spliced alignments
+
+#alignSJDBoverhangMin        3
+#    int>0: minimum overhang (i.e. block size) for annotated (sjdb) spliced alignments
+
+#alignSplicedMateMapLmin     0
+#    int>0: minimum mapped length for a read mate that is spliced
+
+#alignSplicedMateMapLminOverLmate 0.66
+#    float>0: alignSplicedMateMapLmin normalized to mate length
+
+#alignWindowsPerReadNmax     10000
+#    int>0: max number of windows per read
+
+#alignTranscriptsPerWindowNmax   100
+#    int>0: max number of transcripts per window
+
+#alignTranscriptsPerReadNmax     10000
+#    max number of different alignments per read to consider
+
+alignEndsType           EndToEnd
+#    string: type of read ends alignment
+#                        Local   : standard local alignment with soft-clipping allowed
+#                        EndToEnd: force end-to-end read alignment, do not soft-clip
+
+### SPLICE JUNCTIONS DATABASE PARAMETERS
+#sjdbFileChrStartEnd                     -
+#    string: path to the file with genomic coordinates (chr <tab> start <tab> end <tab> strand) for the introns
+# we will provide it in the command line directly, as the  path is gonna depend on the sample
+# NO: The annotations can be supplied in the form of splice junctions’ loci or GTF (or GFF3) file AND ipmortantly The annotations have to be supplied at the genome/suffix array generation step
+
+#sjdbGTFfile                             -
+#    string: path to the GTF file with annotations
+# we will provide it in the command line directly
+
+#sjdbGTFchrPrefix                        -
+#    string: prefix for chromosome names in a GTF file (e.g. 'chr' for using ENSMEBL annotations with UCSC geneomes)
+
+#sjdbGTFfeatureExon                      exon
+#    string: feature type in GTF file to be used as exons for building transcripts
+
+#sjdbGTFtagExonParentTranscript          transcript_id
+#    string: tag name to be used as exons' parents for building transcripts
+
+#sjdbOverhang                            0
+#    int>=0: length of the donor/acceptor sequence on each side of the junctions, ideally = (mate_length - 1)
+#            if =0, splice junction database is not used
+
+#sjdbScore                               2
+#    int: extra alignment score for alignmets that cross database junctions
+
+### WINDOWS, ANCHORS, BINNING
+#winAnchorMultimapNmax           50
+#    int>0: max number of loci anchors are allowed to map to
+
+#winBinNbits                     16
+#    int>0: =log2(winBin), where winBin is the size of the bin for the windows/clustering,each window will occupy an integer nb of bins.
+
+#winAnchorDistNbins              9
+#    int>0: max number of bins between two anchors that allows aggregation of anchors into one window
+
+#winFlankNbins               4
+#    int>0: log2(winFlank), where win Flank is the size of the left and right flanking regions for each window
+
+
+### CHIMERIC ALIGNMENTS
+#chimSegmentMin              0
+##    int>0: minimum length of chimeric segment length, if ==0, no chimeric output
+
+#chimScoreMin                0
+##    int>0: minimum total (summed) score of the chimeric segments
+
+#chimScoreDropMax            20
+##    int>0: max drop (difference) of chimeric score (the sum of scores of all chimeric segements) from the read length
+
+#chimScoreSeparation         10
+##    int>0: minimum difference (separation) between the best chimeric score and the next one
+
+#chimScoreJunctionNonGTAG    -1
+##    int: penalty for a non-GT/AG chimeric junction
+
+#chimJunctionOverhangMin     20
+##    int>0: minimum overhang for a chimeric junction
+
+
+### QUANTIFICATION OF ANNOTATIONS
+#quantMode                   -
+#    string(s): types of qunatification requested
+#                            - : None
+#                            TranscriptomeSAM : output SAM/BAM alignments to transcriptome into a separate file
+
+
+#### 2-PASS
+#twopass1readsN              0
+#    int>0: number of reads to process for the 1st step. 0 : 1-step only, no 2nd pass; use very large number to map all reads in the first step
+#
+#twopassSJlimit              1000000
+#    int>=0: maximum number of junction detected in the 1st step
+#
+
diff --git a/parameter_dependencies.in b/parameter_dependencies.in
new file mode 100644
index 0000000..42dd459
--- /dev/null
+++ b/parameter_dependencies.in
@@ -0,0 +1,25 @@
+# starIndex: provide the full path to the STAR index directory
+# NetSeqPipeline will:
+#		1) create a large index files for the selected reference genome
+#       2) if index has been previously created: make sure it is available in the given starIndex location 
+#			in a subdirectory with name of the reference
+#			example: for a reference genome e.g. GRCh38, make sure the file is located in /pathToStarIndex/GRCh38/
+# IMPORTANT: make sure you have write permission
+# example: starIndex=/pathToStarIndex
+starIndex=
+
+# parametersSTAR: provide the full path to the STAR parameter directory + name of the parameter file
+# example: parametersSTAR=/pathToParameters/parameter_STAR.in
+parametersSTAR=
+
+# starApp: provide the full path to the STAR application directory + name of the application executable
+# example: starApp=/pathToStarApp/STAR
+starApp=
+
+# samtools: provide the full path to the samtools application directory + name of the application executable
+# example: samtools=/PathToSamtools/samtools
+samtools=
+
+# bedtools: provide the full path to the bedtools application directory + name of the application executable
+# example: bedtools=/PathToBedtools/bedtools
+bedtools=
diff --git a/parameter_template.in b/parameter_template.in
new file mode 100644
index 0000000..a2b64cc
--- /dev/null
+++ b/parameter_template.in
@@ -0,0 +1,31 @@
+# sampleID: necessary for naming the output files
+# example: sampleID=AB01 
+sampleID=
+
+# condition: necessary for naming the output files
+# example: condition=treated
+condition=
+
+# NetSeqApp: provide full path to the NeqSeqPipeline directory + name of the application executable
+# example: NetSeqApp=/pathToFolderContainingPipeline/NetSeqPipeline
+NetSeqApp=
+
+# pathToOutputFiles: provide the path to the requested output directory
+# IMPORTANT: make sure you have write permission
+# example: pathToOutputFiles=/outputDirectory 
+pathToOutputFiles=
+
+# files: provide full path + name of the fastq file(s)
+# IMPORTANT: make sure you have read permission
+# example: files=/pathToInputDirectory/samp1e.fq 
+files=
+
+# compression: provide compression method of the input file
+# IMPORTANT REMARK: choose from bz2, gz, none
+# example: compression=none
+compression=
+
+# reference: provide reference genome for the mapping
+# example: reference=GRCh38
+reference=
+
diff --git a/runNetSeqPipe.netPipe b/runNetSeqPipe.netPipe
new file mode 100644
index 0000000..72fd1e7
--- /dev/null
+++ b/runNetSeqPipe.netPipe
@@ -0,0 +1,13 @@
+#!/usr/bin/env bash
+
+# version v2: Annkatrin Bressin, Martyna Gajos
+# script to run NetSeqPipeline
+# provide parameters for the analysis in the files: parameter_template.in and parameter_dependencies.in
+
+source parameter_template.in
+source parameter_dependencies.in
+
+mkdir -p $pathToOutputFiles
+
+bash $NetSeqApp -o $pathToOutputFiles -id $sampleID -f $files -c $compression -r $reference -si $starIndex -sa $starApp -st $samtools -bt $bedtools -p $parametersSTAR
+
diff --git a/source/appendCCAinFasta.r b/source/appendCCAinFasta.r
new file mode 100755
index 0000000..9b892d7
--- /dev/null
+++ b/source/appendCCAinFasta.r
@@ -0,0 +1,18 @@
+suppressPackageStartupMessages(library(Biostrings))
+
+args <- commandArgs(trailingOnly = TRUE) 
+
+file=args[1]
+outfile=args[2]
+
+data=readDNAStringSet(file)
+
+seq=paste(data)
+names=names(data)
+
+seq=paste(seq,"CCA",sep="")
+
+data2=DNAStringSet(x=seq)
+names(data2) <- names
+
+writeXStringSet(data2,filepath = outfile, format="fasta")
diff --git a/source/coverage.sh b/source/coverage.sh
new file mode 100755
index 0000000..379e577
--- /dev/null
+++ b/source/coverage.sh
@@ -0,0 +1,17 @@
+samtools=$1
+Bam=$2
+scriptDir=$3
+outName=$4
+
+$samtools view -@20 $Bam | sed 's/\tjM.*$//' | python $scriptDir/source/customCoverage.py $outName
+
+tail -n +2 ${outName}stt_neg.bedGraph > ${outName}stt_neg.bedGraph.temp
+tail -n +2 ${outName}stt_pos.bedGraph > ${outName}stt_pos.bedGraph.temp
+
+cp ${outName}stt_neg.bedGraph.temp ${outName}stt_neg.bedGraph
+cp ${outName}stt_pos.bedGraph.temp ${outName}stt_pos.bedGraph
+
+# jM:B:c,M1,M2,...
+# intron motifs for all junctions (i.e. N in CIGAR): 
+# 0: non-canonical; 1:GT/AG, 2:CT/AC, 3:GC/AG, 4:CT/GC, 5:AT/AC, 6:GT/AT.
+# If splice junctions database is used, and a junction is annotated, 20 is added to its motif value 
diff --git a/source/customCoverage.py b/source/customCoverage.py
new file mode 100755
index 0000000..099be4d
--- /dev/null
+++ b/source/customCoverage.py
@@ -0,0 +1,32 @@
+#! /usr/bin/prun python-2 python
+
+"""
+date : 17 april 2013
+
+Author : Julia di Iulio
+
+Script used to monitor coverage (either the whole read, the start of the read (which represents PolII position),
+or the end of the read) taking into account the number of times the read mapped (given by the NH:i: tag in the
+alignment file): by giving a weight inversely proportional to the number of times the read mapped.
+This script is highly inspired from http://www-huber.embl.de/users/anders/HTSeq/doc/tour.html
+
+use: python customCoverage.py stdin (alignment file in sam format) Experiment Name [1]
+
+"""
+
+
+import sys, HTSeq
+
+stt = HTSeq.GenomicArray( "auto", stranded=True, typecode='i' ) #stt stands for start of reads
+
+alignment_file = HTSeq.SAM_Reader(sys.stdin)
+
+for algt in alignment_file: 
+    if algt.aligned:    
+        # extract the start position of the read as a genomic interval
+        stt_iv = HTSeq.GenomicInterval( algt.iv.chrom , algt.iv.start_d , algt.iv.start_d + 1, algt.iv.strand )
+        # adds weighted coverage at the position corresponding to the start of the read
+        stt[ stt_iv ] += 1
+
+stt.write_bedgraph_file( sys.argv[1] + "stt_neg.bedGraph", "+" ) # switch strands because 5' start of the reads correspond to the 3' end of the nascent RNA =^ pol2 position ???
+stt.write_bedgraph_file( sys.argv[1] + "stt_pos.bedGraph", "-" )
diff --git a/source/downloadData.sh b/source/downloadData.sh
new file mode 100755
index 0000000..c4a0481
--- /dev/null
+++ b/source/downloadData.sh
@@ -0,0 +1,80 @@
+sequences=$1
+annotation=$2
+reference=$3
+scriptDir=$4
+info=$5
+
+case $reference in
+
+	GRCh37)		
+		if [ ! -f $sequences/GRCh37.primary_assembly.genome.fa ];then
+			wget -O $sequences/GRCh37.primary_assembly.genome.fa.gz ftp://ftp.sanger.ac.uk/pub/gencode/Gencode_human/release_26/GRCh37_mapping/GRCh37.primary_assembly.genome.fa.gz
+			gunzip $sequences/GRCh37.primary_assembly.genome.fa.gz			
+		fi
+
+		if [ ! -f $annotation/gencode.GRCh37.primary_assembly.annotation.gtf ];then
+			wget -O $annotation/gencode.GRCh37.primary_assembly.annotation.gtf.gz \
+			ftp://ftp.sanger.ac.uk/pub/gencode/Gencode_human/release_26/GRCh37_mapping/gencode.v26lift37.annotation.gtf.gz
+			gunzip $annotation/gencode.GRCh37.primary_assembly.annotation.gtf.gz			
+		fi
+		
+		if [ ! -f $sequences/GRCh37.tRNA.fa ];then
+			curl "http://gtrnadb.ucsc.edu/genomes/eukaryota/Hsapi19/hg19-tRNAs.fa" > $sequences/GRCh37.tRNA.fa
+		fi
+
+		if [ ! -f $sequences/GRCh37.tRNA.CCA.fa ];then
+			Rscript $scriptDir/source/appendCCAinFasta.r $sequences/GRCh37.tRNA.fa $sequences/GRCh37.tRNA.CCA.fa		
+		fi		
+	
+		GENOME=$sequences/GRCh37.primary_assembly.genome.fa
+		ANNOTATION=$annotation/gencode.GRCh37.primary_assembly.annotation.gtf
+
+		if [ ! -f $sequences/rDNA.Genbank.U13369.1.fa ] ;then
+			curl "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nucleotide&amp;id=U13369.1&amp;rettype=fasta" > $sequences/rDNA.Genbank.U13369.1.fa
+		fi
+
+		if [ ! -f $sequences/28SrRNAPseudogenes.Genebank.U67616.1.fa ] ;then
+			curl "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nucleotide&amp;id=U67616.1&amp;rettype=fasta" > $sequences/28SrRNAPseudogenes.Genebank.U67616.1.fa
+		fi		
+	;;
+
+	GRCh38)
+		if [ ! -f $sequences/GRCh38.all.genome.fa ];then
+			wget -O $sequences/GRCh38.all.genome.fa.gz ftp://ftp.sanger.ac.uk/pub/gencode/Gencode_human/release_26/GRCh38.p10.genome.fa.gz
+			gunzip $sequences/GRCh38.all.genome.fa.gz		
+		fi
+
+		if [ ! -f $annotation/gencode.GRCh38.chr_patch_hapl_scaff.annotation.gtf ];then
+			wget -O $annotation/gencode.GRCh38.chr_patch_hapl_scaff.annotation.gtf.gz \
+			ftp://ftp.sanger.ac.uk/pub/gencode/Gencode_human/release_26/gencode.v26.chr_patch_hapl_scaff.annotation.gtf.gz
+			gunzip $annotation/gencode.GRCh38.chr_patch_hapl_scaff.annotation.gtf.gz		
+		fi
+
+		if [ ! -f $sequences/GRCh38.tRNA.fa ];then
+			curl "http://gtrnadb.ucsc.edu/genomes/eukaryota/Hsapi38/hg38-tRNAs.fa" > $sequences/GRCh38.tRNA.fa
+		fi
+		
+		if [ ! -f $sequences/GRCh38.tRNA.CCA.fa ];then
+			Rscript $scriptDir/source/appendCCAinFasta.r $sequences/GRCh38.tRNA.fa $sequences/GRCh38.tRNA.CCA.fa		
+		fi
+
+		GENOME=$sequences/GRCh38.all.genome.fa
+		ANNOTATION=$annotation/gencode.GRCh38.chr_patch_hapl_scaff.annotation.gtf
+
+		if [ ! -f $sequences/rDNA.Genbank.U13369.1.fa ] ;then
+			curl "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nucleotide&amp;id=U13369.1&amp;rettype=fasta" > $sequences/rDNA.Genbank.U13369.1.fa
+		fi
+
+		if [ ! -f $sequences/28SrRNAPseudogenes.Genebank.U67616.1.fa ] ;then
+			curl "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nucleotide&amp;id=U67616.1&amp;rettype=fasta" > $sequences/28SrRNAPseudogenes.Genebank.U67616.1.fa
+		fi
+	;;
+		
+	*)
+		status="### $reference isn't a valid input argument"
+		echo $status
+		exit 0
+		;;
+esac
+
+echo -e "$GENOME\n$ANNOTATION\n$PREFIX" > $info
diff --git a/source/extractMolecularBarcode.py b/source/extractMolecularBarcode.py
new file mode 100755
index 0000000..56397a2
--- /dev/null
+++ b/source/extractMolecularBarcode.py
@@ -0,0 +1,87 @@
+#!/usr/bin/env python
+
+"""
+Date : April 24th 2014
+
+Author : Julia di Iulio
+Modified: Annkatrin Bressin 
+
+script used to extract molecular barcodes from the reads (fastq format) but keeping them associated to the read.
+Running the script will output :
+        a new fastq file (with the molecular barcode removed from the read sequence, but the information is kept in the
+        header of the read)
+        a file containing the molecular barcode counts
+        a file containing the ligation hexamer counts
+
+The two files containing the counts can be further used to investigate the distribution of the molecular barcodes and/or
+the ligation hexamers (script not provided).
+
+use : python extractMolecularBarcode.py inFastq outFastq outBarcodes outLigation
+
+"""
+
+import sys, itertools
+
+iFastq=open(sys.argv[1], 'r')
+oFastq=open(sys.argv[2], 'w')
+oBarcode=open(sys.argv[3], 'w')
+oLigation=open(sys.argv[4], 'w')
+oLen=open(sys.argv[5], 'w')
+
+
+dicoBarcode={}  # creates a dictionnary that will contain Molecular Barcode counts (see below)
+dicoLigation={} # creates a dictionnary that will contain ligation hexamer counts (see below)
+nct='ACTGN'     # nct stands for nucleotide
+
+# fill two dictionaries with the keys being all possible molecular barcodes/ligation hexamers made with the letters 'A', 'C', 'G',
+# 'T' and 'N', and the values being set to 0 for now; the values will be incremented every time a molecular barcode/ligation hexamers
+# is identified in a read
+for barcode in list(itertools.product(nct, repeat=6)):
+    dicoBarcode["".join(barcode)] = 0
+    dicoLigation["".join(barcode)] = 0
+
+
+header= iFastq.readline().rstrip() # reads the first line of the fastq file (which corresponds to the header of the first read)
+while header != '':
+    totseq   = iFastq.readline() # reads the sequence of the first read (the first 6 nucleotides being the molecular barcode (MBC))
+    plus     = iFastq.readline() # reads the 3rd line of a read in a fastq format
+    totqual  = iFastq.readline() # reads the quality of the read (also containing the PHRED quality score of the MBC)
+    barcode  = totseq[0:6]    # assigns the 6 first nucleotide (nt) of the read sequence to the variable "barcode"
+    ligation = totseq[3:9]    # assigns the last 3 nts of the MBC and the first 3 nts of the RNA fragment to the variable "ligation"
+    seq  = totseq[6:]         # assigns the RNA fragment sequence to the variable "seq"
+    qual = totqual[6:]        # assigns the RNA fragment PHRED quality score to the variable "qual"
+    oFastq.write(header.split(" ")[0]+'_MolecularBarcode:'+barcode+' '+header.split(" ")[1]+'\n')
+    # writes the header of the reads (containing the MBC info) to the output fastq file
+    oFastq.write(seq)  # writes the RNA fragment sequence to the output fastq file
+    oFastq.write(plus) # writes the 3rd line of a read in a fastq format to the output fastq file
+    oFastq.write(qual) # writes the RNA fragment PHRED quality score to the output fastq file
+    header= iFastq.readline().rstrip() # read the header of the following read
+    sequenceLen=len(seq)-1
+    ## in case the user doesn't use STAR for the alignment, he/she may have used an adapter trimmer to remove the adapter at
+    ## the 3'end of the reads in which case and/or cleaned the reads from the 3'end... so that some sequences will be shorter
+    ## than 6 or 9 nts... in which case we can not extract the info of the MBC and/or the ligation.
+    ## In such case uncomment the 4 next lines and comment the following lines.
+    #if len(totseq) >= 6 :
+    #    dicoBarcode[barcode] += 1
+    #if len(totseq) >= 9 :
+    #    dicoLigation[ligation] += 1
+    dicoBarcode[barcode] += 1   # adds 1 count in the MBC dictionary to the specific MBC found in the read
+    dicoLigation[ligation] += 1 # adds 1 count in the ligation dictionary to the specific ligation found in the read
+
+for barcode, times in dicoBarcode.items():              # outputs a file containing each possible MBC (column 1) and the respective
+    oBarcode.write("%s\t%s\n" % (barcode, str(times)))  # number of times (column 2) it was found in a read.
+
+for ligation, times in dicoLigation.items():            # outputs a file containing each possible ligation hexamer (column 1) and
+    oLigation.write("%s\t%s\n" % (ligation, str(times)))# the respective number of times (column 2) it was found in a read
+
+oLen.write("%s" % str(sequenceLen))
+
+# close files
+iFastq.close()
+oFastq.close()
+oBarcode.close()
+oLigation.close()
+oLen.close()
+
+
+
diff --git a/source/extractReadsWithMismatchesIn6FirstNct.py b/source/extractReadsWithMismatchesIn6FirstNct.py
new file mode 100755
index 0000000..d5d1ffd
--- /dev/null
+++ b/source/extractReadsWithMismatchesIn6FirstNct.py
@@ -0,0 +1,40 @@
+#!/usr/bin/env python
+
+"""
+
+Date : May 9th 2014
+
+Author : Julia di Iulio
+
+Script used to filter (from the alignment obtained with the reads that contained the molecular barcode in the beginning of
+their sequence) reads that contain mismatches in the 6 first nucleotides (nts). The rationale is that it is extremely
+unlikely (probability = 1/4e6) that the molecular barcode (which is a random hexamer sequence) corresponds exactly to the
+genomic sequence next to where the rest of the read aligned to. So among the reads that aligned (even though they still
+contained the molecular barcode in the beginning of their sequence), we will extract the ones that did not have any
+mismatches in the 6 first nts. Indeed, those reads are most likely due to mispriming event from the reverse transcriptase
+primer. We will then remove those reads from the alignment obtained with the reads that did not contain the molecular
+barcode in the beginning of their sequence.
+
+use : python extractReadsWithMismatchesIn6FirstNct.py stdin stdout
+
+"""
+
+import sys, pysam, re
+
+iBAM = pysam.Samfile('-', 'r') # reads from the standard input
+oBAM = pysam.Samfile('-', 'w', template=iBAM) # output to the standard output
+
+for line in iBAM:
+    md=re.findall(r'\d+', [tag[1] for tag in line.tags if tag[0]=='MD'][0])	# MD: string of Match and Mismatches positions, start after soft clipping (left most),
+    if len(md) == 1 :       # if there are no mismatches
+        oBAM.write(line)    # write the alignment into the output file
+    else:
+        if (not line.is_reverse) and (int(md[0]) >= 6): # if the first mismatch occurs after the 6th nt (from the 5' end)
+            oBAM.write(line)                           # write the alignment into the output file
+        elif (line.is_reverse) and (int(md[-1]) >= 6):  # same as above but for reads that align to the reverse strand, last element because we are interested in the 5 ' end
+            oBAM.write(line)
+
+# close files
+iBAM.close()
+oBAM.close()
+
diff --git a/source/getSICoordinates1Based.r b/source/getSICoordinates1Based.r
new file mode 100755
index 0000000..c2d22d2
--- /dev/null
+++ b/source/getSICoordinates1Based.r
@@ -0,0 +1,50 @@
+suppressPackageStartupMessages(library(data.table))
+
+args <- commandArgs(trailingOnly = TRUE) 
+
+file=args[1]
+out=args[2]
+
+table=fread(file,sep="\t",header=F,fill=T) #stdin()
+table=table[table$V3=="exon",]
+
+temp=unlist(strsplit(as.character(table$V9),split="; "))
+exon=temp[grepl("exon_number [1-9]+", temp, perl=T)]
+table$V10=as.numeric(sub("exon_number ([1-9]+)", "\\1", exon, perl=T))
+
+temp=unlist(strsplit(as.character(table$V9),split="; "))
+transcriptId=temp[grepl("transcript_id \"*\"", temp, perl=T)]
+table$V11=as.character(sub("transcript_id \"*\"", "\\1", transcriptId, perl=T))
+
+x=table[,.(exonNum=length(V10)),by=V11]
+
+table$V12=rep(x$exonNum,x$exonNum)
+
+
+Result=c()
+
+#nrow(table)
+for (i in 1:nrow(table)){
+  if ((i%%100000)==0){
+    print(i)
+  }
+  # 3' Exon positive strand
+  if (table[i,10]!=table[i,12] && table[i,7]=="+"){
+    Result=c(Result,paste(table[i,1],"pos",table[i,5],sep="_"))
+  }
+  # 3' Intron positive strand
+  if (table[i,10]!=1 && table[i,7]=="+"){
+    Result=c(Result,paste(table[i,1],"pos",as.numeric(table[i,4])-1,sep="_"))
+  }
+  # 3' Exon negative strand strand
+  if (table[i,10]!=table[i,12] && table[i,7]=="-"){
+    Result=c(Result,paste(table[i,1],"neg",table[i,4],sep="_"))
+  }
+  # 3' Intron negative strand
+  if (table[i,10]!=1 && table[i,7]=="-"){
+    Result=c(Result,paste(table[i,1],"neg",as.numeric(table[i,5])+1,sep="_"))
+  }
+}
+
+write.table(unique(Result), col.names = F, row.names = F, quote=F, file=out)
+
diff --git a/source/joinBedGraph.r b/source/joinBedGraph.r
new file mode 100755
index 0000000..a880e1b
--- /dev/null
+++ b/source/joinBedGraph.r
@@ -0,0 +1,29 @@
+args <- commandArgs(trailingOnly = TRUE) 
+options(scipen=999)
+suppressPackageStartupMessages(library(data.table))
+
+bedGraph=args[1]
+
+bedGraph=fread(bedGraph, showProgress=F)
+bedGraphResult=cbind(bedGraph[,c(1,2,3)],rowSums(bedGraph[,-c(1,2,3)]))
+
+colnames(bedGraphResult)=c("V1","V2","V3","V4")
+bedGraphResult=bedGraphResult[bedGraphResult$V4!=0,]
+
+index=which((bedGraphResult$V3-bedGraphResult$V2)>1)
+new=data.table(V1=character(),V2=numeric(),V3=numeric(),V4=numeric())
+for (idx in index){
+  num=bedGraphResult$V3[idx]-bedGraphResult$V2[idx]
+  for (j in 0:(num-1)){
+    new=rbind(new,data.table(bedGraphResult$V1[idx],
+                           bedGraphResult$V2[idx]+j,
+                           bedGraphResult$V2[idx]+j+1,
+                           bedGraphResult$V4[idx]))  
+  } 
+}
+
+bedGraphResult=rbind(bedGraphResult[-index,],new)
+
+write.table(bedGraphResult,row.names = F,col.names = F, quote = F, sep="\t")
+
+
diff --git a/source/normalizeBedgraph.sh b/source/normalizeBedgraph.sh
new file mode 100755
index 0000000..f74fb8b
--- /dev/null
+++ b/source/normalizeBedgraph.sh
@@ -0,0 +1,43 @@
+#!/usr/bin/env bash
+
+Files=$1
+result_dir=$2
+mapping_dir=$3
+scriptDir=$4
+normalized_dir=$5
+sampleID=$6
+reference=$7
+
+alignedRef=0
+alignedSpikeIn=0
+
+FILES=`echo $Files | tr ";" "\n"`
+
+for f in $FILES;
+do	
+	base=`basename $f`
+	name=${base%.*}
+	# number of reads	
+	temp=`less $mapping_dir/$name/${name}_Aligned.aligned.stat|cut -f1`
+	alignedRefTemp=$((alignedRef + temp))
+	alignedRef=$alignedRefTemp
+
+	temp=`less $mapping_dir/$name/${name}_Aligned.aligned.stat|cut -f2`
+	alignedSpikeInTemp=$((alignedSpikeIn + temp))
+	alignedSpikeIn=$alignedSpikeInTemp	
+done
+
+echo -e "$alignedRef\t$alignedSpikeIn" > $mapping_dir/Aligned.stat
+
+Rscript $scriptDir/source/normalizedBedgraph.r $result_dir/${sampleID}_uniq_noRTbias_noPCRdup_noSI_${reference}_stt_pos.bedGraph $alignedSpikeIn \
+> $normalized_dir/${sampleID}_uniq_noRTbias_noPCRdup_noSI_spikeInNorm_${reference}_stt_pos.bedGraph
+
+Rscript $scriptDir/source/normalizedBedgraph.r $result_dir/${sampleID}_uniq_noRTbias_noPCRdup_noSI_${reference}_stt_neg.bedGraph $alignedSpikeIn \
+> $normalized_dir/${sampleID}_uniq_noRTbias_noPCRdup_noSI_spikeInNorm_${reference}_stt_neg.bedGraph
+
+Rscript $scriptDir/source/normalizedBedgraph.r $result_dir/${sampleID}_uniq_noRTbias_noPCRdup_noSI_${reference}_stt_pos.bedGraph $alignedRef \
+> $normalized_dir/${sampleID}_uniq_noRTbias_noPCRdup_noSI_tradNorm_${reference}_stt_pos.bedGraph
+
+Rscript $scriptDir/source/normalizedBedgraph.r $result_dir/${sampleID}_uniq_noRTbias_noPCRdup_noSI_${reference}_stt_neg.bedGraph $alignedRef \
+> $normalized_dir/${sampleID}_uniq_noRTbias_noPCRdup_noSI_tradNorm_${reference}_stt_neg.bedGraph
+
diff --git a/source/normalizedBedgraph.r b/source/normalizedBedgraph.r
new file mode 100755
index 0000000..ec98bfb
--- /dev/null
+++ b/source/normalizedBedgraph.r
@@ -0,0 +1,13 @@
+args <- commandArgs(trailingOnly = TRUE) 
+suppressPackageStartupMessages(library(data.table))
+
+bedgraph=args[1]
+read_number=as.numeric(args[2])
+
+Bedgraph=fread(bedgraph,fill=T,sep="\t")
+
+scaling=1/(read_number/1000000)
+
+Bedgraph$V4=round(Bedgraph$V4*scaling,2)
+
+write.table(Bedgraph,sep="\t",quote =F,col.names = F,row.names = F)
diff --git a/source/removePCRduplicateStringent.py b/source/removePCRduplicateStringent.py
new file mode 100755
index 0000000..f85f9a6
--- /dev/null
+++ b/source/removePCRduplicateStringent.py
@@ -0,0 +1,72 @@
+#!/usr/bin/env python
+
+"""
+
+Date : May 2nd 2014
+
+Author : Julia di Iulio
+
+script used to extract all reads starting at each position from a coverage file (only position with score > 0), to then
+keep only 1 read per molecular barcode (to avoid getting signal from PCR duplicates). Ideally we should check the
+distribution of the MB, but as it is unlikely to have very high coverage at a single position (due to the large size of the
+human (un-)stable transcriptome), we are gonna chose the easy way and keep only 1 read per molecular barcode.
+In the future, we could also take into consideration the insert size of the read, and the mismatches if any.
+
+use : python removePCRduplicateStringent.py inBedGraph1 (with start coverage)                                                  [1]
+                                            inTxt1  (file containing splicing intermediates (SI) positions with the folowing
+                                                    nomenclature : chrX_strand_Start (where Start position are 1 based)        [2]
+                                            inBAM1  (input BAM file with only unique alignment)                                [3]
+                                            outBedGraph1 (containing no SI, and only non duplicated reads)                          [4]
+                                            outBedGraph2 (containing only non duplicated reads, but still containing the SI if any) [5]
+
+"""
+
+import sys, pysam, os, numpy, re
+
+
+iBG1  = open(sys.argv[1], 'r')
+iTXT1 = set(open(sys.argv[2], 'r').readlines())
+iBAM1 = pysam.Samfile(sys.argv[3], 'rb')
+oBG1 = open(sys.argv[4], 'w')
+oBG2 = open(sys.argv[5], 'w')
+
+if '_pos.bedGraph' in sys.argv[1]:
+    std = 'pos'
+elif '_neg.bedGraph' in sys.argv[1]:
+    std = 'neg'
+
+#iBG1.readline() # to skip the first line describing bedfile
+count = 0
+
+for bgLine in iBG1:
+    count += 1
+    print(count)
+    chrom, st, ed , cov = bgLine.rstrip().split('\t')
+    if float(cov) > 0:
+        for j in range(0,(int(ed)-int(st))):
+            MB = set()
+            SI = 0
+            woSI = 0 # without SI
+            wiSI = 0 # with SI
+            if chrom+"_"+std+"_"+str(int(st)+j+1)+"\n" in iTXT1:
+                SI = 1
+            for line in iBAM1.fetch(chrom, int(st)+j, int(st)+j+1):
+                if (line.is_reverse and std=='pos' and line.aend-1 == int(st)) or \
+                        (not line.is_reverse and std=='neg' and line.pos == int(st)):
+                    mb = line.qname.split('_MolecularBarcode:')[1]
+                    if mb not in MB:
+                        MB.add(mb)
+                        if SI == 0:
+                            woSI += 1
+                        wiSI += 1
+            
+            if SI == 0 and woSI > 0:
+                oBG1.write("%s\t%s\t%s\t%i\n" % (chrom, str(int(st)+j), str(int(st)+j+1), woSI))
+            if wiSI > 0:
+                oBG2.write("%s\t%s\t%s\t%i\n" % (chrom, str(int(st)+j), str(int(st)+j+1), wiSI))
+
+
+iBG1.close()
+iBAM1.close()
+oBG1.close()
+oBG2.close()
diff --git a/source/removePCRduplicates.sh b/source/removePCRduplicates.sh
new file mode 100755
index 0000000..543cd9b
--- /dev/null
+++ b/source/removePCRduplicates.sh
@@ -0,0 +1,150 @@
+cov_dir=$1
+bam_dir=$2
+scriptDir=$3
+SIfile=$4
+sampleID=$5
+samtools=$6
+idxDir=$7
+bedtools=$8
+
+iBamUniq=$bam_dir/${sampleID}_noRTbias.sorted_uniq.bam
+iCovUniqP=$cov_dir/${sampleID}_uniq_noRTbias_stt_pos.bedGraph
+iCovUniqN=$cov_dir/${sampleID}_uniq_noRTbias_stt_neg.bedGraph
+   
+oCovUniqP=$cov_dir/${sampleID}_uniq_noRTbias_noPCRdup_stt_pos.bedGraph
+oCovUniqN=$cov_dir/${sampleID}_uniq_noRTbias_noPCRdup_stt_neg.bedGraph
+
+oCovUniqPnoSI=$cov_dir/${sampleID}_uniq_noRTbias_noPCRdup_noSI_stt_pos.bedGraph
+oCovUniqNnoSI=$cov_dir/${sampleID}_uniq_noRTbias_noPCRdup_noSI_stt_neg.bedGraph
+
+bam_split=$bam_dir/Contigs
+cov_split=$cov_dir/Contigs
+
+mkdir -p $cov_split
+mkdir -p $bam_split
+echo $idxDir
+grep -P -v ".*tRNA.*" $idxDir/chrName.txt > $bam_split/chrom.txt
+
+while read contig; do
+	echo $contig
+	bamContig=$bam_split/${sampleID}_noRTbias.$contig.bam
+
+	bedGraphContigP=$cov_split/${sampleID}_${contig}_pos.bedGraph
+	bedGraphContigPNoPCR=$cov_split/${sampleID}_${contig}_noPCRdup_pos.bedGraph
+	bedGraphContigPNoSINoPCRdup=$cov_split/${sampleID}_${contig}_noPCRdup_noSI_pos.bedGraph
+
+	bedGraphContigN=$cov_split/${sampleID}_${contig}_neg.bedGraph
+	bedGraphContigNNoPCR=$cov_split/${sampleID}_${contig}_noPCRdup_neg.bedGraph
+	bedGraphContigNNoSINoPCRdup=$cov_split/${sampleID}_${contig}_noPCRdup_noSI_neg.bedGraph		
+
+	if [ ! -f $bamContig ];then
+		$samtools view -b $iBamUniq $contig > $bamContig
+	fi
+	if [ ! -f $bamContig.bai ];then
+		$samtools index $bamContig
+	fi
+
+	if [ ! -f $bedGraphContigP ];then
+		grep -P "^$contig\t" $iCovUniqP > $bedGraphContigP
+	fi
+	
+	if [ ! -f $bedGraphContigN ];then
+		grep -P "^$contig\t" $iCovUniqN > $bedGraphContigN
+	fi
+		
+	if [ ! -f $bedGraphContigPNoPCR ] || [ ! -f $bedGraphContigPNoSINoPCRdup ];then
+		python $scriptDir/source/removePCRduplicateStringent.py $bedGraphContigP $SIfile $bamContig $bedGraphContigPNoSINoPCRdup $bedGraphContigPNoPCR
+	fi
+
+	if [ ! -f $bedGraphContigNNoPCR ] || [ ! -f $bedGraphContigNNoSINoPCRdup ];then
+		python $scriptDir/source/removePCRduplicateStringent.py $bedGraphContigN $SIfile $bamContig $bedGraphContigNNoSINoPCRdup $bedGraphContigNNoPCR
+	fi
+	 
+done <$bam_split/chrom.txt
+list=`grep -P ".*tRNA.*" $idxDir/chrName.txt | tr "\n" " "`
+bamContig=$bam_split/${sampleID}_noRTbias.tRNA.bam
+
+bedGraphContigP=$cov_split/${sampleID}_tRNA_pos.bedGraph
+bedGraphContigPNoPCR=$cov_split/${sampleID}_tRNA_noPCRdup_pos.bedGraph
+bedGraphContigPNoSINoPCRdup=$cov_split/${sampleID}_tRNA_noPCRdup_noSI_pos.bedGraph
+
+bedGraphContigN=$cov_split/${sampleID}_tRNA_neg.bedGraph
+bedGraphContigNNoPCR=$cov_split/${sampleID}_tRNA_noPCRdup_neg.bedGraph
+bedGraphContigNNoSINoPCRdup=$cov_split/${sampleID}_tRNA_noPCRdup_noSI_neg.bedGraph		
+
+if [ ! -f $bamContig ];then
+	$samtools view -b $iBamUniq $list > $bamContig
+fi
+if [ ! -f $bamContig.bai ];then
+	$samtools index $bamContig
+fi
+
+if [ ! -f $bedGraphContigP ];then
+	grep -P "^.*tRNA.*\t" $iCovUniqP > $bedGraphContigP
+fi
+	
+if [ ! -f $bedGraphContigN ];then
+	grep -P "^.*tRNA.*\t" $iCovUniqN > $bedGraphContigN
+fi
+		
+if [ ! -f $bedGraphContigPNoPCR ] || [ ! -f $bedGraphContigPNoSINoPCRdup ];then
+	python $scriptDir/source/removePCRduplicateStringent.py $bedGraphContigP $SIfile $bamContig $bedGraphContigPNoSINoPCRdup $bedGraphContigPNoPCR
+fi
+
+if [ ! -f $bedGraphContigNNoPCR ] || [ ! -f $bedGraphContigNNoSINoPCRdup ];then
+	python $scriptDir/source/removePCRduplicateStringent.py $bedGraphContigN $SIfile $bamContig $bedGraphContigNNoSINoPCRdup $bedGraphContigNNoPCR 
+fi
+
+oCovUniqP=$cov_dir/${sampleID}_uniq_noRTbias_noPCRdup_stt_pos.bedGraph
+oCovUniqN=$cov_dir/${sampleID}_uniq_noRTbias_noPCRdup_stt_neg.bedGraph
+
+oCovUniqPnoSI=$cov_dir/${sampleID}_uniq_noRTbias_noPCRdup_noSI_stt_pos.bedGraph
+oCovUniqNnoSI=$cov_dir/${sampleID}_uniq_noRTbias_noPCRdup_noSI_stt_neg.bedGraph
+
+echo "tRNA" | cat $bam_split/chrom.txt - > $bam_split/chrom_tRNA.txt
+
+
+while read contig; do
+	echo $contig
+	bedGraphContigPNoPCR=$cov_split/${sampleID}_${contig}_noPCRdup_pos.bedGraph
+	bedGraphContigPNoSINoPCRdup=$cov_split/${sampleID}_${contig}_noPCRdup_noSI_pos.bedGraph
+
+	bedGraphContigNNoPCR=$cov_split/${sampleID}_${contig}_noPCRdup_neg.bedGraph
+	bedGraphContigNNoSINoPCRdup=$cov_split/${sampleID}_${contig}_noPCRdup_noSI_neg.bedGraph		
+	
+	if [ ! -f $oCovUniqP ] || [ ! -f $oCovUniqN ] || [ ! -f $oCovUniqPnoSI ] || [ ! -f $oCovUniqNnoSI ];then
+		rm -f $oCovUniqP
+		rm -f $oCovUniqN
+
+		rm -f $oCovUniqPnoSI
+		rm -f $oCovUniqNnoSI
+
+		cp $bedGraphContigPNoPCR $oCovUniqP
+		cp $bedGraphContigNNoPCR $oCovUniqN
+
+		cp $bedGraphContigPNoSINoPCRdup $oCovUniqPnoSI
+		cp $bedGraphContigNNoSINoPCRdup $oCovUniqNnoSI
+	else
+		cat $oCovUniqPnoSI $bedGraphContigPNoSINoPCRdup > $oCovUniqPnoSI.temp
+		cat $oCovUniqNnoSI $bedGraphContigNNoSINoPCRdup > $oCovUniqNnoSI.temp
+
+		cat $oCovUniqP $bedGraphContigPNoPCR > $oCovUniqP.temp
+		cat $oCovUniqN $bedGraphContigNNoPCR > $oCovUniqN.temp
+
+		mv $oCovUniqPnoSI.temp $oCovUniqPnoSI
+		mv $oCovUniqNnoSI.temp $oCovUniqNnoSI
+		mv $oCovUniqP.temp $oCovUniqP
+		mv $oCovUniqN.temp $oCovUniqN
+	fi
+
+done < $bam_split/chrom_tRNA.txt
+
+$bedtools sort -i $oCovUniqPnoSI > $oCovUniqPnoSI.temp
+$bedtools sort -i $oCovUniqNnoSI > $oCovUniqNnoSI.temp
+$bedtools sort -i $oCovUniqP > $oCovUniqP.temp
+$bedtools sort -i $oCovUniqN > $oCovUniqN.temp
+
+cp $oCovUniqPnoSI.temp $oCovUniqPnoSI
+cp $oCovUniqNnoSI.temp $oCovUniqNnoSI
+cp $oCovUniqP.temp $oCovUniqP
+cp $oCovUniqN.temp $oCovUniqN
diff --git a/source/rtRemoval.sh b/source/rtRemoval.sh
new file mode 100755
index 0000000..2ccdbf9
--- /dev/null
+++ b/source/rtRemoval.sh
@@ -0,0 +1,52 @@
+bam=$1
+name=$2
+samtools=$3
+Bam1=$4
+Bam2=$5
+scriptDir=$6
+
+$samtools view -H $Bam1 > $bam/headers.sam # extracting header of mapping without barcode
+    	
+# convert bam to sam and make barcode visible for mapping without barcode		
+
+if [ ! -f $bam/temp.sam ];then # check if analysis already done
+	$samtools view -@ 20 $Bam1 | sed 's/_MolecularBarcode/\t_MB/' | sort -k1,1 > $bam/temp.sam
+fi
+
+if [ ! -f $bam/rtBiasnames.txt ];then # check if analysis already done
+	$samtools view -@ 20 -h $Bam2 | python $scriptDir/source/extractReadsWithMismatchesIn6FirstNct.py | cut -f1 |sort -k1,1 | uniq \
+	> $bam/rtBiasnames.txt
+fi
+   		
+# remove hits from rtBiasnames  # join read name and header again # compress to bam file after
+
+if [ ! -f $bam/${name}_noRTbias.sorted_uniq.bam ];then # check if analysis already done
+	join -v 1 -1 1 -2 1 $bam/temp.sam \
+	$bam/rtBiasnames.txt | sed -e 's/ _MB/_MolecularBarcode/' -e 's/ /\t/g' | \
+	cat $bam/headers.sam - | \
+	$samtools view -@ 20 -Sb -o $bam/temp_noRTbias.bam - 
+fi
+
+if [ ! -f $bam/${name}_noRTbias.sorted.bam ];then # check if analysis already done
+	$samtools sort -@ 20 -m 5G $bam/temp_noRTbias.bam \
+	$bam/${name}_noRTbias.sorted
+fi
+
+if [ ! -f $bam/${name}_noRTbias.sorted.bam.bai ];then # check if analysis already done
+	$samtools index $bam/${name}_noRTbias.sorted.bam
+fi
+
+rm -f $bam/temp.sam
+rm -f $bam/temp_noRTbias.bam
+    	
+if [ ! -f $bam/${name}_noRTbias.sorted_uniq.bam ];then # check if analysis already done
+	$samtools view -@ 20 -b -h -q 50 $bam/${name}_noRTbias.sorted.bam \
+	> $bam/${name}_noRTbias.sorted_uniq.bam
+fi
+
+if [ ! -f $bam/${name}_noRTbias.sorted_uniq.bam.bai ];then # check if analysis already done
+	$samtools index $bam/${name}_noRTbias.sorted_uniq.bam
+fi
+
+
+