net-seq-pipeline

MayerGroup · Nov 17, 2017 · 60a4f2b · 60a4f2b
commit 60a4f2b
Show file tree

Hide file tree

Showing 19 changed files with 1,754 additions and 0 deletions.
diff --git a/NetSeqPipeline b/NetSeqPipeline
diff --git a/README.md b/README.md
@@ -0,0 +1,28 @@
+# NET-seq Pipeline
+
+## REQUIREMENTS
+- bedtools
+- curl
+- python 2.7, moduls:
+    - HTSeq
+    - pysam
+- R
+- samtools
+- STAR (2.5.3a)
+- wget
+- bzip2 or gzip if file is compressed, respectively
+
+## STEPS
+1. Molecular Barcode Extraction
+2. Reads mapping (both sets: with and without barcodes)
+3. Reverse transcriptase mispriming events removal
+4. Recording of the 5' end position (3' end position of the original nascent RNA)
+5. PCR duplicates removal
+6. Removal of sequencing reads due to splicing intermediates
+
+## USAGE
+1. Fill in the parameter files: parameter_dependencies.in and parameter_template.in
+2. Make the file runNetSeqPipe.netPipe executable
+    chmod -x runNetSeqPipe.netPipe
+3. Run the runNetSeqPipe.netPipe file
+    bash runNetSeqPipe.netPipe
diff --git a/parameter_STAR.in b/parameter_STAR.in
diff --git a/parameter_dependencies.in b/parameter_dependencies.in
@@ -0,0 +1,25 @@
+# starIndex: provide the full path to the STAR index directory
+# NetSeqPipeline will:
+#		1) create a large index files for the selected reference genome
+#       2) if index has been previously created: make sure it is available in the given starIndex location 
+#			in a subdirectory with name of the reference
+#			example: for a reference genome e.g. GRCh38, make sure the file is located in /pathToStarIndex/GRCh38/
+# IMPORTANT: make sure you have write permission
+# example: starIndex=/pathToStarIndex
+starIndex=
+
+# parametersSTAR: provide the full path to the STAR parameter directory + name of the parameter file
+# example: parametersSTAR=/pathToParameters/parameter_STAR.in
+parametersSTAR=
+
+# starApp: provide the full path to the STAR application directory + name of the application executable
+# example: starApp=/pathToStarApp/STAR
+starApp=
+
+# samtools: provide the full path to the samtools application directory + name of the application executable
+# example: samtools=/PathToSamtools/samtools
+samtools=
+
+# bedtools: provide the full path to the bedtools application directory + name of the application executable
+# example: bedtools=/PathToBedtools/bedtools
+bedtools=
diff --git a/parameter_template.in b/parameter_template.in
@@ -0,0 +1,31 @@
+# sampleID: necessary for naming the output files
+# example: sampleID=AB01 
+sampleID=
+
+# condition: necessary for naming the output files
+# example: condition=treated
+condition=
+
+# NetSeqApp: provide full path to the NeqSeqPipeline directory + name of the application executable
+# example: NetSeqApp=/pathToFolderContainingPipeline/NetSeqPipeline
+NetSeqApp=
+
+# pathToOutputFiles: provide the path to the requested output directory
+# IMPORTANT: make sure you have write permission
+# example: pathToOutputFiles=/outputDirectory 
+pathToOutputFiles=
+
+# files: provide full path + name of the fastq file(s)
+# IMPORTANT: make sure you have read permission
+# example: files=/pathToInputDirectory/samp1e.fq 
+files=
+
+# compression: provide compression method of the input file
+# IMPORTANT REMARK: choose from bz2, gz, none
+# example: compression=none
+compression=
+
+# reference: provide reference genome for the mapping
+# example: reference=GRCh38
+reference=
+
diff --git a/runNetSeqPipe.netPipe b/runNetSeqPipe.netPipe
@@ -0,0 +1,13 @@
+#!/usr/bin/env bash
+
+# version v2: Annkatrin Bressin, Martyna Gajos
+# script to run NetSeqPipeline
+# provide parameters for the analysis in the files: parameter_template.in and parameter_dependencies.in
+
+source parameter_template.in
+source parameter_dependencies.in
+
+mkdir -p $pathToOutputFiles
+
+bash $NetSeqApp -o $pathToOutputFiles -id $sampleID -f $files -c $compression -r $reference -si $starIndex -sa $starApp -st $samtools -bt $bedtools -p $parametersSTAR
+
diff --git a/source/appendCCAinFasta.r b/source/appendCCAinFasta.r
@@ -0,0 +1,18 @@
+suppressPackageStartupMessages(library(Biostrings))
+
+args <- commandArgs(trailingOnly = TRUE) 
+
+file=args[1]
+outfile=args[2]
+
+data=readDNAStringSet(file)
+
+seq=paste(data)
+names=names(data)
+
+seq=paste(seq,"CCA",sep="")
+
+data2=DNAStringSet(x=seq)
+names(data2) <- names
+
+writeXStringSet(data2,filepath = outfile, format="fasta")
diff --git a/source/coverage.sh b/source/coverage.sh
@@ -0,0 +1,17 @@
+samtools=$1
+Bam=$2
+scriptDir=$3
+outName=$4
+
+$samtools view -@20 $Bam | sed 's/\tjM.*$//' | python $scriptDir/source/customCoverage.py $outName
+
+tail -n +2 ${outName}stt_neg.bedGraph > ${outName}stt_neg.bedGraph.temp
+tail -n +2 ${outName}stt_pos.bedGraph > ${outName}stt_pos.bedGraph.temp
+
+cp ${outName}stt_neg.bedGraph.temp ${outName}stt_neg.bedGraph
+cp ${outName}stt_pos.bedGraph.temp ${outName}stt_pos.bedGraph
+
+# jM:B:c,M1,M2,...
+# intron motifs for all junctions (i.e. N in CIGAR): 
+# 0: non-canonical; 1:GT/AG, 2:CT/AC, 3:GC/AG, 4:CT/GC, 5:AT/AC, 6:GT/AT.
+# If splice junctions database is used, and a junction is annotated, 20 is added to its motif value 
diff --git a/source/customCoverage.py b/source/customCoverage.py
@@ -0,0 +1,32 @@
+#! /usr/bin/prun python-2 python
+
+"""
+date : 17 april 2013
+
+Author : Julia di Iulio
+
+Script used to monitor coverage (either the whole read, the start of the read (which represents PolII position),
+or the end of the read) taking into account the number of times the read mapped (given by the NH:i: tag in the
+alignment file): by giving a weight inversely proportional to the number of times the read mapped.
+This script is highly inspired from http://www-huber.embl.de/users/anders/HTSeq/doc/tour.html
+
+use: python customCoverage.py stdin (alignment file in sam format) Experiment Name [1]
+
+"""
+
+
+import sys, HTSeq
+
+stt = HTSeq.GenomicArray( "auto", stranded=True, typecode='i' ) #stt stands for start of reads
+
+alignment_file = HTSeq.SAM_Reader(sys.stdin)
+
+for algt in alignment_file: 
+    if algt.aligned:    
+        # extract the start position of the read as a genomic interval
+        stt_iv = HTSeq.GenomicInterval( algt.iv.chrom , algt.iv.start_d , algt.iv.start_d + 1, algt.iv.strand )
+        # adds weighted coverage at the position corresponding to the start of the read
+        stt[ stt_iv ] += 1
+
+stt.write_bedgraph_file( sys.argv[1] + "stt_neg.bedGraph", "+" ) # switch strands because 5' start of the reads correspond to the 3' end of the nascent RNA =^ pol2 position ???
+stt.write_bedgraph_file( sys.argv[1] + "stt_pos.bedGraph", "-" )
diff --git a/source/downloadData.sh b/source/downloadData.sh
@@ -0,0 +1,80 @@
+sequences=$1
+annotation=$2
+reference=$3
+scriptDir=$4
+info=$5
+
+case $reference in
+
+	GRCh37)		
+		if [ ! -f $sequences/GRCh37.primary_assembly.genome.fa ];then
+			wget -O $sequences/GRCh37.primary_assembly.genome.fa.gz ftp://ftp.sanger.ac.uk/pub/gencode/Gencode_human/release_26/GRCh37_mapping/GRCh37.primary_assembly.genome.fa.gz
+			gunzip $sequences/GRCh37.primary_assembly.genome.fa.gz			
+		fi
+
+		if [ ! -f $annotation/gencode.GRCh37.primary_assembly.annotation.gtf ];then
+			wget -O $annotation/gencode.GRCh37.primary_assembly.annotation.gtf.gz \
+			ftp://ftp.sanger.ac.uk/pub/gencode/Gencode_human/release_26/GRCh37_mapping/gencode.v26lift37.annotation.gtf.gz
+			gunzip $annotation/gencode.GRCh37.primary_assembly.annotation.gtf.gz			
+		fi
+
+		if [ ! -f $sequences/GRCh37.tRNA.fa ];then
+			curl "http://gtrnadb.ucsc.edu/genomes/eukaryota/Hsapi19/hg19-tRNAs.fa" > $sequences/GRCh37.tRNA.fa
+		fi
+
+		if [ ! -f $sequences/GRCh37.tRNA.CCA.fa ];then
+			Rscript $scriptDir/source/appendCCAinFasta.r $sequences/GRCh37.tRNA.fa $sequences/GRCh37.tRNA.CCA.fa		
+		fi		
+
+		GENOME=$sequences/GRCh37.primary_assembly.genome.fa
+		ANNOTATION=$annotation/gencode.GRCh37.primary_assembly.annotation.gtf
+
+		if [ ! -f $sequences/rDNA.Genbank.U13369.1.fa ] ;then
+			curl "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nucleotide&amp;id=U13369.1&amp;rettype=fasta" > $sequences/rDNA.Genbank.U13369.1.fa
+		fi
+
+		if [ ! -f $sequences/28SrRNAPseudogenes.Genebank.U67616.1.fa ] ;then
+			curl "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nucleotide&amp;id=U67616.1&amp;rettype=fasta" > $sequences/28SrRNAPseudogenes.Genebank.U67616.1.fa
+		fi		
+	;;
+
+	GRCh38)
+		if [ ! -f $sequences/GRCh38.all.genome.fa ];then
+			wget -O $sequences/GRCh38.all.genome.fa.gz ftp://ftp.sanger.ac.uk/pub/gencode/Gencode_human/release_26/GRCh38.p10.genome.fa.gz
+			gunzip $sequences/GRCh38.all.genome.fa.gz		
+		fi
+
+		if [ ! -f $annotation/gencode.GRCh38.chr_patch_hapl_scaff.annotation.gtf ];then
+			wget -O $annotation/gencode.GRCh38.chr_patch_hapl_scaff.annotation.gtf.gz \
+			ftp://ftp.sanger.ac.uk/pub/gencode/Gencode_human/release_26/gencode.v26.chr_patch_hapl_scaff.annotation.gtf.gz
+			gunzip $annotation/gencode.GRCh38.chr_patch_hapl_scaff.annotation.gtf.gz		
+		fi
+
+		if [ ! -f $sequences/GRCh38.tRNA.fa ];then
+			curl "http://gtrnadb.ucsc.edu/genomes/eukaryota/Hsapi38/hg38-tRNAs.fa" > $sequences/GRCh38.tRNA.fa
+		fi
+
+		if [ ! -f $sequences/GRCh38.tRNA.CCA.fa ];then
+			Rscript $scriptDir/source/appendCCAinFasta.r $sequences/GRCh38.tRNA.fa $sequences/GRCh38.tRNA.CCA.fa		
+		fi
+
+		GENOME=$sequences/GRCh38.all.genome.fa
+		ANNOTATION=$annotation/gencode.GRCh38.chr_patch_hapl_scaff.annotation.gtf
+
+		if [ ! -f $sequences/rDNA.Genbank.U13369.1.fa ] ;then
+			curl "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nucleotide&amp;id=U13369.1&amp;rettype=fasta" > $sequences/rDNA.Genbank.U13369.1.fa
+		fi
+
+		if [ ! -f $sequences/28SrRNAPseudogenes.Genebank.U67616.1.fa ] ;then
+			curl "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nucleotide&amp;id=U67616.1&amp;rettype=fasta" > $sequences/28SrRNAPseudogenes.Genebank.U67616.1.fa
+		fi
+	;;
+
+	*)
+		status="### $reference isn't a valid input argument"
+		echo $status
+		exit 0
+		;;
+esac
+
+echo -e "$GENOME\n$ANNOTATION\n$PREFIX" > $info
diff --git a/source/extractMolecularBarcode.py b/source/extractMolecularBarcode.py
@@ -0,0 +1,87 @@
+#!/usr/bin/env python
+
+"""
+Date : April 24th 2014
+
+Author : Julia di Iulio
+Modified: Annkatrin Bressin 
+
+script used to extract molecular barcodes from the reads (fastq format) but keeping them associated to the read.
+Running the script will output :
+        a new fastq file (with the molecular barcode removed from the read sequence, but the information is kept in the
+        header of the read)
+        a file containing the molecular barcode counts
+        a file containing the ligation hexamer counts
+
+The two files containing the counts can be further used to investigate the distribution of the molecular barcodes and/or
+the ligation hexamers (script not provided).
+
+use : python extractMolecularBarcode.py inFastq outFastq outBarcodes outLigation
+
+"""
+
+import sys, itertools
+
+iFastq=open(sys.argv[1], 'r')
+oFastq=open(sys.argv[2], 'w')
+oBarcode=open(sys.argv[3], 'w')
+oLigation=open(sys.argv[4], 'w')
+oLen=open(sys.argv[5], 'w')
+
+
+dicoBarcode={}  # creates a dictionnary that will contain Molecular Barcode counts (see below)
+dicoLigation={} # creates a dictionnary that will contain ligation hexamer counts (see below)
+nct='ACTGN'     # nct stands for nucleotide
+
+# fill two dictionaries with the keys being all possible molecular barcodes/ligation hexamers made with the letters 'A', 'C', 'G',
+# 'T' and 'N', and the values being set to 0 for now; the values will be incremented every time a molecular barcode/ligation hexamers
+# is identified in a read
+for barcode in list(itertools.product(nct, repeat=6)):
+    dicoBarcode["".join(barcode)] = 0
+    dicoLigation["".join(barcode)] = 0
+
+
+header= iFastq.readline().rstrip() # reads the first line of the fastq file (which corresponds to the header of the first read)
+while header != '':
+    totseq   = iFastq.readline() # reads the sequence of the first read (the first 6 nucleotides being the molecular barcode (MBC))
+    plus     = iFastq.readline() # reads the 3rd line of a read in a fastq format
+    totqual  = iFastq.readline() # reads the quality of the read (also containing the PHRED quality score of the MBC)
+    barcode  = totseq[0:6]    # assigns the 6 first nucleotide (nt) of the read sequence to the variable "barcode"
+    ligation = totseq[3:9]    # assigns the last 3 nts of the MBC and the first 3 nts of the RNA fragment to the variable "ligation"
+    seq  = totseq[6:]         # assigns the RNA fragment sequence to the variable "seq"
+    qual = totqual[6:]        # assigns the RNA fragment PHRED quality score to the variable "qual"
+    oFastq.write(header.split(" ")[0]+'_MolecularBarcode:'+barcode+' '+header.split(" ")[1]+'\n')
+    # writes the header of the reads (containing the MBC info) to the output fastq file
+    oFastq.write(seq)  # writes the RNA fragment sequence to the output fastq file
+    oFastq.write(plus) # writes the 3rd line of a read in a fastq format to the output fastq file
+    oFastq.write(qual) # writes the RNA fragment PHRED quality score to the output fastq file
+    header= iFastq.readline().rstrip() # read the header of the following read
+    sequenceLen=len(seq)-1
+    ## in case the user doesn't use STAR for the alignment, he/she may have used an adapter trimmer to remove the adapter at
+    ## the 3'end of the reads in which case and/or cleaned the reads from the 3'end... so that some sequences will be shorter
+    ## than 6 or 9 nts... in which case we can not extract the info of the MBC and/or the ligation.
+    ## In such case uncomment the 4 next lines and comment the following lines.
+    #if len(totseq) >= 6 :
+    #    dicoBarcode[barcode] += 1
+    #if len(totseq) >= 9 :
+    #    dicoLigation[ligation] += 1
+    dicoBarcode[barcode] += 1   # adds 1 count in the MBC dictionary to the specific MBC found in the read
+    dicoLigation[ligation] += 1 # adds 1 count in the ligation dictionary to the specific ligation found in the read
+
+for barcode, times in dicoBarcode.items():              # outputs a file containing each possible MBC (column 1) and the respective
+    oBarcode.write("%s\t%s\n" % (barcode, str(times)))  # number of times (column 2) it was found in a read.
+
+for ligation, times in dicoLigation.items():            # outputs a file containing each possible ligation hexamer (column 1) and
+    oLigation.write("%s\t%s\n" % (ligation, str(times)))# the respective number of times (column 2) it was found in a read
+
+oLen.write("%s" % str(sequenceLen))
+
+# close files
+iFastq.close()
+oFastq.close()
+oBarcode.close()
+oLigation.close()
+oLen.close()
+
+
+
diff --git a/source/extractReadsWithMismatchesIn6FirstNct.py b/source/extractReadsWithMismatchesIn6FirstNct.py
@@ -0,0 +1,40 @@
+#!/usr/bin/env python
+
+"""
+
+Date : May 9th 2014
+
+Author : Julia di Iulio
+
+Script used to filter (from the alignment obtained with the reads that contained the molecular barcode in the beginning of
+their sequence) reads that contain mismatches in the 6 first nucleotides (nts). The rationale is that it is extremely
+unlikely (probability = 1/4e6) that the molecular barcode (which is a random hexamer sequence) corresponds exactly to the
+genomic sequence next to where the rest of the read aligned to. So among the reads that aligned (even though they still
+contained the molecular barcode in the beginning of their sequence), we will extract the ones that did not have any
+mismatches in the 6 first nts. Indeed, those reads are most likely due to mispriming event from the reverse transcriptase
+primer. We will then remove those reads from the alignment obtained with the reads that did not contain the molecular
+barcode in the beginning of their sequence.
+
+use : python extractReadsWithMismatchesIn6FirstNct.py stdin stdout
+
+"""
+
+import sys, pysam, re
+
+iBAM = pysam.Samfile('-', 'r') # reads from the standard input
+oBAM = pysam.Samfile('-', 'w', template=iBAM) # output to the standard output
+
+for line in iBAM:
+    md=re.findall(r'\d+', [tag[1] for tag in line.tags if tag[0]=='MD'][0])	# MD: string of Match and Mismatches positions, start after soft clipping (left most),
+    if len(md) == 1 :       # if there are no mismatches
+        oBAM.write(line)    # write the alignment into the output file
+    else:
+        if (not line.is_reverse) and (int(md[0]) >= 6): # if the first mismatch occurs after the 6th nt (from the 5' end)
+            oBAM.write(line)                           # write the alignment into the output file
+        elif (line.is_reverse) and (int(md[-1]) >= 6):  # same as above but for reads that align to the reverse strand, last element because we are interested in the 5 ' end
+            oBAM.write(line)
+
+# close files
+iBAM.close()
+oBAM.close()
+