diff --git a/Snakefile b/Snakefile
new file mode 100644
index 0000000..65cff2e
--- /dev/null
+++ b/Snakefile
@@ -0,0 +1,294 @@
+import os
+
+sampleID = config["sampleID"]
+WORKDIR = os.getcwd()
+spikeIn = config["spikeIn"]
+reference = config["reference"]
+mask_bed = config["mask_bed"]
+
+def resultFiles(spikeIn, reference, sampleID):
+    """Return the list of final targets, depending on whether a spike-in
+    genome is used and whether a masking BED file is configured."""
+    if spikeIn == "no":
+        bigwigs = expand("BigWig/{sampleID}/{type}{sampleID}.{strand}.bw", sampleID=sampleID, type=["", "PCRtrack_", "RTtrack_"], strand=["pos", "neg"])
+        if mask_bed != '':
+            masked = expand("Results/Masked_{sampleID}.{strand}.bedGraph", sampleID=sampleID, strand=["pos", "neg"])
+            return bigwigs + masked
+        return bigwigs
+    else:
+        bedgraphs = expand("Bedgraphs/{sampleID}/{ref}/{type}{sampleID}.{strand}.bedGraph", sampleID=sampleID, ref=[reference, spikeIn], type=["", "PCRtrack_", "RTtrack_"], strand=["pos", "neg"])
+        normalized = expand("Results/Normalized_{sampleID}.{strand}.bedGraph", sampleID=sampleID, strand=["pos", "neg"])
+        if mask_bed != '':
+            masked = expand("Results/Masked_{sampleID}.{strand}.bedGraph", sampleID=sampleID, strand=["pos", "neg"])
+            return bedgraphs + normalized + masked
+        return bedgraphs + normalized
+
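+# Illustration (values taken from config_example.yaml, shown here only as an
+# example): with sampleID="test", reference="GRCh38.p12", spikeIn="GRCm38.p6"
+# and mask_bed set, resultFiles() requests, among others:
+#   Bedgraphs/test/GRCh38.p12/test.pos.bedGraph
+#   Bedgraphs/test/GRCm38.p6/RTtrack_test.neg.bedGraph
+#   Results/Normalized_test.pos.bedGraph
+#   Results/Masked_test.neg.bedGraph
+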
temp("TemporaryData/Mapping/"+sampleID+"_Aligned.out.bam") + params: + input="TemporaryData/"+sampleID+"_noBC.fasta", + star=config["star"], + params=config["star_params"], + star_index=config["star_index"], + output_prefix="TemporaryData/Mapping/"+sampleID+"_" + resources: + memory = 200, + time = 4 + threads: + 50 + shell: + "{params.star} --outSAMtype BAM Unsorted --genomeDir {params.star_index} --readFilesIn {params.input} --parametersFiles {params.params} --runThreadN {threads} --outFileNamePrefix {params.output_prefix}" + +rule sort_bam: + input: + "TemporaryData/Mapping/"+sampleID+"_Aligned.out.bam" + output: + "TemporaryData/Mapping/"+sampleID+"_Aligned.sorted.bam" + params: + samtools=config["samtools"], + resources: + memory = 200, + time = 4 + threads: + 10 + shell: + "{params.samtools} sort -@ {threads} -o {output} {input}" + +rule parse_positions: + input: + "TemporaryData/"+sampleID+"_barcodes.pickle", + "TemporaryData/Mapping/"+sampleID+"_Aligned.sorted.bam" + output: + "TemporaryData/"+sampleID+"_posDict.pickle" + params: + barcode_len=config["barcode_len"], + resources: + memory = 300, + time = 4 + threads: + 1 + script: + "source/parsePositions.py" + +rule create_bedgraphs: + input: + "TemporaryData/"+sampleID+"_posDict.pickle", + output: + "Bedgraphs/"+sampleID+"/"+sampleID+"_allReads.pos.bedGraph", + "Bedgraphs/"+sampleID+"/"+sampleID+"_allReads.neg.bedGraph", + "Bedgraphs/"+sampleID+"/"+sampleID+"_noPCR.pos.bedGraph", + "Bedgraphs/"+sampleID+"/"+sampleID+"_noPCR.neg.bedGraph", + "Bedgraphs/"+sampleID+"/"+sampleID+"_noPCR_noSI.pos.bedGraph", + "Bedgraphs/"+sampleID+"/"+sampleID+"_noPCR_noSI.neg.bedGraph", + "Bedgraphs/"+sampleID+"/"+"PCRtrack_"+sampleID+".pos.bedGraph", + "Bedgraphs/"+sampleID+"/"+"PCRtrack_"+sampleID+".neg.bedGraph", + "Bedgraphs/"+sampleID+"/"+"RTtrack_"+sampleID+".pos.bedGraph", + "Bedgraphs/"+sampleID+"/"+"RTtrack_"+sampleID+".neg.bedGraph", + "Bedgraphs/"+sampleID+"/"+sampleID+".pos.bedGraph", + "Bedgraphs/"+sampleID+"/"+sampleID+".neg.bedGraph" + params: + annotation=config["annotation"], + genome=config["genome_fa"], + barcode_len=config["barcode_len"], + barcode_linker=config["barcode_linker"], + threshold=0.05 + resources: + memory = 300, + time = 4 + threads: + 1 + script: + "source/createBedgraphs.py" + +rule create_stats: + input: + "TemporaryData/"+sampleID+".fastq", + "Bedgraphs/"+sampleID+"/"+sampleID+"_allReads.pos.bedGraph", + "Bedgraphs/"+sampleID+"/"+sampleID+"_allReads.neg.bedGraph", + "Bedgraphs/"+sampleID+"/"+sampleID+"_noPCR.pos.bedGraph", + "Bedgraphs/"+sampleID+"/"+sampleID+"_noPCR.neg.bedGraph", + "Bedgraphs/"+sampleID+"/"+sampleID+"_noPCR_noSI.pos.bedGraph", + "Bedgraphs/"+sampleID+"/"+sampleID+"_noPCR_noSI.neg.bedGraph" + output: + "Stats/"+sampleID+".csv", + "Stats/Number_"+sampleID+".pdf", + "Stats/Percent_"+sampleID+".pdf" + params: + config["stats_bed"], + config["prefix"], + config["mask_downstream"] + resources: + memory = 100, + time = 10, + temp_memory = 100 + threads: + 1 + script: + "source/createStats.py" + +rule create_bw: + input: + bg = "Bedgraphs/"+sampleID+"/{X}.bedGraph" + output: + bw = "BigWig/"+sampleID+"/{X}.bw" + resources: + memory = 100, + time = 1 + params: + bedGraphToBigWig = config["bedGraphToBigWig"], + chrLen = config["star_index"] + "chrNameLength.txt" + threads: + 1 + shell: + """ + sort -k1,1 -k2,2n {input.bg} > {input.bg}.temp + {params.bedGraphToBigWig} {input.bg}.temp {params.chrLen} {output.bw} + rm -f {input.bg}.temp + """ + +rule splitBedgraph: + params: + prefix = config["prefix"], + 
+rule splitBedgraph:
+    params:
+        prefix = config["prefix"]
+    input:
+        bg = "Bedgraphs/"+sampleID+"/{X}.bedGraph"
+    output:
+        reference_bg = "Bedgraphs/"+sampleID+"/"+reference+"/{X}.bedGraph",
+        spikeIn_bg = "Bedgraphs/"+sampleID+"/"+spikeIn+"/{X}.bedGraph"
+    resources:
+        memory = 10,
+        time = 1
+    shell:
+        """
+        grep -v "^{params.prefix}" {input.bg} > {output.reference_bg}
+        grep "^{params.prefix}" {input.bg} | sed "s/^{params.prefix}//" > {output.spikeIn_bg}
+        """
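+
+# Normalization sketch: the scaling factor is the total spike-in 3'-end count
+# divided by 1e6, i.e. reference signal is reported per million spike-in reads;
+# source/normalizeSpikeIn.py also writes the spike-in total to
+# Results/factor_{X}.txt.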
+rule normalize_with_spikeIn:
+    input:
+        local_spike=expand("Bedgraphs/"+sampleID+"/"+spikeIn+"/"+sampleID+".{strand}.bedGraph", strand=["pos","neg"]),
+        local_sample="Bedgraphs/"+sampleID+"/"+reference+"/{X}.bedGraph"
+    output:
+        "Results/Normalized_{X}.bedGraph",
+        "Results/factor_{X}.txt"
+    resources:
+        memory = 100,
+        time = 1
+    threads:
+        1
+    script:
+        "source/normalizeSpikeIn.py"
+
+rule maskRegions:
+    input:
+        "Bedgraphs/"+sampleID+"/"+reference+"/{X}.bedGraph"
+    output:
+        "Results/Masked_{X}.bedGraph"
+    resources:
+        memory = 100,
+        time = 1
+    threads:
+        1
+    params:
+        config["mask_bed"]
+    script:
+        "source/maskRegion.py"
diff --git a/config_example.yaml b/config_example.yaml
new file mode 100644
index 0000000..e479ad0
--- /dev/null
+++ b/config_example.yaml
@@ -0,0 +1,52 @@
+
+# sample ids
+sampleID : test
+
+# InputFile:
+file: /project/ReadThrough/HiS-NET-Seq_pipeline/data/test/test.fastq.gz
+
+reference: GRCh38.p12
+spikeIn: GRCm38.p6
+prefix: Mus_musculus_
+
+# compression type: gz, bz2 or no
+compression: gz
+
+barcode_len: 10
+
+barcode_linker: CTGTAGGCACCATCAAT
+
+# star path
+star : /project/owlmayer/Applications/STAR-2.7.3a/bin/Linux_x86_64/STAR
+
+# star index path
+star_index : /project/ReadThrough/HiS-NET-Seq_pipeline/data/GRCh38_p12_GRCm38_p6
+
+star_params: /project/ReadThrough/HiS-NET-Seq_pipeline/data/Parameters.in
+
+annotation: /project/ReadThrough/HiS-NET-Seq_pipeline/data/gencode.v28.vM18.primary_assembly.annotation.gff3
+
+genome_fa: /project/ReadThrough/HiS-NET-Seq_pipeline/data/GRCh38.p12.GRCm38.p6.primary_assembly.genome.fa
+
+#FastQC
+fastqc : /usr/local/package/bin/fastqc
+
+# samtools
+samtools : /usr/local/package/bin/samtools
+
+starcode: /project/owlmayer/Applications/starcode/starcode
+
+starcode_params: "-d 0"
+
+cutadapt: /usr/local/package/bin/cutadapt
+
+cutadapt_params: "-a ATCTCGTATGCCGTCTTCTGCTTG -a AAAAAAAAAAGGGGGGGGGGGGGG -a GGGGGGGGGGGGGGGGGGGGGGG -e 0.2 -q 5 --max-n 0.9"
+
+bedGraphToBigWig : /usr/local/package/bin/bedGraphToBigWig
+
+mask_bed: /project/ReadThrough/HiS-NET-Seq_pipeline/data/masked.bed
+
+star_params_unmapped: /project/ReadThrough/HiS-NET-Seq_pipeline/data/Parameters.unmapped.in
+stats_bed: /project/ReadThrough/HiS-NET-Seq_pipeline/data/maskingClasses.bed
+
+mask_downstream: 300
diff --git a/createConfig.sh b/createConfig.sh
new file mode 100644
index 0000000..54f816a
--- /dev/null
+++ b/createConfig.sh
@@ -0,0 +1,81 @@
+#!/usr/bin/env bash
+
+SOURCE=${BASH_SOURCE[0]}
+DIR=$( dirname "$SOURCE" )
+DIR_REALPATH=$(realpath "$DIR")
+
+sampleID=$1
+fastq=$(realpath "$2")
+config=$3
+
+STAR=$(which STAR)
+STAR_realpath=$(realpath "$STAR")
+
+fastqc=$(which fastqc)
+fastqc_realpath=$(realpath "$fastqc")
+
+samtools=$(which samtools)
+samtools_realpath=$(realpath "$samtools")
+
+starcode=$(which starcode)
+starcode_realpath=$(realpath "$starcode")
+
+cutadapt=$(which cutadapt)
+cutadapt_realpath=$(realpath "$cutadapt")
+
+bedGraphToBigWig=$(which bedGraphToBigWig)
+bedGraphToBigWig_realpath=$(realpath "$bedGraphToBigWig")
+
+
+echo "
+# sample ids
+sampleID : $sampleID
+
+# InputFile:
+file: $fastq
+
+reference: GRCh38.p12
+spikeIn: GRCm38.p6
+prefix: Mus_musculus_
+
+# compression type: gz, bz2 or no
+compression: gz
+
+barcode_len: 10
+
+barcode_linker: CTGTAGGCACCATCAAT
+
+# star path
+star : $STAR_realpath
+
+# star index path
+star_index : $DIR_REALPATH/data/GRCh38_p12_GRCm38_p6
+
+star_params: $DIR_REALPATH/data/Parameters.in
+
+annotation: $DIR_REALPATH/data/gencode.v28.vM18.primary_assembly.annotation.gff3
+
+genome_fa: $DIR_REALPATH/data/GRCh38.p12.GRCm38.p6.primary_assembly.genome.fa
+
+#FastQC
+fastqc : $fastqc_realpath
+
+# samtools
+samtools : $samtools_realpath
+
+starcode: $starcode_realpath
+
+starcode_params: \"-d 0\"
+
+cutadapt: $cutadapt_realpath
+
+cutadapt_params: \"-a ATCTCGTATGCCGTCTTCTGCTTG -a AAAAAAAAAAGGGGGGGGGGGGGG -a GGGGGGGGGGGGGGGGGGGGGGG -e 0.2 -q 5 --max-n 0.9\"
+
+bedGraphToBigWig : $bedGraphToBigWig_realpath
+
+mask_bed: $DIR_REALPATH/data/masked.bed
+
+star_params_unmapped: $DIR_REALPATH/data/Parameters.unmapped.in
+stats_bed: $DIR_REALPATH/data/maskingClasses.bed
+
+mask_downstream: 300" > "$config"
diff --git a/data/GRCh38.p12.GRCm38.p6.primary_assembly.genome.fa b/data/GRCh38.p12.GRCm38.p6.primary_assembly.genome.fa
new file mode 120000
index 0000000..c36a190
--- /dev/null
+++ b/data/GRCh38.p12.GRCm38.p6.primary_assembly.genome.fa
@@ -0,0 +1 @@
+/project/owlmayerTemporary/owlmayer_Applications/01_NetSeqPipeline.v2.spikeIns/data/GRCh38.p12.GRCm38.p6.primary_assembly.genome.fa
\ No newline at end of file
diff --git a/data/GRCh38_p12_GRCm38_p6 b/data/GRCh38_p12_GRCm38_p6
new file mode 120000
index 0000000..1b9a218
--- /dev/null
+++ b/data/GRCh38_p12_GRCm38_p6
@@ -0,0 +1 @@
+/project/owlmayerTemporary/STAR/RNAseq_STAR-2.7.3a/GRCh38_p12_GRCm38_p6/
\ No newline at end of file
diff --git a/data/Parameters.in b/data/Parameters.in
new file mode 100755
index 0000000..71ae3f1
--- /dev/null
+++ b/data/Parameters.in
@@ -0,0 +1,480 @@
+#### all parameters defaults are found in the /groups/churchman/jd187/Program/STAR/STAR-STAR_2.4.0d/parametersDefault file ##############
+
+### versions
+#versionSTAR 020201
+# int>0: STAR release numeric ID. Please do not change this value!
+#versionGenome 020101 020200
+# int>0: oldest value of the Genome version compatible with this STAR release. Please do not change this value!
+
+### PARAMETERS
+#parametersFiles -
+# string: name of a user-defined parameters file, "-": none. Can only be defined on the command line.
+# provided at the command line level
+
+### RUN PARAMETERS
+#runMode alignReads
+# string: type of the run: alignReads ... map reads
+# genomeGenerate ... generate genome files
+# inputAlignmentsFromBAM ... input alignments from BAM. Presently only works with --outWigType to generate wiggle files.
+
+#runThreadN 1
+# int: number of threads to run STAR
+# provided at the command line level in case we need to tweak the memory settings etc.
+
+### GENOME PARAMETERS
+#genomeDir ./GenomeDir/
+# string: path to the directory where genome files are stored (if runMode!=generateGenome) or will be generated (if runMode==generateGenome)
+# provided at the command line level, in case we want to provide a different genome (e.g. built without junctions) without changing the parameters file
+
+#genomeLoad NoSharedMemory
+# mode of shared memory usage for the genome files
+# string: LoadAndKeep ... load genome into shared and keep it in memory after run
+# LoadAndRemove ... load genome into shared but remove it after run
+# LoadAndExit ... load genome into shared memory and exit, keeping the genome in memory for future runs
+# Remove ... 
do not map anything, just remove loaded genome from memory +# NoSharedMemory ... do not use shared memory, each job will have its own private copy of the genome + +### GENOME GENERATION PARAMETERS +#genomeFastaFiles - +# fasta files with genomic sequence for genome files generation, only used if runMode==genomeGenerate +# string(s): path(s) to the files from the working directory (separated by spaces) + +#genomeChrBinNbits 18 +# int: =log2(chrBin), where chrBin is the size of the bins for genome storage: each chromosome will occupy an integer number of bins + +#genomeSAindexNbases 14 +# int: length (bases) of the SA pre-indexing string. Typically between 10 and 15. Longer strings will use much more memory, but allow faster searches. + +#genomeSAsparseD 1 +# int>0: suffux array sparsity, i.e. distance between indices: use bigger numbers to decrease needed RAM at the cost of mapping speed reduction + + +### INPUT FROM BAM +#inputBAMfile - +# string: path to BAM input file, to be used with --runMode inputAlignmentsFromBAM + + +### READ PARAMETERS +#readFilesIn Read1 Read2 +# string(s): paths to files that contain input read1 (and, if needed, read2) +# given at the command line level as the read names is gonna depend on the sample name + +readFilesCommand cat +# string(s): cmd line to execute for each of the input file. This command should generate FASTA or FASTQ text and send it to stdout +# For example: zcat - to uncompress .gz files, bzcat - to uncompress .bz2 files, etc. + +#readMapNumber -1 +# int: number of reads to map from the beginning of the file +# -1: map all reads + +#readMatesLengthsIn NotEqual +# string: Equal/NotEqual - lengths of names,sequences,qualities for both mates are the same / not the same. NotEqual is safe in all situations. + +#clip3pNbases 0 +# int(s): number(s) of bases to clip from 3p of each mate. If one value is given, it will be assumed the same for both mates. +# +#clip5pNbases 0 +# int(s): number(s) of bases to clip from 5p of each mate. If one value is given, it will be assumed the same for both mates. + +clip3pAdapterSeq ATCTCGTATGCCGTCTTCTGCTTG +# string(s): adapter sequences to clip from 3p of each mate. If one value is given, it will be assumed the same for both mates. + +clip3pAdapterMMp 0.21 +# double(s): max proportion of mismatches for 3p adpater clipping for each mate. If one value is given, it will be assumed the same for both mates. +# + +clip3pAfterAdapterNbases 1 +# int(s): number of bases to clip from 3p of each mate after the adapter clipping. If one value is given, it will be assumed the same for both mates. + + + +### LIMITS +#limitGenomeGenerateRAM 31000000000 +# int>0: maximum available RAM (bytes) for genome generation + +limitIObufferSize 200000000 +# int>0: max available buffers size (bytes) for input/output, per thread + +#limitOutSAMoneReadBytes 100000 +#int>0: max size of the SAM record for one read. Recommended value: >(2*(LengthMate1+LengthMate2+100)*outFilterMultimapNmax) + +#limitOutSJoneRead 1000 +# int>0: max number of junctions for one read (including all multi-mappers) + +#limitOutSJcollapsed 1000000 +# int>0: max number of collapsed junctions + +limitBAMsortRAM 64000000000 +# int>=0: maximum available RAM for sorting BAM. If =0, it will be set to the genome index size + + +### OUTPUT: GENERAL +#outFileNamePrefix ./ +# string: output files name prefix (including full or relative path). Can only be defined on the command line. 
+# we define it on the command line as it is depending on the sample name + +#outTmpDir - +# string: path to a directory that will be used as temporary by STAR. All contents of this directory will be removed! +# - the temp directory will default to outFileNamePrefix_tmp + +#outStd Log +# string: which output will be directed to stdout (standard out) +# Log : log messages +# SAM : alignments in .sam format (which normally are output to Aligned.out.sam file), normal std output will go into Log.std.out +# BAM_Unsorted : alignments in BAM format, unsorted. Requires --outSAMtype BAM Unsorted +# BAM_SortedByCoordinate : alignments in BAM format, unsorted. Requires --outSAMtype BAM SortedByCoordinate +# BAM_Quant : alignments to transcriptome in BAM format, unsorted. Requires --quantMode TranscriptomeSAM + +outReadsUnmapped Fastx +# string: output of unmapped reads (besides SAM) +# None : no output +# Fastx : output in separate fasta/fastq files, Unmapped.out.mate1/2 + +#outQSconversionAdd 0 +# int: add this number to the quality score (e.g. to convert from Illumina to Sanger, use -31) + + +### OUTPUT: SAM/BAM +outSAMtype BAM SortedByCoordinate +# strings: type of SAM/BAM output +# 1st word: +# BAM : output BAM without sorting +# SAM : output SAM without sorting +# None : no SAM/BAM output +# 2nd, 3rd ... +# Unsorted : standard unsorted +# SortedByCoordinate : sorted by coordinate + +#outSAMmode Full +# string: mode of SAM output None : no SAM output +# Full : full SAM output +# NoQS : full SAM but without quality scores + +#outSAMstrandField None +# string: Cufflinks-like strand field flag None : not used +# intronMotif : strand derived from the intron motif. Reads with inconsisent and/or non-canonical introns are filtered out. + +#If you have stranded RNA-seq data, you do not need to use any specific STAR options. Instead, you need +#to run Cufflinks with the library option --library-type options. For example, cufflinks <…> - +#library-type fr-firststrand should be used for the “standard” dUTP protocol. This option has +#to be used only for Cufflinks runs and not for STAR runs. + +outSAMattributes All +# string: a string of desired SAM attributes, in the order desired for the output SAM +# NH HI AS nM NM MD jM jI XS +# Standard : NH HI AS nM +# All : NH HI AS nM NM MD jM jI +# None + + +#outSAMunmapped None +# string: output of unmapped reads in the SAM format +# None : no output +# Within : output unmapped reads within the main SAM file (i.e. Aligned.out.sam) + +#outSAMorder Paired +# string: type of sorting for the SAM output +# Paired: one mate after the other for all paired alignments +# PairedKeepInputOrder: one mate after the other for all paired alignments, the order is kept the same as in the input FASTQ files + +#outSAMprimaryFlag OneBestScore +# string: which alignments are considered primary - all others will be marked with 0x100 bit in the FLAG +# OneBestScore : only one alignment with the best score is primary +# AllBestScore : all alignments with the best score are primary + +#outSAMreadID Standard +# string: read ID record type +# Standard : first word (until space) from the FASTx read ID line, removing /1,/2 from the end +# Number : read number (index) in the FASTx file + +#outSAMmapqUnique 255 +# int: 0 to 255: the MAPQ value for unique mappers + +#outSAMattrRGline - +# string(s): SAM/BAM read group line. The first word contains the read group identifier and must start with "ID:", +# e.g. --outSAMattrRGline ID:xxx CN:yy "DS:z z z". 
+# xxx will be added as RG tag to each output alignment. Any spaces in the tag values have to be double quoted. +# Comma separated RG lines correspons to different (comma separated) input files in --readFilesIn. + +#outSAMheaderHD - +# strings: @HD (header) line of the SAM header + +#outSAMheaderPG - +# strings: extra @PG (software) line of the SAM header (in addition to STAR) +# +#outSAMheaderCommentFile - +# string: path to the file with @CO (comment) lines of the SAM header +# +#outBAMcompression -1 +# int: -1 ... 10 BAM compression level, -1=default compression, 0=no compression, 10=maximum compression +# +#bamRemoveDuplicatesType - +# string: remove duplicates from BAM file, for now only works with sorted BAM feeded with inputBAMfile +# UniqueIdentical : removes all multimappers, and duplicate unique mappers. The coordinates, FLAG, CIGAR must be identical +# +#bamRemoveDuplicatesMate2basesN 0 +# int>0: number of bases from the 5' of mate 2 to use in collapsing (e.g. for RAMPAGE) +# + +### OUTPUT WIGGLE +#outWigType None +# string(s): type of signal output, e.g. "bedGraph" OR "bedGraph read1_5p" +# 1st word: +# None +# bedGraph +# wiggle +# 2nd word: +# read1_5p : signal from only 5' of the 1st read, useful for CAGE/RAMPAGE etc +# read2 : signal from only 2nd read +# +#outWigStrand Stranded +# string: strandedness of wigglle (bedGraph) output +# Stranded: separate strands, str1 and str2 +# Unstranded: collapsed strands +# +#outWigReferencesPrefix - +# string: prefix matching reference names to include in the output wiggle file, e.g. "chr.*" +# default: "-" - include all references +# +#outWigNorm RPM +# string: type of normalization for the signal +# RPM : reads per million of mapped reads +# None : no normalization, "raw" counts +# + + +### OUTPUT FILTERING +#outFilterType Normal +# string: type of filtering +# Normal: standard filtering using only current alignment +# BySJout: keep only those reads that contain junctions that passed filtering into SJ.out.tab + +#outFilterMultimapScoreRange 1 +# int: the score range below the maximum score for multimapping alignments. +# as -10*log10(1-1/2) = MapQual for read mapping twice = 3.013, 3.013 -range 3 +# keep all multi aligned reads (at least up to 100 times as -10*log10(1-1/100) = +# 0.04364805)... I think... Not sure I got it right though... + +outFilterMultimapNmax 1 +# int: read alignments will be output only if the read maps fewer than this value, otherwise no alignments will be output + +#outFilterMismatchNmax 10 +# int: alignment will be output only if it has fewer mismatches + +#outFilterMismatchNoverLmax 0.3 +# int: alignment will be output only if its ratio of mismatches to *mapped* length is less than this value; to make it consistent with parmater used with TH + +#outFilterMismatchNoverReadLmax 1 +# int: alignment will be output only if its ratio of mismatches to *read* length is less than this value + +#outFilterScoreMin 0 +# int: alignment will be output only if its score is higher than this value + +#outFilterScoreMinOverLread 0.66 +# float: outFilterScoreMin normalized to read length (sum of mates' lengths for paired-end reads) + +#outFilterMatchNmin 0 +# int: alignment will be output only if the number of matched bases is higher than this value; to make it consistent with parmater used with TH + +#outFilterMatchNminOverLread 0.66 +# float: outFilterMatchNmin normalized to read length (sum of mates' lengths for paired-end reads) +# I guess it has something to do with soft clipping ? 
I don't know, so I let the default value... + +#outFilterIntronMotifs None +# string: filter alignment using their motifs +# None : no filtering +# RemoveNoncanonical : filter out alignments that contain non-canonical junctions +# RemoveNoncanonicalUnannotated : filter out alignments that contain non-canonical unannotated junctions when using annotated +# splice junctions database. The annotated non-canonical junctions will be kept. + +### OUTPUT FILTERING: SPLICE JUNCTIONS +#outSJfilterReads All +# string: which reads to consider for collapsed splice junctions output +# All: all reads, unique- and multi-mappers +# Unique: uniquely mapping reads only + +outSJfilterOverhangMin 3 1 1 1 +# 4*int: minimum overhang length for splice junctions on both sides for: (1) non-canonical motifs, (2) GT/AG motif, +# (3) GC/AG motif, (4) AT/AC motif. -1 means no output for that motif does not apply to annotated junctions + +#outSJfilterCountUniqueMin 3 1 1 1 +# 4*int: minimum uniquely mapping read count per junction for: (1) non-canonical motifs, (2) GT/AG motif, (3) GC/AG motif, +# (4) AT/AC motif. -1 means no output for that motif. Junctions are output if one of outSJfilterCountUniqueMin OR outSJfilterCountTotalMin conditions are satisfied does not apply to annotated junctions + +#outSJfilterCountTotalMin 3 1 1 1 +# 4*int: minimum total (multi-mapping+unique) read count per junction for: (1) non-canonical motifs, (2) GT/AG motif, (3) GC/AG motif, (4) AT/AC motif. -1 means no output for that motif Junctions are output if one of outSJfilterCountUniqueMin OR outSJfilterCountTotalMin conditions are satisfied does not apply to annotated junctions + +outSJfilterDistToOtherSJmin 0 0 0 0 +# 4*int>=0: minimum allowed distance to other junctions' donor/acceptor +# does not apply to annotated junctions + +#outSJfilterIntronMaxVsReadN 50000 100000 200000 +# N*int>=0: maximum gap allowed for junctions supported by 1,2,3...N reads +# i.e. by default junctions supported by 1 read can have gaps <=50000b, by 2 reads: <=100000b, by 3 reads: <=200000. 
by >=4 +# reads any gap <=alignIntronMax does not apply to annotated junctions + +### SCORING +#scoreGap 0 +# gap open penalty + +#scoreGapNoncan -8 +# non-canonical gap open penalty (in addition to scoreGap) + +#scoreGapGCAG -4 +# GCAG gap open penalty (in addition to scoreGap) + +#scoreGapATAC -8 +# ATAC gap open penalty (in addition to scoreGap) + +#scoreGenomicLengthLog2scale -0.25 +# extra score logarithmically scaled with genomic length of the alignment: -scoreGenomicLengthLog2scale*log2(genomicLength) + +#scoreDelOpen -2 +# deletion open penalty + +#scoreDelBase -2 +# deletion extension penalty per base (in addition to scoreDelOpen) + +#scoreInsOpen -2 +# insertion open penalty + +#scoreInsBase -2 +# insertion extension penalty per base (in addition to scoreInsOpen) + +#scoreStitchSJshift 1 +# maximum score reduction while searching for SJ boundaries inthe stitching step + + +### ALIGNMENT and SEED PARAMETERS +#seedSearchStartLmax 50 +# int>0: defines the search start point through the read - the read is split into pieces no longer than this value + +#seedSearchStartLmaxOverLread 1.0 +# float: seedSearchStartLmax normalized to read length (sum of mates' lengths for paired-end reads) + +#seedSearchLmax 0 +# int>=0: defines the maximum length of the seeds, if =0 max seed lengthis infinite + +#seedMultimapNmax 10000 +# int>0: only pieces that map fewer than this value are utilized in the stitching procedure + +#seedPerReadNmax 1000 +# int>0: max number of seeds per read + +#seedPerWindowNmax 50 +# int>0: max number of seeds per window + +#seedNoneLociPerWindow 10 +# int>0: max number of one seed loci per window + +alignIntronMin 11 +# minimum intron size: genomic gap is considered intron if its length>=alignIntronMin, otherwise it is considered Deletion +# modified specifically for tRNA introns + +#alignIntronMax 0 +# maximum intron size, if 0, max intron size will be determined by (2^winBinNbits)*winAnchorDistNbins + +#alignMatesGapMax 0 +# maximum gap between two mates, if 0, max intron gap will be determined by (2^winBinNbits)*winAnchorDistNbins + +#alignSJoverhangMin 5 +# int>0: minimum overhang (i.e. block size) for spliced alignments + +#alignSJDBoverhangMin 3 +# int>0: minimum overhang (i.e. 
block size) for annotated (sjdb) spliced alignments + +#alignSplicedMateMapLmin 0 +# int>0: minimum mapped length for a read mate that is spliced + +#alignSplicedMateMapLminOverLmate 0.66 +# float>0: alignSplicedMateMapLmin normalized to mate length + +#alignWindowsPerReadNmax 10000 +# int>0: max number of windows per read + +#alignTranscriptsPerWindowNmax 100 +# int>0: max number of transcripts per window + +#alignTranscriptsPerReadNmax 10000 +# max number of different alignments per read to consider + +alignEndsType EndToEnd +# string: type of read ends alignment +# Local : standard local alignment with soft-clipping allowed +# EndToEnd: force end-to-end read alignment, do not soft-clip + +### SPLICE JUNCTIONS DATABASE PARAMETERS +#sjdbFileChrStartEnd - +# string: path to the file with genomic coordinates (chr start end strand) for the introns +# we will provide it in the command line directly, as the path is gonna depend on the sample +# NO: The annotations can be supplied in the form of splice junctions’ loci or GTF (or GFF3) file AND ipmortantly The annotations have to be supplied at the genome/suffix array generation step + +#sjdbGTFfile - +# string: path to the GTF file with annotations +# we will provide it in the command line directly + +#sjdbGTFchrPrefix - +# string: prefix for chromosome names in a GTF file (e.g. 'chr' for using ENSMEBL annotations with UCSC geneomes) + +#sjdbGTFfeatureExon exon +# string: feature type in GTF file to be used as exons for building transcripts + +#sjdbGTFtagExonParentTranscript transcript_id +# string: tag name to be used as exons' parents for building transcripts + +#sjdbOverhang 0 +# int>=0: length of the donor/acceptor sequence on each side of the junctions, ideally = (mate_length - 1) +# if =0, splice junction database is not used + +#sjdbScore 2 +# int: extra alignment score for alignmets that cross database junctions + +### WINDOWS, ANCHORS, BINNING +#winAnchorMultimapNmax 50 +# int>0: max number of loci anchors are allowed to map to + +#winBinNbits 16 +# int>0: =log2(winBin), where winBin is the size of the bin for the windows/clustering,each window will occupy an integer nb of bins. + +#winAnchorDistNbins 9 +# int>0: max number of bins between two anchors that allows aggregation of anchors into one window + +#winFlankNbins 4 +# int>0: log2(winFlank), where win Flank is the size of the left and right flanking regions for each window + + +### CHIMERIC ALIGNMENTS +#chimSegmentMin 0 +## int>0: minimum length of chimeric segment length, if ==0, no chimeric output + +#chimScoreMin 0 +## int>0: minimum total (summed) score of the chimeric segments + +#chimScoreDropMax 20 +## int>0: max drop (difference) of chimeric score (the sum of scores of all chimeric segements) from the read length + +#chimScoreSeparation 10 +## int>0: minimum difference (separation) between the best chimeric score and the next one + +#chimScoreJunctionNonGTAG -1 +## int: penalty for a non-GT/AG chimeric junction + +#chimJunctionOverhangMin 20 +## int>0: minimum overhang for a chimeric junction + + +### QUANTIFICATION OF ANNOTATIONS +#quantMode - +# string(s): types of qunatification requested +# - : None +# TranscriptomeSAM : output SAM/BAM alignments to transcriptome into a separate file + + +#### 2-PASS +#twopass1readsN 0 +# int>0: number of reads to process for the 1st step. 
0 : 1-step only, no 2nd pass; use very large number to map all reads in the first step
+#
+#twopassSJlimit 1000000
+# int>=0: maximum number of junctions detected in the 1st step
+#
+
diff --git a/data/Parameters.unmapped.in b/data/Parameters.unmapped.in
new file mode 120000
index 0000000..a696866
--- /dev/null
+++ b/data/Parameters.unmapped.in
@@ -0,0 +1 @@
+/project/owlmayer/Applications/21_NETseqStats/config/Parameters.in
\ No newline at end of file
diff --git a/data/gencode.v28.vM18.primary_assembly.annotation.gff3 b/data/gencode.v28.vM18.primary_assembly.annotation.gff3
new file mode 120000
index 0000000..94363ee
--- /dev/null
+++ b/data/gencode.v28.vM18.primary_assembly.annotation.gff3
@@ -0,0 +1 @@
+/project/owlmayerTemporary/owlmayer_Applications/01_NetSeqPipeline.v2.spikeIns/data/gencode.v28.vM18.primary_assembly.annotation.gff3
\ No newline at end of file
diff --git a/data/masked.bed b/data/masked.bed
new file mode 120000
index 0000000..1a6c41d
--- /dev/null
+++ b/data/masked.bed
@@ -0,0 +1 @@
+/project/owlmayer/Applications/04_Pausing_from_NETseq/Snakemake/Data/masked.bed
\ No newline at end of file
diff --git a/data/maskingClasses.bed b/data/maskingClasses.bed
new file mode 120000
index 0000000..866e816
--- /dev/null
+++ b/data/maskingClasses.bed
@@ -0,0 +1 @@
+/project/owlmayerTemporary/Assembly/Human/GRCh38_p12/Non-nascent/maskingFiles/maskingClasses.bed
\ No newline at end of file
diff --git a/data/test/test.fastq.gz b/data/test/test.fastq.gz
new file mode 100644
index 0000000..86f385e
Binary files /dev/null and b/data/test/test.fastq.gz differ
diff --git a/runs.sh b/runs.sh
new file mode 100644
index 0000000..5af6fc1
--- /dev/null
+++ b/runs.sh
@@ -0,0 +1,16 @@
+#!/usr/bin/env bash
+SOURCE=${BASH_SOURCE[0]}
+DIR=$( dirname "$SOURCE" )
+
+############# create a test config file for your local environment; please adjust for custom data
+
+bash createConfig.sh "test" "$DIR/data/test/test.fastq.gz" "$DIR/config.yaml"
+
+############
+config=$DIR/config.yaml
+dr=output
+cores=5
+mkdir -p "$dr"
+
+snakemake -c"$cores" --directory "$dr" --configfile "$config"
+
diff --git a/source/createBarcodeDictionary.py b/source/createBarcodeDictionary.py
new file mode 100755
index 0000000..1663f42
--- /dev/null
+++ b/source/createBarcodeDictionary.py
@@ -0,0 +1,30 @@
+#!/usr/bin/env python
+import pickle
+
+# barcode (UMI) length; each collapsed read starts with the UMI
+bLen=snakemake.params[0]
+d={}
+with open(snakemake.input[0],'r') as txt:
+    for line in txt:
+        counts=int(line.strip().split()[1])
+        ln=line.strip().split()[0]
+        r=ln[bLen:].strip()
+        if r in d:
+            if ln[:bLen] in d[r]:
+                d[r][ln[:bLen]]+=counts
+            else:
+                d[r][ln[:bLen]]=counts
+        else:
+            d[r]={ln[:bLen]:counts}
+
+# write barcode-stripped sequences as FASTA, keyed by a running index that
+# links each record back to its UMI dictionary
+i=1
+sd={}
+with open(snakemake.output[1],'w') as f:
+    for key in d:
+        sd[i]=d[key]
+        f.write('>'+str(i)+'\n')
+        f.write(key+'\n')
+        i+=1
+
+with open(snakemake.output[0],'wb') as f:
+    pickle.dump(sd,f,pickle.HIGHEST_PROTOCOL)
diff --git a/source/createBedgraphs.py b/source/createBedgraphs.py
new file mode 100755
index 0000000..9c7bd65
--- /dev/null
+++ b/source/createBedgraphs.py
@@ -0,0 +1,111 @@
+#!/usr/bin/env python
+import pickle
+import numpy as np
+from Bio import SeqIO
+
+def createSI(fn):
+    '''
+    Parameters
+    ----------
+    fn : string
+        Path to the file with the genome annotation.
+
+    Returns
+    -------
+    d : set of tuples (string, int)
+        Set of splicing-intermediate coordinates in the form (chromosome, position).
+        The sign of the position indicates the strand.
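+
+        Example (illustrative): a '+'-strand exon line "chr1 . exon 100 200 . + ."
+        yields ("chr1", 200) for the exon 3' end and ("chr1", 99) for the 3' end
+        of the upstream intron; on the '-' strand the positions are stored negated.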
+    '''
+    d=[]
+    with open(fn,'r') as a:
+        for ln in a:
+            if ln[0]!='#':
+                l=ln.strip().split()
+                if l[2]=='exon':
+                    if l[6]=='+':
+                        d.append((l[0],int(l[4]))) # 3' end of exon
+                        d.append((l[0],int(l[3])-1)) # 3' end of intron
+                    else:
+                        d.append((l[0],-1*(int(l[3])-1))) # 3' end of exon
+                        d.append((l[0],-1*int(l[4]))) # 3' end of intron
+    d=set(d)
+    return d
+
+annotation=snakemake.params[0]
+SI=createSI(annotation)
+genomefn=snakemake.params[1]
+genome=SeqIO.index(genomefn,"fasta")
+# barcode (UMI) length
+bLen=snakemake.params[2]
+# barcode linker sequence
+barcode_linker=snakemake.params[3]
+lLen=len(barcode_linker)
+d=pickle.load(open(snakemake.input[0],'rb'))
+# RT (mispriming) threshold
+threshold=snakemake.params[4]
+
+
+chrm=''
+reads={}
+reads_noPCR={}
+info=[]
+for line in d:
+    if line[0]!=chrm:
+        chrm=line[0]
+        chrom_sequence=genome[chrm].seq[:]
+    if line[1]>0:
+        bseq=str(chrom_sequence[line[1]:line[1]+bLen].reverse_complement())
+        lseq=str(chrom_sequence[line[1]+bLen:line[1]+bLen+lLen])
+    else:
+        bseq=str(chrom_sequence[-line[1]-bLen:-line[1]])
+        lseq=str(chrom_sequence[-line[1]-bLen-lLen:-line[1]-bLen].reverse_complement())
+    stats=np.zeros(bLen+1,dtype=np.int64)
+    stats_noPCR=np.zeros(bLen+1,dtype=np.int64)
+    for barcode in d[line]:
+        s=sum([a0==b0 for a0,b0 in zip(bseq,barcode)])+barcode.count('N')
+        stats[s]+=d[line][barcode]
+        stats_noPCR[s]+=1
+    reads[line]=stats
+    reads_noPCR[line]=stats_noPCR
+    info.append([line[0],line[1],bseq,lseq])
+
+all_pos=snakemake.output[0]
+all_neg=snakemake.output[1]
+noPCR_pos=snakemake.output[2]
+noPCR_neg=snakemake.output[3]
+noPCR_noSI_pos=snakemake.output[4]
+noPCR_noSI_neg=snakemake.output[5]
+PCRtrack_pos=snakemake.output[6]
+PCRtrack_neg=snakemake.output[7]
+RTtrack_pos=snakemake.output[8]
+RTtrack_neg=snakemake.output[9]
+noPCR_noSI_noRT_pos=snakemake.output[10]
+noPCR_noSI_noRT_neg=snakemake.output[11]
+with open(all_pos,'w') as p, open(all_neg,'w') as n, \
+     open(noPCR_pos,'w') as ppcrfree, open(noPCR_neg,'w') as npcrfree, \
+     open(noPCR_noSI_pos,'w') as pold, open(noPCR_noSI_neg,'w') as nold, \
+     open(PCRtrack_pos,'w') as ppcr, open(PCRtrack_neg,'w') as npcr, \
+     open(RTtrack_pos,'w') as prt, open(RTtrack_neg,'w') as nrt, \
+     open(noPCR_noSI_noRT_pos,'w') as pfinal, open(noPCR_noSI_noRT_neg,'w') as nfinal:
+    for line in reads:
+        if line[1]>0:
+            position=line[0]+'\t'+str(line[1]-1)+'\t'+str(line[1])+'\t'
+            p.write(position+str(reads[line].sum())+'\n') # all reads mapping to the position
+            ppcrfree.write(position+str(reads_noPCR[line][:-1].sum())+'\n') # reads with UMI mapping to the position (counted once per UMI)
+            prt.write(position+str(100*reads[line][-1]/reads[line].sum())+'\n') # pct of RT reads at the position
+            ppcr.write(position+str(100-100*reads_noPCR[line].sum()/reads[line].sum())+'\n') # pct of PCR duplication at the position
+            if line not in SI:
+                pold.write(position+str(reads_noPCR[line][:-1].sum())+'\n')
+                if reads[line][-1]/reads[line].sum() < threshold:
+                    pfinal.write(position+str(reads_noPCR[line][:-1].sum())+'\n')
+        else:
+            position=line[0]+'\t'+str(-line[1])+'\t'+str(-line[1]+1)+'\t'
+            n.write(position+str(reads[line].sum())+'\n')
+            npcrfree.write(position+str(reads_noPCR[line][:-1].sum())+'\n')
+            nrt.write(position+str(100*reads[line][-1]/reads[line].sum())+'\n')
+            npcr.write(position+str(100-100*reads_noPCR[line].sum()/reads[line].sum())+'\n')
+            if line not in SI:
+                nold.write(position+str(reads_noPCR[line][:-1].sum())+'\n')
+                if reads[line][-1]/reads[line].sum() < threshold:
+                    nfinal.write(position+str(reads_noPCR[line][:-1].sum())+'\n')
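+
+# Note on the UMI-vs-genome comparison above (illustrative reading of the code):
+# each UMI is compared base-by-base to the genomic sequence adjacent to the read
+# 3' end; a UMI matching at all bLen positions (stats[-1]) most likely stems
+# from RT mispriming on genome-encoded sequence, so positions where that
+# fraction reaches `threshold` are excluded from the final bedGraphs.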
diff --git a/source/createSequences.py b/source/createSequences.py
new file mode 100755
index 0000000..84f98ba
--- /dev/null
+++ b/source/createSequences.py
@@ -0,0 +1,14 @@
+#!/usr/bin/env python
+bLen=snakemake.params[0]
+d=[]
+with open(snakemake.input[0],'r') as txt:
+    for line in txt:
+        ln=line.strip().split()[0]
+        d.append(ln[bLen:].strip())
+
+d=list(set(d))
+with open(snakemake.output[0],'w') as f:
+    for i in range(len(d)):
+        f.write('>'+str(i)+'\n')
+        f.write(d[i]+'\n')
+
diff --git a/source/createStats.py b/source/createStats.py
new file mode 100755
index 0000000..a64c81a
--- /dev/null
+++ b/source/createStats.py
@@ -0,0 +1,88 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+import pandas as pd
+import pybedtools
+import matplotlib.pyplot as plt
+
+fq=snakemake.input[0]
+strands={"pos":"+","neg":"-"}
+d={"pos":{},"neg":{}}
+d["pos"]["all"]=snakemake.input[1]
+d["neg"]["all"]=snakemake.input[2]
+d["pos"]["noPCR"]=snakemake.input[3]
+d["neg"]["noPCR"]=snakemake.input[4]
+d["pos"]["noPCR_noSI"]=snakemake.input[5]
+d["neg"]["noPCR_noSI"]=snakemake.input[6]
+
+mfile=snakemake.params[0]
+prefix=snakemake.params[1]
+downstream=snakemake.params[2]
+
+b={"pos":{},"neg":{}}
+for s in d.keys():
+    for c in d[s].keys():
+        b[s][c]=pybedtools.BedTool(d[s][c])
+
+masking=pd.read_csv(mfile,sep='\t',header=None)
+masking[1]=masking.apply(lambda x: x[1]-downstream if (x[5]=='-' and x[0]!='chrM') else x[1],1)
+masking[2]=masking.apply(lambda x: x[2]+downstream if x[5]=='+' else x[2],1)
+
+classes=['PolM','PolI','PolIII','snRNA','snoRNA','miRNA']
+stats=pd.DataFrame(0, index=["unmapped","allMapped"]+classes+["spikeIns","informative\nReads"], columns=["all","noPCR","noPCR_noSI"])
+
+# total reads in the FASTQ (4 lines per record)
+l=0
+with open(fq,'r') as f:
+    for ln in f:
+        l+=1
+allReads=l/4
+
+# all mapped reads
+for s in b.keys():
+    for t in b[s].keys():
+        temp_df=b[s][t].to_dataframe()
+        stats.loc["allMapped",t]+=temp_df.apply(lambda x: x['name']*(x['end']-x['start']),1).sum()
+
+# masking classes
+for c in classes:
+    for s in b.keys():
+        for t in b[s].keys():
+            temp_masking=masking[masking[3]==c]
+            temp_masking=temp_masking[temp_masking[5]==strands[s]]
+            masking_bed=pybedtools.BedTool.from_dataframe(temp_masking)
+            masking_bed=masking_bed.merge(s=True,c=[4,5,6],o='distinct')
+            temp_bed=b[s][t].intersect(masking_bed)
+            temp_df=temp_bed.to_dataframe()
+            stats.loc[c,t]+=temp_df.apply(lambda x: x['name']*(x['end']-x['start']),1).sum()
+            b[s][t]=b[s][t].subtract(masking_bed)
+
+# spike-ins and the rest
+for s in b.keys():
+    for t in b[s].keys():
+        temp_df=b[s][t].to_dataframe()
+        spikein_df=temp_df[temp_df['chrom'].str.startswith(prefix)]
+        stats.loc["spikeIns",t]+=spikein_df.apply(lambda x: x['name']*(x['end']-x['start']),1).sum()
+        temp_df=temp_df[~temp_df['chrom'].str.startswith(prefix)]
+        stats.loc["informative\nReads",t]+=temp_df.apply(lambda x: x['name']*(x['end']-x['start']),1).sum()
+
+
+stats['PCR duplicates']=stats['all']-stats["noPCR"]
+stats['Splicing Intermediates']=stats['noPCR']-stats["noPCR_noSI"]
+stats['Unique']=stats["noPCR_noSI"]
+stats.loc["unmapped","Unique"]=allReads-stats.loc["allMapped","all"]
+
+totals=stats[["Unique","Splicing Intermediates","PCR duplicates"]]
+totals=totals[totals.index!="allMapped"]
+percents=totals*100/allReads
+
+totals.to_csv(snakemake.output[0],sep='\t')
+
+totals.plot.bar(stacked=True)
+plt.ylabel("Number of reads")
+plt.tight_layout()
+plt.savefig(snakemake.output[1])
+
+percents.plot.bar(stacked=True)
+plt.ylabel("% of reads")
+plt.tight_layout()
+plt.savefig(snakemake.output[2])
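+
+# The input bedGraphs are single-base records, so summing name*(end-start)
+# (the count column times the interval width) totals the 3'-end read counts
+# per category; intersect/subtract on single-base intervals keep or drop whole
+# records and therefore preserve that sum.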
diff --git a/source/maskRegion.py b/source/maskRegion.py
new file mode 100644
index 0000000..881b21d
--- /dev/null
+++ b/source/maskRegion.py
@@ -0,0 +1,19 @@
+#!/usr/bin/env python
+import pybedtools
+import pandas as pd
+
+ps = pybedtools.BedTool(snakemake.input[0])
+# infer the strand from the file name (tracks are written as *.pos.* / *.neg.*)
+if '.pos.' in snakemake.input[0]:
+    strand='+'
+elif '.neg.' in snakemake.input[0]:
+    strand='-'
+
+if snakemake.params[0]=='':
+    df=ps
+else:
+    frame_mf=pd.read_csv(snakemake.params[0],delimiter='\t',header=None)
+    frame_mf=frame_mf[frame_mf[5]==strand]
+    mf = pybedtools.BedTool.from_dataframe(frame_mf)
+    df = ps.subtract(mf)
+df.moveto(snakemake.output[0])
+
diff --git a/source/normalizeSpikeIn.py b/source/normalizeSpikeIn.py
new file mode 100755
index 0000000..6d190b3
--- /dev/null
+++ b/source/normalizeSpikeIn.py
@@ -0,0 +1,15 @@
+#!/usr/bin/env python
+import pandas as pd
+
+# inputs 0 and 1 are the spike-in bedGraphs (pos and neg strand),
+# input 2 is the reference track to normalize
+aligned=0
+aligned+=pd.read_csv(snakemake.input[0],delimiter='\t',header=None)[3].sum()
+aligned+=pd.read_csv(snakemake.input[1],delimiter='\t',header=None)[3].sum()
+
+data=pd.read_csv(snakemake.input[2],delimiter='\t',header=None)
+data[3]=data[3]/(aligned/1000000)
+
+data.to_csv(snakemake.output[0],sep='\t',header=False,index=False)
+with open(snakemake.output[1],'w') as f:
+    f.write(str(aligned))
diff --git a/source/parsePositions.py b/source/parsePositions.py
new file mode 100755
index 0000000..0d96d36
--- /dev/null
+++ b/source/parsePositions.py
@@ -0,0 +1,36 @@
+#!/usr/bin/env python
+import pysam
+import pickle
+
+rcol=snakemake.input[0]
+iBAM = pysam.AlignmentFile(snakemake.input[1], 'rb')
+bLen=snakemake.params[0]
+
+d=pickle.load(open(rcol,'rb'))
+
+
+NETseq={}
+for line in iBAM:
+    if line.flag==0:
+        if (line.reference_name,int(-line.reference_start)) in NETseq:
+            for barcode in d[int(line.qname)]:
+                if barcode in NETseq[(line.reference_name,int(-line.reference_start))]:
+                    NETseq[(line.reference_name,int(-line.reference_start))][barcode]+=d[int(line.qname)][barcode]
+                else:
+                    NETseq[(line.reference_name,int(-line.reference_start))][barcode]=d[int(line.qname)][barcode]
+        else:
+            NETseq[(line.reference_name,int(-line.reference_start))]=d[int(line.qname)]
+    elif line.flag==16:
+        if (line.reference_name,int(line.reference_end)) in NETseq:
+            for barcode in d[int(line.qname)]:
+                if barcode in NETseq[(line.reference_name,int(line.reference_end))]:
+                    NETseq[(line.reference_name,int(line.reference_end))][barcode]+=d[int(line.qname)][barcode]
+                else:
+                    NETseq[(line.reference_name,int(line.reference_end))][barcode]=d[int(line.qname)][barcode]
+        else:
+            NETseq[(line.reference_name,int(line.reference_end))]=d[int(line.qname)]
+# close file
+iBAM.close()
+with open(snakemake.output[0],'wb') as f:
+    pickle.dump(NETseq,f,pickle.HIGHEST_PROTOCOL)
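+
+# Strand convention (kept consistent with createBedgraphs.py): forward-strand
+# alignments (flag 0) are stored with negated coordinates and feed the ".neg"
+# tracks, while reverse-strand alignments (flag 16) keep positive coordinates
+# and feed the ".pos" tracks, matching the antisense read orientation of
+# NET-seq libraries.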