Skip to content

Commit

Permalink
added all pipeline scripts and data
Browse files Browse the repository at this point in the history
  • Loading branch information
Annkatrin Bressin committed Feb 21, 2024
1 parent 867b708 commit d9a3416
Show file tree
Hide file tree
Showing 19 changed files with 1,242 additions and 0 deletions.
294 changes: 294 additions & 0 deletions Snakefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,294 @@
import os

sampleID = config["sampleID"]
WORKDIR = os.getcwd()
spikeIn = config["spikeIn"]
reference = config["reference"]
mask_bed = config["mask_bed"]

def resultFiles(spikeIn, reference, sampleID):
if spikeIn=="no" :
bigwigs=expand("BigWig/{sampleID}/{type}{sampleID}.{strand}.bw", sampleID=sampleID, type=["","PCRtrack_","RTtrack_"],strand = ["pos","neg"])
if mask_bed!='':
masked=expand("Results/Masked_{sampleID}.{strand}.bedGraph", sampleID=sampleID, strand = ["pos","neg"])
return bigwigs+masked
return bigwigs
else :
bedgraphs=expand("Bedgraphs/{sampleID}/{ref}/{type}{sampleID}.{strand}.bedGraph", sampleID=sampleID, ref = [reference,spikeIn], type=["","PCRtrack_","RTtrack_"],strand = ["pos","neg"])
normalized=expand("Results/Normalized_{sampleID}.{strand}.bedGraph", sampleID=sampleID, strand = ["pos","neg"])
if mask_bed!='':
masked=expand("Results/Masked_{sampleID}.{strand}.bedGraph", sampleID=sampleID, strand = ["pos","neg"])
return bedgraphs+normalized+masked
return bedgraphs+normalized

rule all:
input:
resultFiles(spikeIn, reference, sampleID),
"Stats/"+sampleID+".csv"

rule create_link_fastq:
params:
file = config["file"],
out = WORKDIR+"/Data/" + sampleID + ".fastq." + config["compression"]
output:
temp("Data/" + sampleID + ".fastq." + config["compression"])
threads: 1
resources:
memory = 10,
time = 1
shell:
"ln -s {params.file} {params.out}"

rule decompress:
input:
"Data/" + sampleID + ".fastq." + config["compression"]
output:
temp("TemporaryData/"+sampleID+".fastq")
params:
compression = config["compression"]
resources:
memory = 50,
time = 1
threads: 1
run:
if params.compression=="bz2":
shell("bunzip2 -c -k {input} > {output}")
elif params.compression=="gz":
shell("gunzip -c -k {input} > {output}")
elif params.compression=="no":
shell("cp {input} {output}")

rule remove_adapter:
input:
"TemporaryData/"+sampleID+".fastq"
output:
temp("TemporaryData/"+sampleID+"_noAdapter.fastq")
params:
cutadapt=config["cutadapt"],
cutadapt_params=config["cutadapt_params"],
min_len=config["barcode_len"] + 10
resources:
memory = 100,
time = 4
threads:
30
shell:
"{params.cutadapt} {params.cutadapt_params} -m {params.min_len} -j {threads} -o {output} {input}"

rule fastqc:
input:
"TemporaryData/"+sampleID+"_noAdapter.fastq"
output:
"QC/"+sampleID+"_noAdapter_fastqc.html"
threads:
1
params:
fastqc=config["fastqc"],
outdir=WORKDIR+"/QC/"
shell:
"{params.fastqc} --outdir={params.outdir} -i {input}"

rule collapse_reads:
input:
"TemporaryData/"+sampleID+"_noAdapter.fastq"
output:
"TemporaryData/"+sampleID+"_collapsed.txt"
params:
starcode=config["starcode"],
starcode_params=config["starcode_params"]
resources:
memory = 800,
time = 8
threads:
10
shell:
"{params.starcode} {params.starcode_params} -t {threads} -o {output} -i {input}"

rule barcode_dictionary:
input:
"TemporaryData/"+sampleID+"_collapsed.txt"
output:
"TemporaryData/"+sampleID+"_barcodes.pickle",
temp("TemporaryData/"+sampleID+"_noBC.fasta")
resources:
memory = 400,
time = 4
threads:
1
params:
barcode_len=config["barcode_len"]
script:
"source/createBarcodeDictionary.py"

rule map_reads:
input:
reads="TemporaryData/"+sampleID+"_noBC.fasta"
output:
temp("TemporaryData/Mapping/"+sampleID+"_Aligned.out.bam")
params:
input="TemporaryData/"+sampleID+"_noBC.fasta",
star=config["star"],
params=config["star_params"],
star_index=config["star_index"],
output_prefix="TemporaryData/Mapping/"+sampleID+"_"
resources:
memory = 200,
time = 4
threads:
50
shell:
"{params.star} --outSAMtype BAM Unsorted --genomeDir {params.star_index} --readFilesIn {params.input} --parametersFiles {params.params} --runThreadN {threads} --outFileNamePrefix {params.output_prefix}"

rule sort_bam:
input:
"TemporaryData/Mapping/"+sampleID+"_Aligned.out.bam"
output:
"TemporaryData/Mapping/"+sampleID+"_Aligned.sorted.bam"
params:
samtools=config["samtools"],
resources:
memory = 200,
time = 4
threads:
10
shell:
"{params.samtools} sort -@ {threads} -o {output} {input}"

rule parse_positions:
input:
"TemporaryData/"+sampleID+"_barcodes.pickle",
"TemporaryData/Mapping/"+sampleID+"_Aligned.sorted.bam"
output:
"TemporaryData/"+sampleID+"_posDict.pickle"
params:
barcode_len=config["barcode_len"],
resources:
memory = 300,
time = 4
threads:
1
script:
"source/parsePositions.py"

rule create_bedgraphs:
input:
"TemporaryData/"+sampleID+"_posDict.pickle",
output:
"Bedgraphs/"+sampleID+"/"+sampleID+"_allReads.pos.bedGraph",
"Bedgraphs/"+sampleID+"/"+sampleID+"_allReads.neg.bedGraph",
"Bedgraphs/"+sampleID+"/"+sampleID+"_noPCR.pos.bedGraph",
"Bedgraphs/"+sampleID+"/"+sampleID+"_noPCR.neg.bedGraph",
"Bedgraphs/"+sampleID+"/"+sampleID+"_noPCR_noSI.pos.bedGraph",
"Bedgraphs/"+sampleID+"/"+sampleID+"_noPCR_noSI.neg.bedGraph",
"Bedgraphs/"+sampleID+"/"+"PCRtrack_"+sampleID+".pos.bedGraph",
"Bedgraphs/"+sampleID+"/"+"PCRtrack_"+sampleID+".neg.bedGraph",
"Bedgraphs/"+sampleID+"/"+"RTtrack_"+sampleID+".pos.bedGraph",
"Bedgraphs/"+sampleID+"/"+"RTtrack_"+sampleID+".neg.bedGraph",
"Bedgraphs/"+sampleID+"/"+sampleID+".pos.bedGraph",
"Bedgraphs/"+sampleID+"/"+sampleID+".neg.bedGraph"
params:
annotation=config["annotation"],
genome=config["genome_fa"],
barcode_len=config["barcode_len"],
barcode_linker=config["barcode_linker"],
threshold=0.05
resources:
memory = 300,
time = 4
threads:
1
script:
"source/createBedgraphs.py"

rule create_stats:
input:
"TemporaryData/"+sampleID+".fastq",
"Bedgraphs/"+sampleID+"/"+sampleID+"_allReads.pos.bedGraph",
"Bedgraphs/"+sampleID+"/"+sampleID+"_allReads.neg.bedGraph",
"Bedgraphs/"+sampleID+"/"+sampleID+"_noPCR.pos.bedGraph",
"Bedgraphs/"+sampleID+"/"+sampleID+"_noPCR.neg.bedGraph",
"Bedgraphs/"+sampleID+"/"+sampleID+"_noPCR_noSI.pos.bedGraph",
"Bedgraphs/"+sampleID+"/"+sampleID+"_noPCR_noSI.neg.bedGraph"
output:
"Stats/"+sampleID+".csv",
"Stats/Number_"+sampleID+".pdf",
"Stats/Percent_"+sampleID+".pdf"
params:
config["stats_bed"],
config["prefix"],
config["mask_downstream"]
resources:
memory = 100,
time = 10,
temp_memory = 100
threads:
1
script:
"source/createStats.py"

rule create_bw:
input:
bg = "Bedgraphs/"+sampleID+"/{X}.bedGraph"
output:
bw = "BigWig/"+sampleID+"/{X}.bw"
resources:
memory = 100,
time = 1
params:
bedGraphToBigWig = config["bedGraphToBigWig"],
chrLen = config["star_index"] + "chrNameLength.txt"
threads:
1
shell:
"""
sort -k1,1 -k2,2n {input.bg} > {input.bg}.temp
{params.bedGraphToBigWig} {input.bg}.temp {params.chrLen} {output.bw}
rm -f {input.bg}.temp
"""

rule splitBedgraph:
params:
prefix = config["prefix"],
input:
bg = "Bedgraphs/"+sampleID+"/{X}.bedGraph"
output:
reference_bg = "Bedgraphs/"+sampleID+"/"+reference+"/{X}.bedGraph",
spikeIn_bg = "Bedgraphs/"+sampleID+"/"+spikeIn+"/{X}.bedGraph"
resources:
memory = 10,
time = 1
shell:
"""
grep -v {params.prefix} {input.bg} > {output.reference_bg}
grep {params.prefix} {input.bg} | sed "s/^{params.prefix}//g" > {output.spikeIn_bg}
"""

rule normalize_with_spikeIn:
input:
local_spike=expand("Bedgraphs/"+sampleID+"/"+spikeIn+"/"+sampleID+".{strand}.bedGraph", strand = ["pos","neg"]),
local_sample="Bedgraphs/"+sampleID+"/"+reference+"/{X}.bedGraph"
output:
"Results/Normalized_{X}.bedGraph",
"Results/factor_{X}.txt"
resources:
memory = 100,
time = 1
threads:
1
script:
"source/normalizeSpikeIn.py"

rule maskRegions:
input:
"Bedgraphs/"+sampleID+"/"+reference+"/{X}.bedGraph"
output:
"Results/Masked_{X}.bedGraph"
resources:
memory = 100,
time = 1
threads:
1
params:
config["mask_bed"] if config["mask_bed"]!="" else ""
script:
"source/maskRegion.py"
52 changes: 52 additions & 0 deletions config_example.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@

# sample ids
sampleID : test

# InputFile:
file: /project/ReadThrough/HiS-NET-Seq_pipeline/data/test/test.fastq.gz

reference: GRCh38.p12
spikeIn: GRCm38.p6
prefix: Mus_musculus_

# compression type, gz or bz2
compression: gz

barcode_len: 10

barcode_linker: CTGTAGGCACCATCAAT

# star path
star : /project/owlmayer/Applications/STAR-2.7.3a/bin/Linux_x86_64/STAR

# star index path
star_index : /project/ReadThrough/HiS-NET-Seq_pipeline/data/GRCh38_p12_GRCm38_p6

star_params: /project/ReadThrough/HiS-NET-Seq_pipeline/data/Parameters.in

annotation: /project/ReadThrough/HiS-NET-Seq_pipeline/data/gencode.v28.vM18.primary_assembly.annotation.gff3

genome_fa: /project/ReadThrough/HiS-NET-Seq_pipeline/data/GRCh38.p12.GRCm38.p6.primary_assembly.genome.fa

#FastQC
fastqc : /usr/local/package/bin/fastqc

# samtools
samtools : /usr/local/package/bin/samtools

starcode: /project/owlmayer/Applications/starcode/starcode

starcode_params: "-d 0"

cutadapt: /usr/local/package/bin/cutadapt

cutadapt_params: "-a ATCTCGTATGCCGTCTTCTGCTTG -a AAAAAAAAAAGGGGGGGGGGGGGG -a GGGGGGGGGGGGGGGGGGGGGGG -e 0.2 -q 5 --max-n 0.9"

bedGraphToBigWig : /usr/local/package/bin/bedGraphToBigWig

mask_bed: /project/ReadThrough/HiS-NET-Seq_pipeline/data/masked.bed

star_params_unmapped: /project/ReadThrough/HiS-NET-Seq_pipeline/data/Parameters.unmappedin
stats_bed: /project/ReadThrough/HiS-NET-Seq_pipeline/data/maskingClasses.bed

mask_downstream: 300
Loading

0 comments on commit d9a3416

Please sign in to comment.