added all pipeline scripts and data

MayerGroup · Feb 21, 2024 · d9a3416 · d9a3416
1 parent 867b708
commit d9a3416
Show file tree

Hide file tree

Showing 19 changed files with 1,242 additions and 0 deletions.
diff --git a/Snakefile b/Snakefile
@@ -0,0 +1,294 @@
+import os
+
+sampleID = config["sampleID"]
+WORKDIR = os.getcwd()
+spikeIn = config["spikeIn"]
+reference = config["reference"]
+mask_bed = config["mask_bed"]
+
+def resultFiles(spikeIn, reference, sampleID):
+    if spikeIn=="no" :
+        bigwigs=expand("BigWig/{sampleID}/{type}{sampleID}.{strand}.bw", sampleID=sampleID, type=["","PCRtrack_","RTtrack_"],strand = ["pos","neg"])
+        if mask_bed!='':
+            masked=expand("Results/Masked_{sampleID}.{strand}.bedGraph", sampleID=sampleID, strand = ["pos","neg"])
+            return bigwigs+masked
+        return bigwigs
+    else :
+        bedgraphs=expand("Bedgraphs/{sampleID}/{ref}/{type}{sampleID}.{strand}.bedGraph", sampleID=sampleID, ref = [reference,spikeIn], type=["","PCRtrack_","RTtrack_"],strand = ["pos","neg"])
+        normalized=expand("Results/Normalized_{sampleID}.{strand}.bedGraph", sampleID=sampleID, strand = ["pos","neg"])
+        if mask_bed!='':
+            masked=expand("Results/Masked_{sampleID}.{strand}.bedGraph", sampleID=sampleID, strand = ["pos","neg"])
+            return bedgraphs+normalized+masked
+        return bedgraphs+normalized
+
+rule all:
+    input:
+        resultFiles(spikeIn, reference, sampleID),
+        "Stats/"+sampleID+".csv"
+
+rule create_link_fastq:
+    params:
+        file = config["file"],
+        out = WORKDIR+"/Data/" + sampleID + ".fastq." + config["compression"]
+    output:
+        temp("Data/" + sampleID + ".fastq." + config["compression"])
+    threads: 1
+    resources:
+        memory = 10,
+        time = 1
+    shell:
+        "ln -s {params.file} {params.out}"
+
+rule decompress:
+    input:
+        "Data/" + sampleID + ".fastq." + config["compression"]
+    output:
+        temp("TemporaryData/"+sampleID+".fastq")
+    params:
+        compression = config["compression"]
+    resources:
+        memory = 50,
+        time = 1
+    threads: 1
+    run:
+        if params.compression=="bz2":
+            shell("bunzip2 -c -k {input} > {output}")
+        elif params.compression=="gz":
+            shell("gunzip -c -k {input} > {output}")
+        elif params.compression=="no":
+            shell("cp {input} {output}")
+
+rule remove_adapter:
+    input:
+        "TemporaryData/"+sampleID+".fastq"
+    output:
+        temp("TemporaryData/"+sampleID+"_noAdapter.fastq")
+    params:
+        cutadapt=config["cutadapt"],
+        cutadapt_params=config["cutadapt_params"],
+        min_len=config["barcode_len"] + 10
+    resources:
+        memory = 100,
+        time = 4
+    threads:
+        30
+    shell:
+        "{params.cutadapt} {params.cutadapt_params} -m {params.min_len} -j {threads} -o {output} {input}"
+
+rule fastqc:
+    input:
+        "TemporaryData/"+sampleID+"_noAdapter.fastq"
+    output:
+        "QC/"+sampleID+"_noAdapter_fastqc.html"
+    threads:
+        1
+    params:
+        fastqc=config["fastqc"],
+        outdir=WORKDIR+"/QC/"
+    shell:
+        "{params.fastqc} --outdir={params.outdir} -i {input}"
+
+rule collapse_reads:
+    input:
+        "TemporaryData/"+sampleID+"_noAdapter.fastq"
+    output:
+        "TemporaryData/"+sampleID+"_collapsed.txt"
+    params:
+        starcode=config["starcode"],
+        starcode_params=config["starcode_params"]
+    resources:
+        memory = 800,
+        time = 8
+    threads:
+        10
+    shell:
+        "{params.starcode} {params.starcode_params} -t {threads} -o {output} -i {input}"
+
+rule barcode_dictionary:
+    input:
+        "TemporaryData/"+sampleID+"_collapsed.txt"
+    output:
+        "TemporaryData/"+sampleID+"_barcodes.pickle",
+        temp("TemporaryData/"+sampleID+"_noBC.fasta")
+    resources:
+        memory = 400,
+        time = 4
+    threads:
+        1
+    params:
+        barcode_len=config["barcode_len"]
+    script:
+        "source/createBarcodeDictionary.py"
+
+rule map_reads:
+    input:
+        reads="TemporaryData/"+sampleID+"_noBC.fasta"
+    output:
+        temp("TemporaryData/Mapping/"+sampleID+"_Aligned.out.bam")
+    params:
+        input="TemporaryData/"+sampleID+"_noBC.fasta",
+        star=config["star"],
+        params=config["star_params"],
+        star_index=config["star_index"],
+        output_prefix="TemporaryData/Mapping/"+sampleID+"_"
+    resources:
+        memory = 200,
+        time = 4
+    threads:
+        50
+    shell:
+        "{params.star} --outSAMtype BAM Unsorted --genomeDir {params.star_index} --readFilesIn {params.input} --parametersFiles {params.params} --runThreadN {threads} --outFileNamePrefix {params.output_prefix}"
+
+rule sort_bam:
+    input:
+        "TemporaryData/Mapping/"+sampleID+"_Aligned.out.bam"
+    output:
+        "TemporaryData/Mapping/"+sampleID+"_Aligned.sorted.bam"
+    params:
+        samtools=config["samtools"],
+    resources:
+        memory = 200,
+        time = 4
+    threads:
+        10
+    shell:
+        "{params.samtools} sort -@ {threads} -o {output} {input}"
+
+rule parse_positions:
+    input:
+        "TemporaryData/"+sampleID+"_barcodes.pickle",
+        "TemporaryData/Mapping/"+sampleID+"_Aligned.sorted.bam"
+    output:
+        "TemporaryData/"+sampleID+"_posDict.pickle"
+    params:
+        barcode_len=config["barcode_len"],
+    resources:
+        memory = 300,
+        time = 4
+    threads:
+        1
+    script:
+        "source/parsePositions.py"
+
+rule create_bedgraphs:
+    input:
+        "TemporaryData/"+sampleID+"_posDict.pickle",
+    output:
+        "Bedgraphs/"+sampleID+"/"+sampleID+"_allReads.pos.bedGraph",
+        "Bedgraphs/"+sampleID+"/"+sampleID+"_allReads.neg.bedGraph",
+        "Bedgraphs/"+sampleID+"/"+sampleID+"_noPCR.pos.bedGraph",
+        "Bedgraphs/"+sampleID+"/"+sampleID+"_noPCR.neg.bedGraph",
+        "Bedgraphs/"+sampleID+"/"+sampleID+"_noPCR_noSI.pos.bedGraph",
+        "Bedgraphs/"+sampleID+"/"+sampleID+"_noPCR_noSI.neg.bedGraph",
+        "Bedgraphs/"+sampleID+"/"+"PCRtrack_"+sampleID+".pos.bedGraph",
+        "Bedgraphs/"+sampleID+"/"+"PCRtrack_"+sampleID+".neg.bedGraph",
+        "Bedgraphs/"+sampleID+"/"+"RTtrack_"+sampleID+".pos.bedGraph",
+        "Bedgraphs/"+sampleID+"/"+"RTtrack_"+sampleID+".neg.bedGraph",
+        "Bedgraphs/"+sampleID+"/"+sampleID+".pos.bedGraph",
+        "Bedgraphs/"+sampleID+"/"+sampleID+".neg.bedGraph"
+    params:
+        annotation=config["annotation"],
+        genome=config["genome_fa"],
+        barcode_len=config["barcode_len"],
+        barcode_linker=config["barcode_linker"],
+        threshold=0.05
+    resources:
+        memory = 300,
+        time = 4
+    threads:
+        1
+    script:
+        "source/createBedgraphs.py"
+
+rule create_stats:
+    input:
+        "TemporaryData/"+sampleID+".fastq",
+        "Bedgraphs/"+sampleID+"/"+sampleID+"_allReads.pos.bedGraph",
+        "Bedgraphs/"+sampleID+"/"+sampleID+"_allReads.neg.bedGraph",
+        "Bedgraphs/"+sampleID+"/"+sampleID+"_noPCR.pos.bedGraph",
+        "Bedgraphs/"+sampleID+"/"+sampleID+"_noPCR.neg.bedGraph",
+        "Bedgraphs/"+sampleID+"/"+sampleID+"_noPCR_noSI.pos.bedGraph",
+        "Bedgraphs/"+sampleID+"/"+sampleID+"_noPCR_noSI.neg.bedGraph"
+    output:
+        "Stats/"+sampleID+".csv",
+        "Stats/Number_"+sampleID+".pdf",
+        "Stats/Percent_"+sampleID+".pdf"
+    params:
+        config["stats_bed"],
+        config["prefix"],
+        config["mask_downstream"]
+    resources:
+        memory = 100,
+        time = 10,
+        temp_memory = 100
+    threads:
+        1
+    script:
+        "source/createStats.py"
+
+rule create_bw:
+    input:
+        bg = "Bedgraphs/"+sampleID+"/{X}.bedGraph"
+    output:
+        bw = "BigWig/"+sampleID+"/{X}.bw"
+    resources:
+        memory = 100,
+        time = 1
+    params:
+        bedGraphToBigWig = config["bedGraphToBigWig"],
+        chrLen = config["star_index"] + "chrNameLength.txt"
+    threads:
+        1
+    shell:
+        """
+        sort -k1,1 -k2,2n {input.bg} > {input.bg}.temp
+    	{params.bedGraphToBigWig} {input.bg}.temp {params.chrLen} {output.bw}
+        rm -f {input.bg}.temp
+        """
+
+rule splitBedgraph:
+    params:
+        prefix = config["prefix"],
+    input:
+        bg = "Bedgraphs/"+sampleID+"/{X}.bedGraph"
+    output:
+        reference_bg = "Bedgraphs/"+sampleID+"/"+reference+"/{X}.bedGraph",
+        spikeIn_bg = "Bedgraphs/"+sampleID+"/"+spikeIn+"/{X}.bedGraph"
+    resources:
+        memory = 10,
+        time = 1
+    shell:
+        """
+        grep -v {params.prefix} {input.bg} > {output.reference_bg}
+        grep {params.prefix} {input.bg} | sed "s/^{params.prefix}//g" > {output.spikeIn_bg}
+        """
+
+rule normalize_with_spikeIn:
+    input:
+        local_spike=expand("Bedgraphs/"+sampleID+"/"+spikeIn+"/"+sampleID+".{strand}.bedGraph", strand = ["pos","neg"]),
+        local_sample="Bedgraphs/"+sampleID+"/"+reference+"/{X}.bedGraph"
+    output:
+        "Results/Normalized_{X}.bedGraph",
+        "Results/factor_{X}.txt"
+    resources:
+        memory = 100,
+        time = 1
+    threads:
+        1
+    script:
+        "source/normalizeSpikeIn.py"
+
+rule maskRegions:
+    input:
+        "Bedgraphs/"+sampleID+"/"+reference+"/{X}.bedGraph"
+    output:
+        "Results/Masked_{X}.bedGraph"
+    resources:
+        memory = 100,
+        time = 1
+    threads:
+        1
+    params:
+        config["mask_bed"] if config["mask_bed"]!="" else ""
+    script:
+        "source/maskRegion.py"
diff --git a/config_example.yaml b/config_example.yaml
@@ -0,0 +1,52 @@
+
+# sample ids
+sampleID : test
+
+# InputFile:
+file: /project/ReadThrough/HiS-NET-Seq_pipeline/data/test/test.fastq.gz
+
+reference: GRCh38.p12
+spikeIn: GRCm38.p6
+prefix: Mus_musculus_
+
+# compression type, gz or bz2
+compression: gz
+
+barcode_len: 10
+
+barcode_linker: CTGTAGGCACCATCAAT
+
+# star path
+star : /project/owlmayer/Applications/STAR-2.7.3a/bin/Linux_x86_64/STAR
+
+# star index path
+star_index : /project/ReadThrough/HiS-NET-Seq_pipeline/data/GRCh38_p12_GRCm38_p6
+
+star_params: /project/ReadThrough/HiS-NET-Seq_pipeline/data/Parameters.in
+
+annotation: /project/ReadThrough/HiS-NET-Seq_pipeline/data/gencode.v28.vM18.primary_assembly.annotation.gff3
+
+genome_fa: /project/ReadThrough/HiS-NET-Seq_pipeline/data/GRCh38.p12.GRCm38.p6.primary_assembly.genome.fa
+
+#FastQC
+fastqc : /usr/local/package/bin/fastqc
+
+# samtools
+samtools : /usr/local/package/bin/samtools
+
+starcode: /project/owlmayer/Applications/starcode/starcode
+
+starcode_params: "-d 0"
+
+cutadapt: /usr/local/package/bin/cutadapt
+
+cutadapt_params: "-a ATCTCGTATGCCGTCTTCTGCTTG -a AAAAAAAAAAGGGGGGGGGGGGGG -a GGGGGGGGGGGGGGGGGGGGGGG -e 0.2 -q 5 --max-n 0.9"
+
+bedGraphToBigWig : /usr/local/package/bin/bedGraphToBigWig
+
+mask_bed: /project/ReadThrough/HiS-NET-Seq_pipeline/data/masked.bed
+
+star_params_unmapped: /project/ReadThrough/HiS-NET-Seq_pipeline/data/Parameters.unmappedin
+stats_bed: /project/ReadThrough/HiS-NET-Seq_pipeline/data/maskingClasses.bed
+
+mask_downstream: 300