Skip to content
Permalink
main
Switch branches/tags

Name already in use

A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
Go to file
Latest commit d9a3416 Feb 21, 2024 History
0 contributors

Users who have contributed to this file

294 lines (276 sloc) 8.94 KB
import os

# --- Workflow configuration -------------------------------------------------
# All values come from the Snakemake config (e.g. --configfile / --config).
sampleID = config["sampleID"]    # sample identifier embedded in every output path
WORKDIR = os.getcwd()            # absolute working directory (used for symlink targets / outdirs)
spikeIn = config["spikeIn"]      # spike-in genome name, or the string "no" when absent
reference = config["reference"]  # reference genome name
mask_bed = config["mask_bed"]    # BED file of regions to mask; "" disables masking
def resultFiles(spikeIn, reference, sampleID):
    """Build the list of final target files for ``rule all``.

    Without a spike-in ("no"): stranded BigWig tracks (main, PCRtrack_,
    RTtrack_).  With a spike-in: per-genome bedGraphs for both the
    reference and the spike-in genome, plus spike-in-normalized tracks.
    In either mode, masked bedGraphs are appended when a mask BED file
    is configured (module-level ``mask_bed``; "" disables masking).
    """
    strands = ["pos", "neg"]
    track_types = ["", "PCRtrack_", "RTtrack_"]

    # Masked targets are requested in both modes; empty list when disabled.
    masked = []
    if mask_bed != '':
        masked = expand("Results/Masked_{sampleID}.{strand}.bedGraph", sampleID=sampleID, strand=strands)

    if spikeIn == "no":
        bigwigs = expand("BigWig/{sampleID}/{type}{sampleID}.{strand}.bw", sampleID=sampleID, type=track_types, strand=strands)
        return bigwigs + masked

    bedgraphs = expand("Bedgraphs/{sampleID}/{ref}/{type}{sampleID}.{strand}.bedGraph", sampleID=sampleID, ref=[reference, spikeIn], type=track_types, strand=strands)
    normalized = expand("Results/Normalized_{sampleID}.{strand}.bedGraph", sampleID=sampleID, strand=strands)
    return bedgraphs + normalized + masked
# Default target: every final result file plus the per-sample stats table.
rule all:
    input:
        resultFiles(spikeIn, reference, sampleID),
        "Stats/"+sampleID+".csv"
# Symlink the raw input FASTQ into Data/ under a canonical per-sample name.
# NOTE(review): `ln -s` with a relative config["file"] would create a broken
# link (it is resolved relative to the link location) — confirm the config
# always supplies an absolute path.
rule create_link_fastq:
    params:
        file = config["file"],
        out = WORKDIR+"/Data/" + sampleID + ".fastq." + config["compression"]
    output:
        temp("Data/" + sampleID + ".fastq." + config["compression"])
    threads: 1
    resources:
        memory = 10,
        time = 1
    shell:
        "ln -s {params.file} {params.out}"
# Decompress (or plain-copy) the linked FASTQ into TemporaryData/.
rule decompress:
    input:
        "Data/" + sampleID + ".fastq." + config["compression"]
    output:
        temp("TemporaryData/"+sampleID+".fastq")
    params:
        compression = config["compression"]
    resources:
        memory = 50,
        time = 1
    threads: 1
    run:
        # Supported values: "bz2", "gz", "no" (already uncompressed).
        # NOTE(review): any other value runs no command, so the job fails
        # only via the missing output file — consider raising explicitly.
        if params.compression=="bz2":
            shell("bunzip2 -c -k {input} > {output}")
        elif params.compression=="gz":
            shell("gunzip -c -k {input} > {output}")
        elif params.compression=="no":
            shell("cp {input} {output}")
# Trim adapters with cutadapt; discard reads shorter than barcode length + 10,
# since anything shorter cannot contain a barcode plus usable insert.
rule remove_adapter:
    input:
        "TemporaryData/"+sampleID+".fastq"
    output:
        temp("TemporaryData/"+sampleID+"_noAdapter.fastq")
    params:
        cutadapt=config["cutadapt"],
        cutadapt_params=config["cutadapt_params"],
        min_len=config["barcode_len"] + 10
    resources:
        memory = 100,
        time = 4
    threads:
        30
    shell:
        "{params.cutadapt} {params.cutadapt_params} -m {params.min_len} -j {threads} -o {output} {input}"
# Run FastQC on the adapter-trimmed reads.
# Fix: FastQC takes its input files as positional arguments; it has no `-i`
# option, so the previous command line ("... -i {input}") failed outright.
rule fastqc:
    input:
        "TemporaryData/"+sampleID+"_noAdapter.fastq"
    output:
        "QC/"+sampleID+"_noAdapter_fastqc.html"
    threads:
        1
    params:
        fastqc=config["fastqc"],
        outdir=WORKDIR+"/QC/"
    shell:
        "{params.fastqc} --outdir={params.outdir} {input}"
# Cluster/collapse reads with starcode (error-tolerant deduplication).
# Output is kept (not temp) — it feeds the barcode dictionary step.
rule collapse_reads:
    input:
        "TemporaryData/"+sampleID+"_noAdapter.fastq"
    output:
        "TemporaryData/"+sampleID+"_collapsed.txt"
    params:
        starcode=config["starcode"],
        starcode_params=config["starcode_params"]
    resources:
        memory = 800,
        time = 8
    threads:
        10
    shell:
        "{params.starcode} {params.starcode_params} -t {threads} -o {output} -i {input}"
# Build the barcode -> reads dictionary (pickle) and emit barcode-stripped
# reads as FASTA for mapping. Logic lives in source/createBarcodeDictionary.py,
# which receives input/output/params via the Snakemake `script:` mechanism.
rule barcode_dictionary:
    input:
        "TemporaryData/"+sampleID+"_collapsed.txt"
    output:
        "TemporaryData/"+sampleID+"_barcodes.pickle",
        temp("TemporaryData/"+sampleID+"_noBC.fasta")
    resources:
        memory = 400,
        time = 4
    threads:
        1
    params:
        barcode_len=config["barcode_len"]
    script:
        "source/createBarcodeDictionary.py"
# Map the barcode-stripped reads with STAR (unsorted BAM output).
# Fix: the old `params.input` duplicated the declared input path verbatim —
# a divergence risk; the shell command now references {input.reads} directly.
rule map_reads:
    input:
        reads="TemporaryData/"+sampleID+"_noBC.fasta"
    output:
        temp("TemporaryData/Mapping/"+sampleID+"_Aligned.out.bam")
    params:
        star=config["star"],
        params=config["star_params"],
        star_index=config["star_index"],
        output_prefix="TemporaryData/Mapping/"+sampleID+"_"
    resources:
        memory = 200,
        time = 4
    threads:
        50
    shell:
        "{params.star} --outSAMtype BAM Unsorted --genomeDir {params.star_index} "
        "--readFilesIn {input.reads} --parametersFiles {params.params} "
        "--runThreadN {threads} --outFileNamePrefix {params.output_prefix}"
# Coordinate-sort the STAR output BAM with samtools.
rule sort_bam:
    input:
        "TemporaryData/Mapping/"+sampleID+"_Aligned.out.bam"
    output:
        "TemporaryData/Mapping/"+sampleID+"_Aligned.sorted.bam"
    params:
        samtools=config["samtools"],
    resources:
        memory = 200,
        time = 4
    threads:
        10
    shell:
        "{params.samtools} sort -@ {threads} -o {output} {input}"
# Combine the barcode dictionary with the sorted alignments into a
# per-position dictionary (pickle). Logic lives in source/parsePositions.py.
rule parse_positions:
    input:
        "TemporaryData/"+sampleID+"_barcodes.pickle",
        "TemporaryData/Mapping/"+sampleID+"_Aligned.sorted.bam"
    output:
        "TemporaryData/"+sampleID+"_posDict.pickle"
    params:
        barcode_len=config["barcode_len"],
    resources:
        memory = 300,
        time = 4
    threads:
        1
    script:
        "source/parsePositions.py"
# Turn the position dictionary into stranded bedGraph tracks:
# allReads / noPCR / noPCR_noSI intermediates, PCRtrack_/RTtrack_ diagnostic
# tracks, and the final main track. Logic lives in source/createBedgraphs.py.
rule create_bedgraphs:
    input:
        "TemporaryData/"+sampleID+"_posDict.pickle",
    output:
        "Bedgraphs/"+sampleID+"/"+sampleID+"_allReads.pos.bedGraph",
        "Bedgraphs/"+sampleID+"/"+sampleID+"_allReads.neg.bedGraph",
        "Bedgraphs/"+sampleID+"/"+sampleID+"_noPCR.pos.bedGraph",
        "Bedgraphs/"+sampleID+"/"+sampleID+"_noPCR.neg.bedGraph",
        "Bedgraphs/"+sampleID+"/"+sampleID+"_noPCR_noSI.pos.bedGraph",
        "Bedgraphs/"+sampleID+"/"+sampleID+"_noPCR_noSI.neg.bedGraph",
        "Bedgraphs/"+sampleID+"/"+"PCRtrack_"+sampleID+".pos.bedGraph",
        "Bedgraphs/"+sampleID+"/"+"PCRtrack_"+sampleID+".neg.bedGraph",
        "Bedgraphs/"+sampleID+"/"+"RTtrack_"+sampleID+".pos.bedGraph",
        "Bedgraphs/"+sampleID+"/"+"RTtrack_"+sampleID+".neg.bedGraph",
        "Bedgraphs/"+sampleID+"/"+sampleID+".pos.bedGraph",
        "Bedgraphs/"+sampleID+"/"+sampleID+".neg.bedGraph"
    params:
        annotation=config["annotation"],
        genome=config["genome_fa"],
        barcode_len=config["barcode_len"],
        barcode_linker=config["barcode_linker"],
        threshold=0.05  # NOTE(review): hard-coded cutoff; semantics defined in createBedgraphs.py
    resources:
        memory = 300,
        time = 4
    threads:
        1
    script:
        "source/createBedgraphs.py"
# Summarize read counts per filtering stage into a CSV plus two PDF plots.
# Logic lives in source/createStats.py; params are positional there.
rule create_stats:
    input:
        "TemporaryData/"+sampleID+".fastq",
        "Bedgraphs/"+sampleID+"/"+sampleID+"_allReads.pos.bedGraph",
        "Bedgraphs/"+sampleID+"/"+sampleID+"_allReads.neg.bedGraph",
        "Bedgraphs/"+sampleID+"/"+sampleID+"_noPCR.pos.bedGraph",
        "Bedgraphs/"+sampleID+"/"+sampleID+"_noPCR.neg.bedGraph",
        "Bedgraphs/"+sampleID+"/"+sampleID+"_noPCR_noSI.pos.bedGraph",
        "Bedgraphs/"+sampleID+"/"+sampleID+"_noPCR_noSI.neg.bedGraph"
    output:
        "Stats/"+sampleID+".csv",
        "Stats/Number_"+sampleID+".pdf",
        "Stats/Percent_"+sampleID+".pdf"
    params:
        config["stats_bed"],
        config["prefix"],
        config["mask_downstream"]
    resources:
        memory = 100,
        time = 10,
        temp_memory = 100
    threads:
        1
    script:
        "source/createStats.py"
# Convert a bedGraph to BigWig (sort first, as bedGraphToBigWig requires
# position-sorted input; the chromosome-length file comes from the STAR index).
# Fix: the chrNameLength.txt path was built by plain string concatenation and
# silently produced a wrong path unless config["star_index"] ended in "/";
# os.path.join handles both cases.
rule create_bw:
    input:
        bg = "Bedgraphs/"+sampleID+"/{X}.bedGraph"
    output:
        bw = "BigWig/"+sampleID+"/{X}.bw"
    resources:
        memory = 100,
        time = 1
    params:
        bedGraphToBigWig = config["bedGraphToBigWig"],
        chrLen = os.path.join(config["star_index"], "chrNameLength.txt")
    threads:
        1
    shell:
        """
        sort -k1,1 -k2,2n {input.bg} > {input.bg}.temp
        {params.bedGraphToBigWig} {input.bg}.temp {params.chrLen} {output.bw}
        rm -f {input.bg}.temp
        """
# Split a combined bedGraph into reference-genome and spike-in-genome files.
# Spike-in chromosomes are identified by config["prefix"], which is stripped
# from the spike-in output's chromosome names.
# NOTE(review): the grep pattern is unanchored, so the prefix matches anywhere
# in a line — confirm the prefix cannot occur inside coordinates/values.
# NOTE(review): grep exits 1 when nothing matches, which fails the rule under
# Snakemake's strict shell mode if either genome contributes zero lines.
rule splitBedgraph:
    params:
        prefix = config["prefix"],
    input:
        bg = "Bedgraphs/"+sampleID+"/{X}.bedGraph"
    output:
        reference_bg = "Bedgraphs/"+sampleID+"/"+reference+"/{X}.bedGraph",
        spikeIn_bg = "Bedgraphs/"+sampleID+"/"+spikeIn+"/{X}.bedGraph"
    resources:
        memory = 10,
        time = 1
    shell:
        """
        grep -v {params.prefix} {input.bg} > {output.reference_bg}
        grep {params.prefix} {input.bg} | sed "s/^{params.prefix}//g" > {output.spikeIn_bg}
        """
# Scale reference-genome tracks by a factor derived from spike-in coverage
# (both strands of the spike-in main track). Writes the normalized bedGraph
# and the scaling factor. Logic lives in source/normalizeSpikeIn.py.
rule normalize_with_spikeIn:
    input:
        local_spike=expand("Bedgraphs/"+sampleID+"/"+spikeIn+"/"+sampleID+".{strand}.bedGraph", strand = ["pos","neg"]),
        local_sample="Bedgraphs/"+sampleID+"/"+reference+"/{X}.bedGraph"
    output:
        "Results/Normalized_{X}.bedGraph",
        "Results/factor_{X}.txt"
    resources:
        memory = 100,
        time = 1
    threads:
        1
    script:
        "source/normalizeSpikeIn.py"
# Remove/zero regions listed in the mask BED from a reference-genome bedGraph.
# Logic lives in source/maskRegion.py. This rule is only requested when
# mask_bed != "" (see resultFiles()).
# Fix: the old param expression `config["mask_bed"] if config["mask_bed"]!=""
# else ""` was a redundant ternary — both branches yield config["mask_bed"].
rule maskRegions:
    input:
        "Bedgraphs/"+sampleID+"/"+reference+"/{X}.bedGraph"
    output:
        "Results/Masked_{X}.bedGraph"
    resources:
        memory = 100,
        time = 1
    threads:
        1
    params:
        config["mask_bed"]
    script:
        "source/maskRegion.py"