From af7a558b8d18514339ae5a53429f0ead312b2573 Mon Sep 17 00:00:00 2001
From: Jens Preussner
Date: Tue, 3 Jul 2018 13:07:03 +0200
Subject: [PATCH] Initial commit

---
 config/default.yml  | 84 +++++++++++++++++++++++++++++++++++++++++++++
 environment.yml     | 18 ++++++++++
 src/reference.snake | 36 +++++++++++++++++++
 target-dnaseq.snake | 49 ++++++++++++++++++++++++++
 4 files changed, 187 insertions(+)
 create mode 100644 config/default.yml
 create mode 100644 environment.yml
 create mode 100644 src/reference.snake
 create mode 100644 target-dnaseq.snake

diff --git a/config/default.yml b/config/default.yml
new file mode 100644
index 0000000..8b171f8
--- /dev/null
+++ b/config/default.yml
@@ -0,0 +1,84 @@
+
+#
+# The action section
+#
+# clean - whether or not to skip the prinseq cleaning step (see below).
+# snv - whether or not to perform SNV analysis.
+# cnv - whether or not to perform CNV analysis.
+# container - whether or not to create an RDS container.
+#
+action:
+  clean: True
+  snv: True
+  cnv: True
+  container: True
+
+#
+# The samplesheet section
+#
+# file - The path to the SampleSheet text file. Set to None if no samplesheet is available
+# index - The name of the column that should be used as index
+# case - The name of the column containing the case identifier to which the index belongs
+#
+samplesheet:
+  file: 'SampleSheet.txt'
+  index: 'Sample'
+  case: 'Case'
+
+#
+# The data section
+#
+# Contains named dictionaries with input sequencing files:
+# r1 - The fastq file containing R1.
+# r2 - The fastq file containing R2.
+#
+data:
+  sample1:
+    r1: ''
+    r2: ''
+
+#
+# The reference section
+#
+# annotation_URL, genome_URL - The URLs from which the reference annotation and genome can be downloaded. Wildcards are allowed in curly brackets
+# organism - Wildcard for the reference organism
+# release - Wildcard for the reference release
+# file - Wildcard(s) for the reference files that should be merged into the final reference transcriptome
+# target - The path to a bed file with coordinates of the target regions
+#
+reference:
+  annotation_URL: 'ftp.ebi.ac.uk/pub/databases/gencode/Gencode_{organism}/release_{release}/gencode.v{release}.{file}.fa.gz'
+  genome_URL: 'ftp://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_{organism}/release_{release}/{assembly}.primary_assembly.genome.fa.gz'
+  organism: 'mouse'
+  release: 'M17'
+  assembly: 'GRCm38'
+  file:
+    - basic_annotation
+  target: 'path/to/target_regions.bed'
+
+#
+# The directory section
+#
+# ref - directory name for reference files
+# fastq - directory name for fastq files
+# bam - directory name for STAR alignments
+# cnv - directory name for Copy Number Variation analysis
+# snv - directory name for Single Nucleotide Variation analysis
+# tables - directory name for aggregated tables
+# log - directory name for program logs
+# R - directory name for Rdata objects
+#
+dirs:
+  ref: 'ref'
+  fastq: 'fastq'
+  bam: 'bam'
+  cnv: 'cnv'
+  snv: 'snv'
+  tables: 'tables'
+  log: 'log'
+  R: 'rds'
+
+#
+# The debug section
+#
+debug: False
diff --git a/environment.yml b/environment.yml
new file mode 100644
index 0000000..561984c
--- /dev/null
+++ b/environment.yml
@@ -0,0 +1,18 @@
+name: target-dnaseq
+
+channels:
+  - bioconda
+  - conda-forge
+
+dependencies:
+  - python=3.5
+  - snakemake
+  - trimmomatic
+  - star
+  - samtools
+  - biobambam
+  - qualimap
+  - varscan
+  - vcfanno
+  - vcflib
+  - control-freec
diff --git a/src/reference.snake b/src/reference.snake
new file mode 100644
index 0000000..31af219
--- /dev/null
+++ b/src/reference.snake
@@ -0,0 +1,36 @@
+# vim: syntax=python tabstop=4 expandtab
+# coding: utf-8
+
+'''
+@author: jpreuss
+Provides rules that fetch the genome and annotation references via FTP and write them to GENOME / ANNOTATION
+'''
+
+from snakemake.remote.FTP import RemoteProvider as FTPRemoteProvider
+FTP = FTPRemoteProvider()
+
+rule genome_download:
+    input:
+        FTP.remote(expand(config['reference']['genome_URL'], **config['reference']), keep_local = True)
+    output:
+        GENOME
+    threads: 1
+    message:
+        'Downloading gencode genome reference.'
+    shell:
+        """
+        zcat -f {input} > {output}
+        """
+
+rule annotation_download:
+    input:
+        FTP.remote(expand(config['reference']['annotation_URL'], **config['reference']), keep_local = True)
+    output:
+        ANNOTATION
+    threads: 1
+    message:
+        'Downloading gencode annotation reference.'
+    shell:
+        """
+        zcat -f {input} > {output}
+        """
diff --git a/target-dnaseq.snake b/target-dnaseq.snake
new file mode 100644
index 0000000..c98f595
--- /dev/null
+++ b/target-dnaseq.snake
@@ -0,0 +1,49 @@
+import pandas as pd
+from os.path import join, basename, dirname
+
+if workflow.overwrite_configfile is not None:  # an explicitly supplied config file takes precedence
+    configfile: str(workflow.overwrite_configfile)
+else:  # otherwise fall back to the defaults shipped with the workflow
+    configfile: 'config/default.yml'
+
+#-------------------------------------------------------------------------------#
+#-------------- Handle reference related paths and file names ------------------#
+#-------------------------------------------------------------------------------#
+
+GENOME = join(config['dirs']['ref'], config['reference']['organism'], config['reference']['assembly'] + '.fa.gz')
+ANNOTATION = join(config['dirs']['ref'], config['reference']['organism'], config['reference']['release'] + '.gtf.gz')
+
+#-------------------------------------------------------------------------------#
+#------------- Read samplesheet and provide necessary variables ----------------#
+#-------------------------------------------------------------------------------#
+
+samplesheet = pd.read_table(config['samplesheet']['file'], sep = '\t', index_col = config['samplesheet']['index'])
+
+if 'URL_r1' not in samplesheet.columns:  # derive default R1 paths from the sample names
+    samplesheet['URL_r1'] = ['{dir}/{sample}.{format}'.format(dir = config['dirs']['fastq'], sample = sample, format = 'fastq.gz') for sample in samplesheet.index]
+
+SAMPLES = samplesheet.to_dict(orient = 'index')
+SAMPLE_NAMES = sorted(SAMPLES.keys())
+
+#-------------------------------------------------------------------------------#
+#--------------------------- Generate output files -----------------------------#
+#-------------------------------------------------------------------------------#
+
+output_files = [
+    GENOME,
+    ANNOTATION
+]
+
+#-------------------------------------------------------------------------------#
+#---------------------------------- RUN :-) ------------------------------------#
+#-------------------------------------------------------------------------------#
+
+include: "src/reference.snake"
+
+if config["debug"]:
+    print_debug()  # NOTE(review): print_debug is not defined in this patch -- confirm it is provided elsewhere
+
+rule all:
+    input:
+        output_files
+    message: "Done."